Gradience-T1-7B-checkpoint / trainer_state.json
qingy2024's picture
Upload checkpoint 600
9b0bbc3 verified
raw
history blame
106 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.24395202276885547,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004065867046147591,
"grad_norm": 0.22144322097301483,
"learning_rate": 0.0,
"loss": 1.3598,
"step": 1
},
{
"epoch": 0.0008131734092295182,
"grad_norm": 0.199473574757576,
"learning_rate": 4e-05,
"loss": 1.405,
"step": 2
},
{
"epoch": 0.0012197601138442774,
"grad_norm": 0.20758001506328583,
"learning_rate": 8e-05,
"loss": 1.2815,
"step": 3
},
{
"epoch": 0.0016263468184590363,
"grad_norm": 0.21362783014774323,
"learning_rate": 0.00012,
"loss": 1.245,
"step": 4
},
{
"epoch": 0.0020329335230737954,
"grad_norm": 0.24631692469120026,
"learning_rate": 0.00016,
"loss": 1.3086,
"step": 5
},
{
"epoch": 0.002439520227688555,
"grad_norm": 0.20009225606918335,
"learning_rate": 0.0002,
"loss": 1.2443,
"step": 6
},
{
"epoch": 0.0028461069323033137,
"grad_norm": 0.1735246330499649,
"learning_rate": 0.00019995929167514756,
"loss": 1.1878,
"step": 7
},
{
"epoch": 0.0032526936369180726,
"grad_norm": 0.18904437124729156,
"learning_rate": 0.00019991858335029514,
"loss": 1.2478,
"step": 8
},
{
"epoch": 0.003659280341532832,
"grad_norm": 0.1645248979330063,
"learning_rate": 0.0001998778750254427,
"loss": 1.2098,
"step": 9
},
{
"epoch": 0.004065867046147591,
"grad_norm": 0.22034819424152374,
"learning_rate": 0.00019983716670059028,
"loss": 1.1183,
"step": 10
},
{
"epoch": 0.00447245375076235,
"grad_norm": 0.3233634829521179,
"learning_rate": 0.00019979645837573783,
"loss": 1.0974,
"step": 11
},
{
"epoch": 0.00487904045537711,
"grad_norm": 0.2592090368270874,
"learning_rate": 0.00019975575005088542,
"loss": 1.1611,
"step": 12
},
{
"epoch": 0.005285627159991868,
"grad_norm": 0.14754348993301392,
"learning_rate": 0.000199715041726033,
"loss": 1.1932,
"step": 13
},
{
"epoch": 0.005692213864606627,
"grad_norm": 0.09341374039649963,
"learning_rate": 0.00019967433340118055,
"loss": 1.348,
"step": 14
},
{
"epoch": 0.006098800569221387,
"grad_norm": 0.10229193419218063,
"learning_rate": 0.00019963362507632813,
"loss": 1.0927,
"step": 15
},
{
"epoch": 0.006505387273836145,
"grad_norm": 0.14015386998653412,
"learning_rate": 0.00019959291675147569,
"loss": 1.2263,
"step": 16
},
{
"epoch": 0.006911973978450905,
"grad_norm": 0.17507047951221466,
"learning_rate": 0.00019955220842662327,
"loss": 1.1951,
"step": 17
},
{
"epoch": 0.007318560683065664,
"grad_norm": 0.17176274955272675,
"learning_rate": 0.00019951150010177082,
"loss": 1.1895,
"step": 18
},
{
"epoch": 0.007725147387680423,
"grad_norm": 0.13839803636074066,
"learning_rate": 0.00019947079177691838,
"loss": 0.9549,
"step": 19
},
{
"epoch": 0.008131734092295182,
"grad_norm": 0.0970696285367012,
"learning_rate": 0.00019943008345206596,
"loss": 1.0867,
"step": 20
},
{
"epoch": 0.008538320796909941,
"grad_norm": 0.08836886286735535,
"learning_rate": 0.0001993893751272135,
"loss": 1.155,
"step": 21
},
{
"epoch": 0.0089449075015247,
"grad_norm": 0.11885025352239609,
"learning_rate": 0.0001993486668023611,
"loss": 1.1231,
"step": 22
},
{
"epoch": 0.00935149420613946,
"grad_norm": 0.15120816230773926,
"learning_rate": 0.00019930795847750865,
"loss": 1.1078,
"step": 23
},
{
"epoch": 0.00975808091075422,
"grad_norm": 0.16326424479484558,
"learning_rate": 0.00019926725015265623,
"loss": 1.079,
"step": 24
},
{
"epoch": 0.010164667615368977,
"grad_norm": 0.1179085448384285,
"learning_rate": 0.0001992265418278038,
"loss": 0.932,
"step": 25
},
{
"epoch": 0.010571254319983736,
"grad_norm": 0.10621985793113708,
"learning_rate": 0.00019918583350295136,
"loss": 1.1386,
"step": 26
},
{
"epoch": 0.010977841024598495,
"grad_norm": 0.08408638089895248,
"learning_rate": 0.00019914512517809894,
"loss": 1.0987,
"step": 27
},
{
"epoch": 0.011384427729213255,
"grad_norm": 0.08222135156393051,
"learning_rate": 0.0001991044168532465,
"loss": 1.0378,
"step": 28
},
{
"epoch": 0.011791014433828014,
"grad_norm": 0.08763129264116287,
"learning_rate": 0.00019906370852839408,
"loss": 0.983,
"step": 29
},
{
"epoch": 0.012197601138442773,
"grad_norm": 0.10638878494501114,
"learning_rate": 0.00019902300020354163,
"loss": 1.0258,
"step": 30
},
{
"epoch": 0.012604187843057533,
"grad_norm": 0.10155023634433746,
"learning_rate": 0.0001989822918786892,
"loss": 0.9579,
"step": 31
},
{
"epoch": 0.01301077454767229,
"grad_norm": 0.08844579011201859,
"learning_rate": 0.00019894158355383677,
"loss": 1.1007,
"step": 32
},
{
"epoch": 0.01341736125228705,
"grad_norm": 0.10394158959388733,
"learning_rate": 0.00019890087522898432,
"loss": 1.0459,
"step": 33
},
{
"epoch": 0.01382394795690181,
"grad_norm": 0.08938682824373245,
"learning_rate": 0.0001988601669041319,
"loss": 1.0985,
"step": 34
},
{
"epoch": 0.014230534661516568,
"grad_norm": 0.08639086782932281,
"learning_rate": 0.00019881945857927948,
"loss": 1.0712,
"step": 35
},
{
"epoch": 0.014637121366131328,
"grad_norm": 0.08568435162305832,
"learning_rate": 0.00019877875025442704,
"loss": 1.0549,
"step": 36
},
{
"epoch": 0.015043708070746087,
"grad_norm": 0.0859316885471344,
"learning_rate": 0.00019873804192957462,
"loss": 1.1042,
"step": 37
},
{
"epoch": 0.015450294775360847,
"grad_norm": 0.09534381330013275,
"learning_rate": 0.00019869733360472217,
"loss": 1.0127,
"step": 38
},
{
"epoch": 0.015856881479975604,
"grad_norm": 0.09103580564260483,
"learning_rate": 0.00019865662527986976,
"loss": 0.9347,
"step": 39
},
{
"epoch": 0.016263468184590364,
"grad_norm": 0.0928095132112503,
"learning_rate": 0.0001986159169550173,
"loss": 1.0559,
"step": 40
},
{
"epoch": 0.016670054889205123,
"grad_norm": 0.09370871633291245,
"learning_rate": 0.0001985752086301649,
"loss": 1.1473,
"step": 41
},
{
"epoch": 0.017076641593819882,
"grad_norm": 0.07691123336553574,
"learning_rate": 0.00019853450030531244,
"loss": 1.0128,
"step": 42
},
{
"epoch": 0.01748322829843464,
"grad_norm": 0.09201047569513321,
"learning_rate": 0.00019849379198046,
"loss": 1.1296,
"step": 43
},
{
"epoch": 0.0178898150030494,
"grad_norm": 0.08490074425935745,
"learning_rate": 0.00019845308365560758,
"loss": 1.0444,
"step": 44
},
{
"epoch": 0.01829640170766416,
"grad_norm": 0.08623114228248596,
"learning_rate": 0.00019841237533075513,
"loss": 1.066,
"step": 45
},
{
"epoch": 0.01870298841227892,
"grad_norm": 0.09486474096775055,
"learning_rate": 0.00019837166700590271,
"loss": 1.0788,
"step": 46
},
{
"epoch": 0.01910957511689368,
"grad_norm": 0.08024484664201736,
"learning_rate": 0.0001983309586810503,
"loss": 1.0262,
"step": 47
},
{
"epoch": 0.01951616182150844,
"grad_norm": 0.09256327897310257,
"learning_rate": 0.00019829025035619785,
"loss": 1.107,
"step": 48
},
{
"epoch": 0.019922748526123194,
"grad_norm": 0.09877921640872955,
"learning_rate": 0.00019824954203134543,
"loss": 1.1731,
"step": 49
},
{
"epoch": 0.020329335230737954,
"grad_norm": 0.08699575811624527,
"learning_rate": 0.00019820883370649299,
"loss": 1.0809,
"step": 50
},
{
"epoch": 0.020735921935352713,
"grad_norm": 0.089649498462677,
"learning_rate": 0.00019816812538164057,
"loss": 1.1564,
"step": 51
},
{
"epoch": 0.021142508639967472,
"grad_norm": 0.08757214993238449,
"learning_rate": 0.00019812741705678812,
"loss": 1.0272,
"step": 52
},
{
"epoch": 0.02154909534458223,
"grad_norm": 0.08320939540863037,
"learning_rate": 0.0001980867087319357,
"loss": 0.9931,
"step": 53
},
{
"epoch": 0.02195568204919699,
"grad_norm": 0.08898070454597473,
"learning_rate": 0.00019804600040708326,
"loss": 0.9421,
"step": 54
},
{
"epoch": 0.02236226875381175,
"grad_norm": 0.08072236180305481,
"learning_rate": 0.0001980052920822308,
"loss": 1.0304,
"step": 55
},
{
"epoch": 0.02276885545842651,
"grad_norm": 0.09354112297296524,
"learning_rate": 0.0001979645837573784,
"loss": 1.1041,
"step": 56
},
{
"epoch": 0.02317544216304127,
"grad_norm": 0.09214304387569427,
"learning_rate": 0.00019792387543252595,
"loss": 1.0666,
"step": 57
},
{
"epoch": 0.02358202886765603,
"grad_norm": 0.08546210825443268,
"learning_rate": 0.00019788316710767353,
"loss": 1.0795,
"step": 58
},
{
"epoch": 0.023988615572270788,
"grad_norm": 0.09029046446084976,
"learning_rate": 0.0001978424587828211,
"loss": 1.199,
"step": 59
},
{
"epoch": 0.024395202276885547,
"grad_norm": 0.08200937509536743,
"learning_rate": 0.00019780175045796866,
"loss": 0.9853,
"step": 60
},
{
"epoch": 0.024801788981500306,
"grad_norm": 0.08928566426038742,
"learning_rate": 0.00019776104213311624,
"loss": 0.9948,
"step": 61
},
{
"epoch": 0.025208375686115066,
"grad_norm": 0.08067034929990768,
"learning_rate": 0.0001977203338082638,
"loss": 0.9824,
"step": 62
},
{
"epoch": 0.02561496239072982,
"grad_norm": 0.07509499788284302,
"learning_rate": 0.00019767962548341138,
"loss": 0.9166,
"step": 63
},
{
"epoch": 0.02602154909534458,
"grad_norm": 0.10127029567956924,
"learning_rate": 0.00019763891715855893,
"loss": 0.978,
"step": 64
},
{
"epoch": 0.02642813579995934,
"grad_norm": 0.08480218052864075,
"learning_rate": 0.0001975982088337065,
"loss": 1.0019,
"step": 65
},
{
"epoch": 0.0268347225045741,
"grad_norm": 0.0922696441411972,
"learning_rate": 0.00019755750050885407,
"loss": 1.0213,
"step": 66
},
{
"epoch": 0.02724130920918886,
"grad_norm": 0.0819278433918953,
"learning_rate": 0.00019751679218400162,
"loss": 0.9792,
"step": 67
},
{
"epoch": 0.02764789591380362,
"grad_norm": 0.09971120208501816,
"learning_rate": 0.0001974760838591492,
"loss": 0.9605,
"step": 68
},
{
"epoch": 0.028054482618418378,
"grad_norm": 0.09195531904697418,
"learning_rate": 0.00019743537553429676,
"loss": 1.1203,
"step": 69
},
{
"epoch": 0.028461069323033137,
"grad_norm": 0.09179981052875519,
"learning_rate": 0.00019739466720944434,
"loss": 1.0586,
"step": 70
},
{
"epoch": 0.028867656027647896,
"grad_norm": 0.0866156816482544,
"learning_rate": 0.00019735395888459192,
"loss": 1.0558,
"step": 71
},
{
"epoch": 0.029274242732262656,
"grad_norm": 0.09198956191539764,
"learning_rate": 0.00019731325055973947,
"loss": 1.117,
"step": 72
},
{
"epoch": 0.029680829436877415,
"grad_norm": 0.0912180244922638,
"learning_rate": 0.00019727254223488705,
"loss": 1.0235,
"step": 73
},
{
"epoch": 0.030087416141492174,
"grad_norm": 0.092186838388443,
"learning_rate": 0.0001972318339100346,
"loss": 1.0119,
"step": 74
},
{
"epoch": 0.030494002846106934,
"grad_norm": 0.091013602912426,
"learning_rate": 0.0001971911255851822,
"loss": 1.0523,
"step": 75
},
{
"epoch": 0.030900589550721693,
"grad_norm": 0.0932595282793045,
"learning_rate": 0.00019715041726032974,
"loss": 1.0471,
"step": 76
},
{
"epoch": 0.03130717625533645,
"grad_norm": 0.089345782995224,
"learning_rate": 0.0001971097089354773,
"loss": 1.0214,
"step": 77
},
{
"epoch": 0.03171376295995121,
"grad_norm": 0.09476006776094437,
"learning_rate": 0.00019706900061062488,
"loss": 0.9888,
"step": 78
},
{
"epoch": 0.03212034966456597,
"grad_norm": 0.09379832446575165,
"learning_rate": 0.00019702829228577243,
"loss": 1.1039,
"step": 79
},
{
"epoch": 0.03252693636918073,
"grad_norm": 0.10659569501876831,
"learning_rate": 0.00019698758396092001,
"loss": 1.1377,
"step": 80
},
{
"epoch": 0.03293352307379549,
"grad_norm": 0.09652398526668549,
"learning_rate": 0.0001969468756360676,
"loss": 1.0194,
"step": 81
},
{
"epoch": 0.033340109778410246,
"grad_norm": 0.08641666918992996,
"learning_rate": 0.00019690616731121515,
"loss": 1.0239,
"step": 82
},
{
"epoch": 0.03374669648302501,
"grad_norm": 0.0956072062253952,
"learning_rate": 0.00019686545898636273,
"loss": 1.032,
"step": 83
},
{
"epoch": 0.034153283187639764,
"grad_norm": 0.08402691036462784,
"learning_rate": 0.00019682475066151029,
"loss": 0.9802,
"step": 84
},
{
"epoch": 0.03455986989225452,
"grad_norm": 0.08827648311853409,
"learning_rate": 0.00019678404233665787,
"loss": 1.1805,
"step": 85
},
{
"epoch": 0.03496645659686928,
"grad_norm": 0.08757660537958145,
"learning_rate": 0.00019674333401180542,
"loss": 0.952,
"step": 86
},
{
"epoch": 0.03537304330148404,
"grad_norm": 0.09728538244962692,
"learning_rate": 0.000196702625686953,
"loss": 1.0875,
"step": 87
},
{
"epoch": 0.0357796300060988,
"grad_norm": 0.08561044931411743,
"learning_rate": 0.00019666191736210056,
"loss": 0.9818,
"step": 88
},
{
"epoch": 0.03618621671071356,
"grad_norm": 0.08389468491077423,
"learning_rate": 0.0001966212090372481,
"loss": 0.9962,
"step": 89
},
{
"epoch": 0.03659280341532832,
"grad_norm": 0.08847957849502563,
"learning_rate": 0.0001965805007123957,
"loss": 1.0138,
"step": 90
},
{
"epoch": 0.036999390119943076,
"grad_norm": 0.08515489101409912,
"learning_rate": 0.00019653979238754324,
"loss": 1.0119,
"step": 91
},
{
"epoch": 0.03740597682455784,
"grad_norm": 0.09340325742959976,
"learning_rate": 0.00019649908406269083,
"loss": 1.0635,
"step": 92
},
{
"epoch": 0.037812563529172595,
"grad_norm": 0.09383916854858398,
"learning_rate": 0.0001964583757378384,
"loss": 1.0999,
"step": 93
},
{
"epoch": 0.03821915023378736,
"grad_norm": 0.09956547617912292,
"learning_rate": 0.00019641766741298596,
"loss": 1.0186,
"step": 94
},
{
"epoch": 0.038625736938402114,
"grad_norm": 0.09809234738349915,
"learning_rate": 0.00019637695908813354,
"loss": 1.0641,
"step": 95
},
{
"epoch": 0.03903232364301688,
"grad_norm": 0.08520065993070602,
"learning_rate": 0.0001963362507632811,
"loss": 0.9255,
"step": 96
},
{
"epoch": 0.03943891034763163,
"grad_norm": 0.09007880836725235,
"learning_rate": 0.00019629554243842868,
"loss": 1.0963,
"step": 97
},
{
"epoch": 0.03984549705224639,
"grad_norm": 0.08900373429059982,
"learning_rate": 0.00019625483411357623,
"loss": 0.9908,
"step": 98
},
{
"epoch": 0.04025208375686115,
"grad_norm": 0.09613076597452164,
"learning_rate": 0.0001962141257887238,
"loss": 0.9729,
"step": 99
},
{
"epoch": 0.04065867046147591,
"grad_norm": 0.09987878054380417,
"learning_rate": 0.00019617341746387137,
"loss": 1.0554,
"step": 100
},
{
"epoch": 0.04106525716609067,
"grad_norm": 0.10209144651889801,
"learning_rate": 0.00019613270913901892,
"loss": 1.1162,
"step": 101
},
{
"epoch": 0.041471843870705426,
"grad_norm": 0.10085388273000717,
"learning_rate": 0.0001960920008141665,
"loss": 1.1355,
"step": 102
},
{
"epoch": 0.04187843057532019,
"grad_norm": 0.08966121822595596,
"learning_rate": 0.00019605129248931406,
"loss": 0.9275,
"step": 103
},
{
"epoch": 0.042285017279934944,
"grad_norm": 0.10507562756538391,
"learning_rate": 0.00019601058416446166,
"loss": 1.081,
"step": 104
},
{
"epoch": 0.04269160398454971,
"grad_norm": 0.09719648957252502,
"learning_rate": 0.00019596987583960922,
"loss": 1.0884,
"step": 105
},
{
"epoch": 0.04309819068916446,
"grad_norm": 0.09457529336214066,
"learning_rate": 0.00019592916751475677,
"loss": 1.0413,
"step": 106
},
{
"epoch": 0.043504777393779226,
"grad_norm": 0.11330179125070572,
"learning_rate": 0.00019588845918990435,
"loss": 1.0937,
"step": 107
},
{
"epoch": 0.04391136409839398,
"grad_norm": 0.09778840839862823,
"learning_rate": 0.0001958477508650519,
"loss": 1.1316,
"step": 108
},
{
"epoch": 0.044317950803008745,
"grad_norm": 0.09848835319280624,
"learning_rate": 0.0001958070425401995,
"loss": 1.1244,
"step": 109
},
{
"epoch": 0.0447245375076235,
"grad_norm": 0.0965428277850151,
"learning_rate": 0.00019576633421534704,
"loss": 0.9952,
"step": 110
},
{
"epoch": 0.045131124212238256,
"grad_norm": 0.0857444629073143,
"learning_rate": 0.00019572562589049462,
"loss": 0.9822,
"step": 111
},
{
"epoch": 0.04553771091685302,
"grad_norm": 0.10461942851543427,
"learning_rate": 0.00019568491756564218,
"loss": 1.1463,
"step": 112
},
{
"epoch": 0.045944297621467775,
"grad_norm": 0.08575154095888138,
"learning_rate": 0.00019564420924078973,
"loss": 0.8976,
"step": 113
},
{
"epoch": 0.04635088432608254,
"grad_norm": 0.0948256254196167,
"learning_rate": 0.00019560350091593731,
"loss": 1.1205,
"step": 114
},
{
"epoch": 0.046757471030697294,
"grad_norm": 0.09214090555906296,
"learning_rate": 0.00019556279259108487,
"loss": 1.1416,
"step": 115
},
{
"epoch": 0.04716405773531206,
"grad_norm": 0.09885852038860321,
"learning_rate": 0.00019552208426623248,
"loss": 1.079,
"step": 116
},
{
"epoch": 0.04757064443992681,
"grad_norm": 0.09071148931980133,
"learning_rate": 0.00019548137594138003,
"loss": 1.0128,
"step": 117
},
{
"epoch": 0.047977231144541575,
"grad_norm": 0.09190430492162704,
"learning_rate": 0.00019544066761652758,
"loss": 0.9631,
"step": 118
},
{
"epoch": 0.04838381784915633,
"grad_norm": 0.08024870604276657,
"learning_rate": 0.00019539995929167517,
"loss": 0.9086,
"step": 119
},
{
"epoch": 0.048790404553771094,
"grad_norm": 0.09223239868879318,
"learning_rate": 0.00019535925096682272,
"loss": 1.0255,
"step": 120
},
{
"epoch": 0.04919699125838585,
"grad_norm": 0.09259685128927231,
"learning_rate": 0.0001953185426419703,
"loss": 1.0221,
"step": 121
},
{
"epoch": 0.04960357796300061,
"grad_norm": 0.08371948450803757,
"learning_rate": 0.00019527783431711786,
"loss": 0.966,
"step": 122
},
{
"epoch": 0.05001016466761537,
"grad_norm": 0.0957912728190422,
"learning_rate": 0.00019523712599226544,
"loss": 1.0919,
"step": 123
},
{
"epoch": 0.05041675137223013,
"grad_norm": 0.09397678077220917,
"learning_rate": 0.000195196417667413,
"loss": 0.9666,
"step": 124
},
{
"epoch": 0.05082333807684489,
"grad_norm": 0.1014254167675972,
"learning_rate": 0.00019515570934256054,
"loss": 0.9321,
"step": 125
},
{
"epoch": 0.05122992478145964,
"grad_norm": 0.09339801222085953,
"learning_rate": 0.00019511500101770813,
"loss": 1.0487,
"step": 126
},
{
"epoch": 0.051636511486074406,
"grad_norm": 0.08642175793647766,
"learning_rate": 0.0001950742926928557,
"loss": 1.0606,
"step": 127
},
{
"epoch": 0.05204309819068916,
"grad_norm": 0.09092641621828079,
"learning_rate": 0.0001950335843680033,
"loss": 0.904,
"step": 128
},
{
"epoch": 0.052449684895303925,
"grad_norm": 0.09896791726350784,
"learning_rate": 0.00019499287604315084,
"loss": 1.0325,
"step": 129
},
{
"epoch": 0.05285627159991868,
"grad_norm": 0.08731307834386826,
"learning_rate": 0.0001949521677182984,
"loss": 0.9258,
"step": 130
},
{
"epoch": 0.05326285830453344,
"grad_norm": 0.09673330187797546,
"learning_rate": 0.00019491145939344598,
"loss": 1.1198,
"step": 131
},
{
"epoch": 0.0536694450091482,
"grad_norm": 0.09038975089788437,
"learning_rate": 0.00019487075106859353,
"loss": 1.0295,
"step": 132
},
{
"epoch": 0.05407603171376296,
"grad_norm": 0.0918399840593338,
"learning_rate": 0.0001948300427437411,
"loss": 1.0127,
"step": 133
},
{
"epoch": 0.05448261841837772,
"grad_norm": 0.08970967680215836,
"learning_rate": 0.00019478933441888867,
"loss": 1.0238,
"step": 134
},
{
"epoch": 0.05488920512299248,
"grad_norm": 0.09728217124938965,
"learning_rate": 0.00019474862609403625,
"loss": 1.069,
"step": 135
},
{
"epoch": 0.05529579182760724,
"grad_norm": 0.10240956395864487,
"learning_rate": 0.0001947079177691838,
"loss": 1.1467,
"step": 136
},
{
"epoch": 0.055702378532222,
"grad_norm": 0.10397852212190628,
"learning_rate": 0.00019466720944433136,
"loss": 1.0415,
"step": 137
},
{
"epoch": 0.056108965236836755,
"grad_norm": 0.10451675951480865,
"learning_rate": 0.00019462650111947894,
"loss": 1.0309,
"step": 138
},
{
"epoch": 0.05651555194145151,
"grad_norm": 0.09685720503330231,
"learning_rate": 0.00019458579279462652,
"loss": 1.11,
"step": 139
},
{
"epoch": 0.056922138646066274,
"grad_norm": 0.09885822236537933,
"learning_rate": 0.00019454508446977407,
"loss": 0.993,
"step": 140
},
{
"epoch": 0.05732872535068103,
"grad_norm": 0.10943586379289627,
"learning_rate": 0.00019450437614492165,
"loss": 0.9749,
"step": 141
},
{
"epoch": 0.05773531205529579,
"grad_norm": 0.10964591801166534,
"learning_rate": 0.0001944636678200692,
"loss": 1.1108,
"step": 142
},
{
"epoch": 0.05814189875991055,
"grad_norm": 0.10109028965234756,
"learning_rate": 0.0001944229594952168,
"loss": 1.0897,
"step": 143
},
{
"epoch": 0.05854848546452531,
"grad_norm": 0.11243695765733719,
"learning_rate": 0.00019438225117036434,
"loss": 1.0338,
"step": 144
},
{
"epoch": 0.05895507216914007,
"grad_norm": 0.1047658622264862,
"learning_rate": 0.00019434154284551192,
"loss": 0.9566,
"step": 145
},
{
"epoch": 0.05936165887375483,
"grad_norm": 0.09534204006195068,
"learning_rate": 0.00019430083452065948,
"loss": 1.0313,
"step": 146
},
{
"epoch": 0.059768245578369586,
"grad_norm": 0.10418044775724411,
"learning_rate": 0.00019426012619580706,
"loss": 0.9759,
"step": 147
},
{
"epoch": 0.06017483228298435,
"grad_norm": 0.10020595043897629,
"learning_rate": 0.00019421941787095461,
"loss": 0.9368,
"step": 148
},
{
"epoch": 0.060581418987599105,
"grad_norm": 0.09832129627466202,
"learning_rate": 0.00019417870954610217,
"loss": 1.0494,
"step": 149
},
{
"epoch": 0.06098800569221387,
"grad_norm": 0.09458506107330322,
"learning_rate": 0.00019413800122124978,
"loss": 0.9631,
"step": 150
},
{
"epoch": 0.06139459239682862,
"grad_norm": 0.10380101203918457,
"learning_rate": 0.00019409729289639733,
"loss": 1.1003,
"step": 151
},
{
"epoch": 0.061801179101443386,
"grad_norm": 0.107131227850914,
"learning_rate": 0.00019405658457154488,
"loss": 1.0819,
"step": 152
},
{
"epoch": 0.06220776580605814,
"grad_norm": 0.10330741852521896,
"learning_rate": 0.00019401587624669247,
"loss": 1.128,
"step": 153
},
{
"epoch": 0.0626143525106729,
"grad_norm": 0.08829359710216522,
"learning_rate": 0.00019397516792184002,
"loss": 0.8754,
"step": 154
},
{
"epoch": 0.06302093921528766,
"grad_norm": 0.10422427207231522,
"learning_rate": 0.0001939344595969876,
"loss": 0.9633,
"step": 155
},
{
"epoch": 0.06342752591990242,
"grad_norm": 0.11499015986919403,
"learning_rate": 0.00019389375127213515,
"loss": 0.9735,
"step": 156
},
{
"epoch": 0.06383411262451717,
"grad_norm": 0.0938427522778511,
"learning_rate": 0.00019385304294728274,
"loss": 0.9219,
"step": 157
},
{
"epoch": 0.06424069932913194,
"grad_norm": 0.1080261766910553,
"learning_rate": 0.0001938123346224303,
"loss": 0.9678,
"step": 158
},
{
"epoch": 0.0646472860337467,
"grad_norm": 0.10001271218061447,
"learning_rate": 0.00019377162629757784,
"loss": 1.0854,
"step": 159
},
{
"epoch": 0.06505387273836145,
"grad_norm": 0.10731212794780731,
"learning_rate": 0.00019373091797272543,
"loss": 1.0108,
"step": 160
},
{
"epoch": 0.06546045944297621,
"grad_norm": 0.10019373893737793,
"learning_rate": 0.00019369020964787298,
"loss": 1.0315,
"step": 161
},
{
"epoch": 0.06586704614759098,
"grad_norm": 0.0947297066450119,
"learning_rate": 0.0001936495013230206,
"loss": 1.0634,
"step": 162
},
{
"epoch": 0.06627363285220574,
"grad_norm": 0.12204254418611526,
"learning_rate": 0.00019360879299816814,
"loss": 1.0635,
"step": 163
},
{
"epoch": 0.06668021955682049,
"grad_norm": 0.10462553054094315,
"learning_rate": 0.0001935680846733157,
"loss": 1.0248,
"step": 164
},
{
"epoch": 0.06708680626143525,
"grad_norm": 0.09576130658388138,
"learning_rate": 0.00019352737634846328,
"loss": 0.9671,
"step": 165
},
{
"epoch": 0.06749339296605002,
"grad_norm": 0.10027123987674713,
"learning_rate": 0.00019348666802361083,
"loss": 0.9317,
"step": 166
},
{
"epoch": 0.06789997967066477,
"grad_norm": 0.10674256086349487,
"learning_rate": 0.0001934459596987584,
"loss": 1.0058,
"step": 167
},
{
"epoch": 0.06830656637527953,
"grad_norm": 0.12352320551872253,
"learning_rate": 0.00019340525137390597,
"loss": 1.0926,
"step": 168
},
{
"epoch": 0.06871315307989428,
"grad_norm": 0.09426864236593246,
"learning_rate": 0.00019336454304905355,
"loss": 1.0876,
"step": 169
},
{
"epoch": 0.06911973978450904,
"grad_norm": 0.09280996024608612,
"learning_rate": 0.0001933238347242011,
"loss": 0.977,
"step": 170
},
{
"epoch": 0.06952632648912381,
"grad_norm": 0.11547420918941498,
"learning_rate": 0.00019328312639934866,
"loss": 1.0598,
"step": 171
},
{
"epoch": 0.06993291319373857,
"grad_norm": 0.12538915872573853,
"learning_rate": 0.00019324241807449624,
"loss": 1.0996,
"step": 172
},
{
"epoch": 0.07033949989835332,
"grad_norm": 0.08110898733139038,
"learning_rate": 0.00019320170974964382,
"loss": 0.8776,
"step": 173
},
{
"epoch": 0.07074608660296808,
"grad_norm": 0.10475198924541473,
"learning_rate": 0.0001931610014247914,
"loss": 1.0876,
"step": 174
},
{
"epoch": 0.07115267330758285,
"grad_norm": 0.1095360517501831,
"learning_rate": 0.00019312029309993895,
"loss": 1.054,
"step": 175
},
{
"epoch": 0.0715592600121976,
"grad_norm": 0.09516473114490509,
"learning_rate": 0.0001930795847750865,
"loss": 1.0558,
"step": 176
},
{
"epoch": 0.07196584671681236,
"grad_norm": 0.09316466003656387,
"learning_rate": 0.0001930388764502341,
"loss": 0.9467,
"step": 177
},
{
"epoch": 0.07237243342142712,
"grad_norm": 0.11777061969041824,
"learning_rate": 0.00019299816812538164,
"loss": 1.1441,
"step": 178
},
{
"epoch": 0.07277902012604189,
"grad_norm": 0.09438811987638474,
"learning_rate": 0.00019295745980052922,
"loss": 0.9521,
"step": 179
},
{
"epoch": 0.07318560683065664,
"grad_norm": 0.08892639726400375,
"learning_rate": 0.00019291675147567678,
"loss": 0.9804,
"step": 180
},
{
"epoch": 0.0735921935352714,
"grad_norm": 0.08963356912136078,
"learning_rate": 0.00019287604315082436,
"loss": 1.0427,
"step": 181
},
{
"epoch": 0.07399878023988615,
"grad_norm": 0.09870661795139313,
"learning_rate": 0.0001928353348259719,
"loss": 1.051,
"step": 182
},
{
"epoch": 0.07440536694450091,
"grad_norm": 0.11843609809875488,
"learning_rate": 0.00019279462650111947,
"loss": 1.0109,
"step": 183
},
{
"epoch": 0.07481195364911568,
"grad_norm": 0.08860404789447784,
"learning_rate": 0.00019275391817626705,
"loss": 1.0035,
"step": 184
},
{
"epoch": 0.07521854035373043,
"grad_norm": 0.09085170924663544,
"learning_rate": 0.00019271320985141463,
"loss": 0.9461,
"step": 185
},
{
"epoch": 0.07562512705834519,
"grad_norm": 0.09071815758943558,
"learning_rate": 0.0001926725015265622,
"loss": 0.9542,
"step": 186
},
{
"epoch": 0.07603171376295995,
"grad_norm": 0.09566846489906311,
"learning_rate": 0.00019263179320170976,
"loss": 0.9958,
"step": 187
},
{
"epoch": 0.07643830046757472,
"grad_norm": 0.11846338212490082,
"learning_rate": 0.00019259108487685732,
"loss": 1.0737,
"step": 188
},
{
"epoch": 0.07684488717218947,
"grad_norm": 0.09295649081468582,
"learning_rate": 0.0001925503765520049,
"loss": 1.0162,
"step": 189
},
{
"epoch": 0.07725147387680423,
"grad_norm": 0.0917876660823822,
"learning_rate": 0.00019250966822715245,
"loss": 1.0432,
"step": 190
},
{
"epoch": 0.07765806058141898,
"grad_norm": 0.10864109545946121,
"learning_rate": 0.00019246895990230004,
"loss": 1.1107,
"step": 191
},
{
"epoch": 0.07806464728603375,
"grad_norm": 0.09689877927303314,
"learning_rate": 0.0001924282515774476,
"loss": 1.0421,
"step": 192
},
{
"epoch": 0.07847123399064851,
"grad_norm": 0.09406042098999023,
"learning_rate": 0.00019238754325259517,
"loss": 1.1042,
"step": 193
},
{
"epoch": 0.07887782069526326,
"grad_norm": 0.08346063643693924,
"learning_rate": 0.00019234683492774272,
"loss": 0.9554,
"step": 194
},
{
"epoch": 0.07928440739987802,
"grad_norm": 0.10317754745483398,
"learning_rate": 0.00019230612660289028,
"loss": 1.0835,
"step": 195
},
{
"epoch": 0.07969099410449278,
"grad_norm": 0.08712919056415558,
"learning_rate": 0.0001922654182780379,
"loss": 0.9799,
"step": 196
},
{
"epoch": 0.08009758080910755,
"grad_norm": 0.0860556811094284,
"learning_rate": 0.00019222470995318544,
"loss": 0.8661,
"step": 197
},
{
"epoch": 0.0805041675137223,
"grad_norm": 0.07940655201673508,
"learning_rate": 0.00019218400162833302,
"loss": 0.8305,
"step": 198
},
{
"epoch": 0.08091075421833706,
"grad_norm": 0.09200199693441391,
"learning_rate": 0.00019214329330348058,
"loss": 0.9774,
"step": 199
},
{
"epoch": 0.08131734092295181,
"grad_norm": 0.09980164468288422,
"learning_rate": 0.00019210258497862813,
"loss": 0.9791,
"step": 200
},
{
"epoch": 0.08172392762756658,
"grad_norm": 0.09660688042640686,
"learning_rate": 0.0001920618766537757,
"loss": 1.027,
"step": 201
},
{
"epoch": 0.08213051433218134,
"grad_norm": 0.09518909454345703,
"learning_rate": 0.00019202116832892327,
"loss": 0.9939,
"step": 202
},
{
"epoch": 0.0825371010367961,
"grad_norm": 0.0886114165186882,
"learning_rate": 0.00019198046000407085,
"loss": 0.985,
"step": 203
},
{
"epoch": 0.08294368774141085,
"grad_norm": 0.09820783883333206,
"learning_rate": 0.0001919397516792184,
"loss": 1.0064,
"step": 204
},
{
"epoch": 0.08335027444602562,
"grad_norm": 0.0957496389746666,
"learning_rate": 0.00019189904335436598,
"loss": 1.1126,
"step": 205
},
{
"epoch": 0.08375686115064038,
"grad_norm": 0.09990067780017853,
"learning_rate": 0.00019185833502951354,
"loss": 1.1517,
"step": 206
},
{
"epoch": 0.08416344785525513,
"grad_norm": 0.0953991562128067,
"learning_rate": 0.0001918176267046611,
"loss": 1.087,
"step": 207
},
{
"epoch": 0.08457003455986989,
"grad_norm": 0.10291532427072525,
"learning_rate": 0.0001917769183798087,
"loss": 1.0366,
"step": 208
},
{
"epoch": 0.08497662126448464,
"grad_norm": 0.09986121207475662,
"learning_rate": 0.00019173621005495625,
"loss": 0.9581,
"step": 209
},
{
"epoch": 0.08538320796909941,
"grad_norm": 0.09369988739490509,
"learning_rate": 0.00019169550173010383,
"loss": 1.0048,
"step": 210
},
{
"epoch": 0.08578979467371417,
"grad_norm": 0.0968063622713089,
"learning_rate": 0.0001916547934052514,
"loss": 1.0005,
"step": 211
},
{
"epoch": 0.08619638137832893,
"grad_norm": 0.11241315305233002,
"learning_rate": 0.00019161408508039894,
"loss": 1.0316,
"step": 212
},
{
"epoch": 0.08660296808294368,
"grad_norm": 0.09230878949165344,
"learning_rate": 0.00019157337675554652,
"loss": 0.917,
"step": 213
},
{
"epoch": 0.08700955478755845,
"grad_norm": 0.08461520820856094,
"learning_rate": 0.00019153266843069408,
"loss": 0.9144,
"step": 214
},
{
"epoch": 0.08741614149217321,
"grad_norm": 0.09011861681938171,
"learning_rate": 0.00019149196010584166,
"loss": 1.0092,
"step": 215
},
{
"epoch": 0.08782272819678796,
"grad_norm": 0.09200841188430786,
"learning_rate": 0.0001914512517809892,
"loss": 1.0552,
"step": 216
},
{
"epoch": 0.08822931490140272,
"grad_norm": 0.09052886068820953,
"learning_rate": 0.0001914105434561368,
"loss": 0.9067,
"step": 217
},
{
"epoch": 0.08863590160601749,
"grad_norm": 0.08740741014480591,
"learning_rate": 0.00019136983513128435,
"loss": 0.9182,
"step": 218
},
{
"epoch": 0.08904248831063225,
"grad_norm": 0.08494284749031067,
"learning_rate": 0.00019132912680643193,
"loss": 0.8321,
"step": 219
},
{
"epoch": 0.089449075015247,
"grad_norm": 0.0890796035528183,
"learning_rate": 0.0001912884184815795,
"loss": 0.9801,
"step": 220
},
{
"epoch": 0.08985566171986176,
"grad_norm": 0.094822458922863,
"learning_rate": 0.00019124771015672706,
"loss": 0.9779,
"step": 221
},
{
"epoch": 0.09026224842447651,
"grad_norm": 0.09756983071565628,
"learning_rate": 0.00019120700183187465,
"loss": 1.0385,
"step": 222
},
{
"epoch": 0.09066883512909128,
"grad_norm": 0.09434107691049576,
"learning_rate": 0.0001911662935070222,
"loss": 1.063,
"step": 223
},
{
"epoch": 0.09107542183370604,
"grad_norm": 0.0925639271736145,
"learning_rate": 0.00019112558518216975,
"loss": 0.9061,
"step": 224
},
{
"epoch": 0.0914820085383208,
"grad_norm": 0.10531201958656311,
"learning_rate": 0.00019108487685731734,
"loss": 1.1593,
"step": 225
},
{
"epoch": 0.09188859524293555,
"grad_norm": 0.08259832113981247,
"learning_rate": 0.0001910441685324649,
"loss": 0.8463,
"step": 226
},
{
"epoch": 0.09229518194755032,
"grad_norm": 431.5063171386719,
"learning_rate": 0.00019100346020761247,
"loss": 1.0632,
"step": 227
},
{
"epoch": 0.09270176865216508,
"grad_norm": 0.10764740407466888,
"learning_rate": 0.00019096275188276002,
"loss": 1.0083,
"step": 228
},
{
"epoch": 0.09310835535677983,
"grad_norm": 0.08872029185295105,
"learning_rate": 0.0001909220435579076,
"loss": 0.9301,
"step": 229
},
{
"epoch": 0.09351494206139459,
"grad_norm": 0.1006346270442009,
"learning_rate": 0.00019088133523305516,
"loss": 1.0103,
"step": 230
},
{
"epoch": 0.09392152876600936,
"grad_norm": 0.0970514565706253,
"learning_rate": 0.00019084062690820274,
"loss": 1.0522,
"step": 231
},
{
"epoch": 0.09432811547062411,
"grad_norm": 0.09807727485895157,
"learning_rate": 0.00019079991858335032,
"loss": 1.0498,
"step": 232
},
{
"epoch": 0.09473470217523887,
"grad_norm": 0.09828022867441177,
"learning_rate": 0.00019075921025849788,
"loss": 0.9871,
"step": 233
},
{
"epoch": 0.09514128887985362,
"grad_norm": 0.10089042782783508,
"learning_rate": 0.00019071850193364543,
"loss": 0.977,
"step": 234
},
{
"epoch": 0.0955478755844684,
"grad_norm": 0.09905245155096054,
"learning_rate": 0.000190677793608793,
"loss": 1.0135,
"step": 235
},
{
"epoch": 0.09595446228908315,
"grad_norm": 0.1002473533153534,
"learning_rate": 0.00019063708528394057,
"loss": 1.0219,
"step": 236
},
{
"epoch": 0.0963610489936979,
"grad_norm": 0.09028339385986328,
"learning_rate": 0.00019059637695908815,
"loss": 0.909,
"step": 237
},
{
"epoch": 0.09676763569831266,
"grad_norm": 0.0950377881526947,
"learning_rate": 0.0001905556686342357,
"loss": 0.9749,
"step": 238
},
{
"epoch": 0.09717422240292742,
"grad_norm": 0.09866049885749817,
"learning_rate": 0.00019051496030938328,
"loss": 1.0927,
"step": 239
},
{
"epoch": 0.09758080910754219,
"grad_norm": 0.09754758328199387,
"learning_rate": 0.00019047425198453084,
"loss": 1.059,
"step": 240
},
{
"epoch": 0.09798739581215694,
"grad_norm": 0.09261766821146011,
"learning_rate": 0.00019043354365967842,
"loss": 1.0912,
"step": 241
},
{
"epoch": 0.0983939825167717,
"grad_norm": 0.08637125045061111,
"learning_rate": 0.000190392835334826,
"loss": 0.8925,
"step": 242
},
{
"epoch": 0.09880056922138646,
"grad_norm": 0.0962812602519989,
"learning_rate": 0.00019035212700997355,
"loss": 1.0435,
"step": 243
},
{
"epoch": 0.09920715592600123,
"grad_norm": 0.09047430753707886,
"learning_rate": 0.00019031141868512113,
"loss": 1.0787,
"step": 244
},
{
"epoch": 0.09961374263061598,
"grad_norm": 0.09183438867330551,
"learning_rate": 0.0001902707103602687,
"loss": 0.9338,
"step": 245
},
{
"epoch": 0.10002032933523074,
"grad_norm": 0.09977632761001587,
"learning_rate": 0.00019023000203541624,
"loss": 1.1605,
"step": 246
},
{
"epoch": 0.10042691603984549,
"grad_norm": 0.10386580228805542,
"learning_rate": 0.00019018929371056382,
"loss": 1.0493,
"step": 247
},
{
"epoch": 0.10083350274446026,
"grad_norm": 0.09106533974409103,
"learning_rate": 0.00019014858538571138,
"loss": 0.9891,
"step": 248
},
{
"epoch": 0.10124008944907502,
"grad_norm": 0.09407884627580643,
"learning_rate": 0.00019010787706085896,
"loss": 1.0367,
"step": 249
},
{
"epoch": 0.10164667615368977,
"grad_norm": 0.10133463889360428,
"learning_rate": 0.0001900671687360065,
"loss": 1.0743,
"step": 250
},
{
"epoch": 0.10205326285830453,
"grad_norm": 0.11877205967903137,
"learning_rate": 0.0001900264604111541,
"loss": 1.1572,
"step": 251
},
{
"epoch": 0.10245984956291929,
"grad_norm": 0.10216309130191803,
"learning_rate": 0.00018998575208630165,
"loss": 1.0687,
"step": 252
},
{
"epoch": 0.10286643626753406,
"grad_norm": 0.09023922681808472,
"learning_rate": 0.0001899450437614492,
"loss": 0.9153,
"step": 253
},
{
"epoch": 0.10327302297214881,
"grad_norm": 0.09972742944955826,
"learning_rate": 0.0001899043354365968,
"loss": 0.9059,
"step": 254
},
{
"epoch": 0.10367960967676357,
"grad_norm": 0.1175752505660057,
"learning_rate": 0.00018986362711174436,
"loss": 1.0659,
"step": 255
},
{
"epoch": 0.10408619638137832,
"grad_norm": 0.09030337631702423,
"learning_rate": 0.00018982291878689195,
"loss": 0.9577,
"step": 256
},
{
"epoch": 0.1044927830859931,
"grad_norm": 0.08850797265768051,
"learning_rate": 0.0001897822104620395,
"loss": 0.9193,
"step": 257
},
{
"epoch": 0.10489936979060785,
"grad_norm": 1767.7669677734375,
"learning_rate": 0.00018974150213718705,
"loss": 0.9977,
"step": 258
},
{
"epoch": 0.1053059564952226,
"grad_norm": 0.11435185372829437,
"learning_rate": 0.00018970079381233463,
"loss": 1.0468,
"step": 259
},
{
"epoch": 0.10571254319983736,
"grad_norm": 0.10342080891132355,
"learning_rate": 0.0001896600854874822,
"loss": 1.0119,
"step": 260
},
{
"epoch": 0.10611912990445213,
"grad_norm": 0.11568263173103333,
"learning_rate": 0.00018961937716262977,
"loss": 1.025,
"step": 261
},
{
"epoch": 0.10652571660906689,
"grad_norm": 0.12752321362495422,
"learning_rate": 0.00018957866883777732,
"loss": 1.1283,
"step": 262
},
{
"epoch": 0.10693230331368164,
"grad_norm": 0.10688795894384384,
"learning_rate": 0.0001895379605129249,
"loss": 0.9052,
"step": 263
},
{
"epoch": 0.1073388900182964,
"grad_norm": 0.10426552593708038,
"learning_rate": 0.00018949725218807246,
"loss": 0.9556,
"step": 264
},
{
"epoch": 0.10774547672291115,
"grad_norm": 0.09953362494707108,
"learning_rate": 0.00018945654386322004,
"loss": 1.0734,
"step": 265
},
{
"epoch": 0.10815206342752592,
"grad_norm": 0.09143470227718353,
"learning_rate": 0.00018941583553836762,
"loss": 1.0063,
"step": 266
},
{
"epoch": 0.10855865013214068,
"grad_norm": 0.10831563919782639,
"learning_rate": 0.00018937512721351518,
"loss": 1.011,
"step": 267
},
{
"epoch": 0.10896523683675544,
"grad_norm": 0.10352573543787003,
"learning_rate": 0.00018933441888866276,
"loss": 1.0625,
"step": 268
},
{
"epoch": 0.10937182354137019,
"grad_norm": 0.09499429166316986,
"learning_rate": 0.0001892937105638103,
"loss": 0.8775,
"step": 269
},
{
"epoch": 0.10977841024598496,
"grad_norm": 0.10296636819839478,
"learning_rate": 0.00018925300223895787,
"loss": 0.985,
"step": 270
},
{
"epoch": 0.11018499695059972,
"grad_norm": 0.10464894771575928,
"learning_rate": 0.00018921229391410545,
"loss": 1.0051,
"step": 271
},
{
"epoch": 0.11059158365521447,
"grad_norm": 0.09429532289505005,
"learning_rate": 0.000189171585589253,
"loss": 0.9793,
"step": 272
},
{
"epoch": 0.11099817035982923,
"grad_norm": 0.09751992672681808,
"learning_rate": 0.00018913087726440058,
"loss": 1.0756,
"step": 273
},
{
"epoch": 0.111404757064444,
"grad_norm": 0.11418993026018143,
"learning_rate": 0.00018909016893954814,
"loss": 1.0742,
"step": 274
},
{
"epoch": 0.11181134376905875,
"grad_norm": 0.10320629924535751,
"learning_rate": 0.00018904946061469572,
"loss": 1.036,
"step": 275
},
{
"epoch": 0.11221793047367351,
"grad_norm": 0.09697311371564865,
"learning_rate": 0.00018900875228984327,
"loss": 1.0317,
"step": 276
},
{
"epoch": 0.11262451717828827,
"grad_norm": 0.09579788893461227,
"learning_rate": 0.00018896804396499085,
"loss": 0.9621,
"step": 277
},
{
"epoch": 0.11303110388290302,
"grad_norm": 0.09918879717588425,
"learning_rate": 0.00018892733564013843,
"loss": 1.0292,
"step": 278
},
{
"epoch": 0.11343769058751779,
"grad_norm": 0.0923212468624115,
"learning_rate": 0.000188886627315286,
"loss": 1.0611,
"step": 279
},
{
"epoch": 0.11384427729213255,
"grad_norm": 0.09480055421590805,
"learning_rate": 0.00018884591899043357,
"loss": 0.9809,
"step": 280
},
{
"epoch": 0.1142508639967473,
"grad_norm": 0.09431526064872742,
"learning_rate": 0.00018880521066558112,
"loss": 1.0326,
"step": 281
},
{
"epoch": 0.11465745070136206,
"grad_norm": 0.09080514311790466,
"learning_rate": 0.00018876450234072868,
"loss": 0.9115,
"step": 282
},
{
"epoch": 0.11506403740597683,
"grad_norm": 0.10855970531702042,
"learning_rate": 0.00018872379401587626,
"loss": 1.0422,
"step": 283
},
{
"epoch": 0.11547062411059159,
"grad_norm": 0.0941060334444046,
"learning_rate": 0.0001886830856910238,
"loss": 1.0352,
"step": 284
},
{
"epoch": 0.11587721081520634,
"grad_norm": 0.08903583139181137,
"learning_rate": 0.0001886423773661714,
"loss": 0.964,
"step": 285
},
{
"epoch": 0.1162837975198211,
"grad_norm": 0.08521820604801178,
"learning_rate": 0.00018860166904131895,
"loss": 0.917,
"step": 286
},
{
"epoch": 0.11669038422443587,
"grad_norm": 0.1058691143989563,
"learning_rate": 0.00018856096071646653,
"loss": 1.0375,
"step": 287
},
{
"epoch": 0.11709697092905062,
"grad_norm": 0.09435714781284332,
"learning_rate": 0.0001885202523916141,
"loss": 0.9766,
"step": 288
},
{
"epoch": 0.11750355763366538,
"grad_norm": 0.09868729114532471,
"learning_rate": 0.00018847954406676166,
"loss": 1.1059,
"step": 289
},
{
"epoch": 0.11791014433828013,
"grad_norm": 0.08855635672807693,
"learning_rate": 0.00018843883574190924,
"loss": 0.9424,
"step": 290
},
{
"epoch": 0.11831673104289489,
"grad_norm": 0.09142837673425674,
"learning_rate": 0.0001883981274170568,
"loss": 1.0425,
"step": 291
},
{
"epoch": 0.11872331774750966,
"grad_norm": 0.0971277505159378,
"learning_rate": 0.00018835741909220438,
"loss": 1.108,
"step": 292
},
{
"epoch": 0.11912990445212442,
"grad_norm": 0.09940122812986374,
"learning_rate": 0.00018831671076735193,
"loss": 1.0172,
"step": 293
},
{
"epoch": 0.11953649115673917,
"grad_norm": 0.10263317078351974,
"learning_rate": 0.0001882760024424995,
"loss": 1.0956,
"step": 294
},
{
"epoch": 0.11994307786135393,
"grad_norm": 0.1092846542596817,
"learning_rate": 0.00018823529411764707,
"loss": 0.9454,
"step": 295
},
{
"epoch": 0.1203496645659687,
"grad_norm": 0.10364726930856705,
"learning_rate": 0.00018819458579279462,
"loss": 0.8884,
"step": 296
},
{
"epoch": 0.12075625127058345,
"grad_norm": 0.0889100730419159,
"learning_rate": 0.0001881538774679422,
"loss": 0.9922,
"step": 297
},
{
"epoch": 0.12116283797519821,
"grad_norm": 0.09209653735160828,
"learning_rate": 0.00018811316914308976,
"loss": 0.977,
"step": 298
},
{
"epoch": 0.12156942467981297,
"grad_norm": 0.11542046815156937,
"learning_rate": 0.00018807246081823734,
"loss": 1.0694,
"step": 299
},
{
"epoch": 0.12197601138442773,
"grad_norm": 0.10896503180265427,
"learning_rate": 0.00018803175249338492,
"loss": 1.0508,
"step": 300
},
{
"epoch": 0.12238259808904249,
"grad_norm": 0.09302002936601639,
"learning_rate": 0.00018799104416853248,
"loss": 1.0512,
"step": 301
},
{
"epoch": 0.12278918479365725,
"grad_norm": 0.09081271290779114,
"learning_rate": 0.00018795033584368006,
"loss": 0.9688,
"step": 302
},
{
"epoch": 0.123195771498272,
"grad_norm": 0.1059931218624115,
"learning_rate": 0.0001879096275188276,
"loss": 1.0483,
"step": 303
},
{
"epoch": 0.12360235820288677,
"grad_norm": 0.1018669605255127,
"learning_rate": 0.0001878689191939752,
"loss": 1.019,
"step": 304
},
{
"epoch": 0.12400894490750153,
"grad_norm": 0.1040007546544075,
"learning_rate": 0.00018782821086912275,
"loss": 1.037,
"step": 305
},
{
"epoch": 0.12441553161211628,
"grad_norm": 0.10204601287841797,
"learning_rate": 0.0001877875025442703,
"loss": 0.9816,
"step": 306
},
{
"epoch": 0.12482211831673104,
"grad_norm": 0.10591764748096466,
"learning_rate": 0.00018774679421941788,
"loss": 1.0939,
"step": 307
},
{
"epoch": 0.1252287050213458,
"grad_norm": 0.09306305646896362,
"learning_rate": 0.00018770608589456544,
"loss": 1.0476,
"step": 308
},
{
"epoch": 0.12563529172596055,
"grad_norm": 11.22681713104248,
"learning_rate": 0.00018766537756971302,
"loss": 1.0573,
"step": 309
},
{
"epoch": 0.12604187843057532,
"grad_norm": 0.09422402083873749,
"learning_rate": 0.00018762466924486057,
"loss": 0.9993,
"step": 310
},
{
"epoch": 0.1264484651351901,
"grad_norm": 0.0982229933142662,
"learning_rate": 0.00018758396092000815,
"loss": 0.9159,
"step": 311
},
{
"epoch": 0.12685505183980483,
"grad_norm": 0.12579265236854553,
"learning_rate": 0.00018754325259515573,
"loss": 1.0935,
"step": 312
},
{
"epoch": 0.1272616385444196,
"grad_norm": 0.10069390386343002,
"learning_rate": 0.0001875025442703033,
"loss": 1.0127,
"step": 313
},
{
"epoch": 0.12766822524903434,
"grad_norm": 0.10948827862739563,
"learning_rate": 0.00018746183594545087,
"loss": 1.0576,
"step": 314
},
{
"epoch": 0.12807481195364911,
"grad_norm": 0.09232445061206818,
"learning_rate": 0.00018742112762059842,
"loss": 0.9856,
"step": 315
},
{
"epoch": 0.12848139865826388,
"grad_norm": 0.08319563418626785,
"learning_rate": 0.000187380419295746,
"loss": 0.9172,
"step": 316
},
{
"epoch": 0.12888798536287863,
"grad_norm": 0.09697309136390686,
"learning_rate": 0.00018733971097089356,
"loss": 1.0567,
"step": 317
},
{
"epoch": 0.1292945720674934,
"grad_norm": 0.09254255145788193,
"learning_rate": 0.0001872990026460411,
"loss": 1.0177,
"step": 318
},
{
"epoch": 0.12970115877210814,
"grad_norm": 0.09254108369350433,
"learning_rate": 0.0001872582943211887,
"loss": 1.0079,
"step": 319
},
{
"epoch": 0.1301077454767229,
"grad_norm": 0.09095866233110428,
"learning_rate": 0.00018721758599633625,
"loss": 1.0633,
"step": 320
},
{
"epoch": 0.13051433218133768,
"grad_norm": 0.09073010087013245,
"learning_rate": 0.00018717687767148383,
"loss": 0.9059,
"step": 321
},
{
"epoch": 0.13092091888595242,
"grad_norm": 0.09842764586210251,
"learning_rate": 0.00018713616934663138,
"loss": 1.0766,
"step": 322
},
{
"epoch": 0.1313275055905672,
"grad_norm": 0.09325529634952545,
"learning_rate": 0.00018709546102177896,
"loss": 1.066,
"step": 323
},
{
"epoch": 0.13173409229518196,
"grad_norm": 0.09692969918251038,
"learning_rate": 0.00018705475269692654,
"loss": 0.9743,
"step": 324
},
{
"epoch": 0.1321406789997967,
"grad_norm": 0.09432708472013474,
"learning_rate": 0.0001870140443720741,
"loss": 1.0141,
"step": 325
},
{
"epoch": 0.13254726570441147,
"grad_norm": 0.09226994961500168,
"learning_rate": 0.00018697333604722168,
"loss": 0.9837,
"step": 326
},
{
"epoch": 0.1329538524090262,
"grad_norm": 0.10843974351882935,
"learning_rate": 0.00018693262772236923,
"loss": 1.0248,
"step": 327
},
{
"epoch": 0.13336043911364098,
"grad_norm": 0.09324774891138077,
"learning_rate": 0.00018689191939751681,
"loss": 1.0642,
"step": 328
},
{
"epoch": 0.13376702581825575,
"grad_norm": 0.08934729546308517,
"learning_rate": 0.00018685121107266437,
"loss": 0.9792,
"step": 329
},
{
"epoch": 0.1341736125228705,
"grad_norm": 0.09125274419784546,
"learning_rate": 0.00018681050274781192,
"loss": 1.0093,
"step": 330
},
{
"epoch": 0.13458019922748526,
"grad_norm": 0.09645108133554459,
"learning_rate": 0.0001867697944229595,
"loss": 0.9503,
"step": 331
},
{
"epoch": 0.13498678593210003,
"grad_norm": 0.09900861978530884,
"learning_rate": 0.00018672908609810706,
"loss": 0.9966,
"step": 332
},
{
"epoch": 0.13539337263671478,
"grad_norm": 0.09018311649560928,
"learning_rate": 0.00018668837777325464,
"loss": 0.965,
"step": 333
},
{
"epoch": 0.13579995934132955,
"grad_norm": 0.10296136885881424,
"learning_rate": 0.00018664766944840222,
"loss": 1.1011,
"step": 334
},
{
"epoch": 0.1362065460459443,
"grad_norm": 0.09104129672050476,
"learning_rate": 0.00018660696112354977,
"loss": 0.9814,
"step": 335
},
{
"epoch": 0.13661313275055906,
"grad_norm": 0.09881450235843658,
"learning_rate": 0.00018656625279869736,
"loss": 1.0989,
"step": 336
},
{
"epoch": 0.13701971945517383,
"grad_norm": 0.09691241383552551,
"learning_rate": 0.0001865255444738449,
"loss": 1.0967,
"step": 337
},
{
"epoch": 0.13742630615978857,
"grad_norm": 0.10152243077754974,
"learning_rate": 0.0001864848361489925,
"loss": 1.0951,
"step": 338
},
{
"epoch": 0.13783289286440334,
"grad_norm": 0.10802541673183441,
"learning_rate": 0.00018644412782414005,
"loss": 0.8742,
"step": 339
},
{
"epoch": 0.13823947956901808,
"grad_norm": 0.09942565858364105,
"learning_rate": 0.0001864034194992876,
"loss": 0.9961,
"step": 340
},
{
"epoch": 0.13864606627363285,
"grad_norm": 0.08618199825286865,
"learning_rate": 0.00018636271117443518,
"loss": 0.9645,
"step": 341
},
{
"epoch": 0.13905265297824762,
"grad_norm": 0.1056099608540535,
"learning_rate": 0.00018632200284958273,
"loss": 0.9885,
"step": 342
},
{
"epoch": 0.13945923968286236,
"grad_norm": 0.08862382173538208,
"learning_rate": 0.00018628129452473032,
"loss": 0.9316,
"step": 343
},
{
"epoch": 0.13986582638747713,
"grad_norm": 0.09923135489225388,
"learning_rate": 0.00018624058619987787,
"loss": 0.9959,
"step": 344
},
{
"epoch": 0.1402724130920919,
"grad_norm": 0.09120538830757141,
"learning_rate": 0.00018619987787502545,
"loss": 0.968,
"step": 345
},
{
"epoch": 0.14067899979670664,
"grad_norm": 0.09669141471385956,
"learning_rate": 0.00018615916955017303,
"loss": 1.085,
"step": 346
},
{
"epoch": 0.1410855865013214,
"grad_norm": 0.08598754554986954,
"learning_rate": 0.00018611846122532059,
"loss": 0.9504,
"step": 347
},
{
"epoch": 0.14149217320593616,
"grad_norm": 0.09238371253013611,
"learning_rate": 0.00018607775290046817,
"loss": 0.9742,
"step": 348
},
{
"epoch": 0.14189875991055093,
"grad_norm": 0.091258205473423,
"learning_rate": 0.00018603704457561572,
"loss": 0.9341,
"step": 349
},
{
"epoch": 0.1423053466151657,
"grad_norm": 0.10129548609256744,
"learning_rate": 0.0001859963362507633,
"loss": 1.0814,
"step": 350
},
{
"epoch": 0.14271193331978044,
"grad_norm": 0.09523019194602966,
"learning_rate": 0.00018595562792591086,
"loss": 0.9848,
"step": 351
},
{
"epoch": 0.1431185200243952,
"grad_norm": 0.09485248476266861,
"learning_rate": 0.0001859149196010584,
"loss": 0.9828,
"step": 352
},
{
"epoch": 0.14352510672900995,
"grad_norm": 0.09963666647672653,
"learning_rate": 0.000185874211276206,
"loss": 1.1075,
"step": 353
},
{
"epoch": 0.14393169343362472,
"grad_norm": 0.09067155420780182,
"learning_rate": 0.00018583350295135355,
"loss": 0.971,
"step": 354
},
{
"epoch": 0.1443382801382395,
"grad_norm": 0.09153544157743454,
"learning_rate": 0.00018579279462650113,
"loss": 0.9405,
"step": 355
},
{
"epoch": 0.14474486684285423,
"grad_norm": 0.1024472787976265,
"learning_rate": 0.00018575208630164868,
"loss": 0.9967,
"step": 356
},
{
"epoch": 0.145151453547469,
"grad_norm": 0.09804495424032211,
"learning_rate": 0.00018571137797679626,
"loss": 0.9578,
"step": 357
},
{
"epoch": 0.14555804025208377,
"grad_norm": 0.099054716527462,
"learning_rate": 0.00018567066965194384,
"loss": 0.9999,
"step": 358
},
{
"epoch": 0.1459646269566985,
"grad_norm": 0.09781336784362793,
"learning_rate": 0.0001856299613270914,
"loss": 1.09,
"step": 359
},
{
"epoch": 0.14637121366131328,
"grad_norm": 0.08993211388587952,
"learning_rate": 0.00018558925300223898,
"loss": 1.0719,
"step": 360
},
{
"epoch": 0.14677780036592802,
"grad_norm": 0.09146003425121307,
"learning_rate": 0.00018554854467738653,
"loss": 1.0008,
"step": 361
},
{
"epoch": 0.1471843870705428,
"grad_norm": 0.09643495827913284,
"learning_rate": 0.00018550783635253411,
"loss": 1.0791,
"step": 362
},
{
"epoch": 0.14759097377515756,
"grad_norm": 0.09078676998615265,
"learning_rate": 0.00018546712802768167,
"loss": 0.8641,
"step": 363
},
{
"epoch": 0.1479975604797723,
"grad_norm": 0.08719085901975632,
"learning_rate": 0.00018542641970282922,
"loss": 0.985,
"step": 364
},
{
"epoch": 0.14840414718438708,
"grad_norm": 0.09189736843109131,
"learning_rate": 0.0001853857113779768,
"loss": 0.9638,
"step": 365
},
{
"epoch": 0.14881073388900182,
"grad_norm": 0.09381456673145294,
"learning_rate": 0.00018534500305312436,
"loss": 1.0036,
"step": 366
},
{
"epoch": 0.1492173205936166,
"grad_norm": 0.0922684445977211,
"learning_rate": 0.00018530429472827194,
"loss": 1.0391,
"step": 367
},
{
"epoch": 0.14962390729823136,
"grad_norm": 0.09465248882770538,
"learning_rate": 0.0001852635864034195,
"loss": 0.8874,
"step": 368
},
{
"epoch": 0.1500304940028461,
"grad_norm": 0.0938408225774765,
"learning_rate": 0.00018522287807856707,
"loss": 1.0269,
"step": 369
},
{
"epoch": 0.15043708070746087,
"grad_norm": 0.09377933293581009,
"learning_rate": 0.00018518216975371466,
"loss": 1.0142,
"step": 370
},
{
"epoch": 0.15084366741207564,
"grad_norm": 0.1117277517914772,
"learning_rate": 0.0001851414614288622,
"loss": 1.0371,
"step": 371
},
{
"epoch": 0.15125025411669038,
"grad_norm": 0.10293183475732803,
"learning_rate": 0.0001851007531040098,
"loss": 1.0,
"step": 372
},
{
"epoch": 0.15165684082130515,
"grad_norm": 0.09216313809156418,
"learning_rate": 0.00018506004477915734,
"loss": 0.9703,
"step": 373
},
{
"epoch": 0.1520634275259199,
"grad_norm": 0.09088669717311859,
"learning_rate": 0.00018501933645430493,
"loss": 0.8766,
"step": 374
},
{
"epoch": 0.15247001423053466,
"grad_norm": 0.09916643798351288,
"learning_rate": 0.00018497862812945248,
"loss": 1.0958,
"step": 375
},
{
"epoch": 0.15287660093514943,
"grad_norm": 0.08404985070228577,
"learning_rate": 0.00018493791980460003,
"loss": 0.9602,
"step": 376
},
{
"epoch": 0.15328318763976417,
"grad_norm": 0.10011377185583115,
"learning_rate": 0.00018489721147974762,
"loss": 1.0377,
"step": 377
},
{
"epoch": 0.15368977434437894,
"grad_norm": 0.09958089143037796,
"learning_rate": 0.00018485650315489517,
"loss": 1.0213,
"step": 378
},
{
"epoch": 0.15409636104899369,
"grad_norm": 0.09488838911056519,
"learning_rate": 0.00018481579483004275,
"loss": 0.941,
"step": 379
},
{
"epoch": 0.15450294775360846,
"grad_norm": 0.09099314361810684,
"learning_rate": 0.00018477508650519033,
"loss": 0.8913,
"step": 380
},
{
"epoch": 0.15490953445822322,
"grad_norm": 0.0956854447722435,
"learning_rate": 0.00018473437818033789,
"loss": 1.1478,
"step": 381
},
{
"epoch": 0.15531612116283797,
"grad_norm": 0.11225584149360657,
"learning_rate": 0.00018469366985548547,
"loss": 1.0795,
"step": 382
},
{
"epoch": 0.15572270786745274,
"grad_norm": 0.11592987924814224,
"learning_rate": 0.00018465296153063302,
"loss": 1.0863,
"step": 383
},
{
"epoch": 0.1561292945720675,
"grad_norm": 0.09232570976018906,
"learning_rate": 0.0001846122532057806,
"loss": 0.9551,
"step": 384
},
{
"epoch": 0.15653588127668225,
"grad_norm": 0.08860056847333908,
"learning_rate": 0.00018457154488092816,
"loss": 1.0206,
"step": 385
},
{
"epoch": 0.15694246798129702,
"grad_norm": 0.10788331180810928,
"learning_rate": 0.00018453083655607574,
"loss": 0.9378,
"step": 386
},
{
"epoch": 0.15734905468591176,
"grad_norm": 0.10758615285158157,
"learning_rate": 0.0001844901282312233,
"loss": 1.1149,
"step": 387
},
{
"epoch": 0.15775564139052653,
"grad_norm": 0.10551386326551437,
"learning_rate": 0.00018444941990637085,
"loss": 1.0729,
"step": 388
},
{
"epoch": 0.1581622280951413,
"grad_norm": 0.08733198046684265,
"learning_rate": 0.00018440871158151843,
"loss": 1.0058,
"step": 389
},
{
"epoch": 0.15856881479975604,
"grad_norm": 0.1095399409532547,
"learning_rate": 0.00018436800325666598,
"loss": 1.0566,
"step": 390
},
{
"epoch": 0.1589754015043708,
"grad_norm": 0.12356330454349518,
"learning_rate": 0.00018432729493181356,
"loss": 1.0173,
"step": 391
},
{
"epoch": 0.15938198820898555,
"grad_norm": 0.09934639930725098,
"learning_rate": 0.00018428658660696114,
"loss": 1.1237,
"step": 392
},
{
"epoch": 0.15978857491360032,
"grad_norm": 0.09402013570070267,
"learning_rate": 0.0001842458782821087,
"loss": 1.0018,
"step": 393
},
{
"epoch": 0.1601951616182151,
"grad_norm": 0.10511749237775803,
"learning_rate": 0.00018420516995725628,
"loss": 0.9844,
"step": 394
},
{
"epoch": 0.16060174832282983,
"grad_norm": 0.11193688213825226,
"learning_rate": 0.00018416446163240383,
"loss": 0.9888,
"step": 395
},
{
"epoch": 0.1610083350274446,
"grad_norm": 0.09895443916320801,
"learning_rate": 0.00018412375330755141,
"loss": 1.1045,
"step": 396
},
{
"epoch": 0.16141492173205937,
"grad_norm": 0.09660319238901138,
"learning_rate": 0.00018408304498269897,
"loss": 1.0457,
"step": 397
},
{
"epoch": 0.16182150843667412,
"grad_norm": 0.1339186728000641,
"learning_rate": 0.00018404233665784655,
"loss": 1.1266,
"step": 398
},
{
"epoch": 0.16222809514128889,
"grad_norm": 0.1154564693570137,
"learning_rate": 0.0001840016283329941,
"loss": 1.0299,
"step": 399
},
{
"epoch": 0.16263468184590363,
"grad_norm": 0.09698904305696487,
"learning_rate": 0.00018396092000814166,
"loss": 1.1101,
"step": 400
},
{
"epoch": 0.1630412685505184,
"grad_norm": 0.09455164521932602,
"learning_rate": 0.00018392021168328924,
"loss": 0.9928,
"step": 401
},
{
"epoch": 0.16344785525513317,
"grad_norm": 0.09728690981864929,
"learning_rate": 0.0001838795033584368,
"loss": 1.0603,
"step": 402
},
{
"epoch": 0.1638544419597479,
"grad_norm": 0.10577269643545151,
"learning_rate": 0.0001838387950335844,
"loss": 0.9922,
"step": 403
},
{
"epoch": 0.16426102866436268,
"grad_norm": 0.08850935101509094,
"learning_rate": 0.00018379808670873196,
"loss": 0.9758,
"step": 404
},
{
"epoch": 0.16466761536897742,
"grad_norm": 0.09496256709098816,
"learning_rate": 0.0001837573783838795,
"loss": 1.0949,
"step": 405
},
{
"epoch": 0.1650742020735922,
"grad_norm": 0.09768050909042358,
"learning_rate": 0.0001837166700590271,
"loss": 1.0054,
"step": 406
},
{
"epoch": 0.16548078877820696,
"grad_norm": 0.09913921356201172,
"learning_rate": 0.00018367596173417464,
"loss": 1.0272,
"step": 407
},
{
"epoch": 0.1658873754828217,
"grad_norm": 0.0901927724480629,
"learning_rate": 0.00018363525340932223,
"loss": 1.0264,
"step": 408
},
{
"epoch": 0.16629396218743647,
"grad_norm": 0.09796515852212906,
"learning_rate": 0.00018359454508446978,
"loss": 1.0338,
"step": 409
},
{
"epoch": 0.16670054889205124,
"grad_norm": 0.1018638014793396,
"learning_rate": 0.00018355383675961736,
"loss": 1.0409,
"step": 410
},
{
"epoch": 0.16710713559666598,
"grad_norm": 0.10666611790657043,
"learning_rate": 0.00018351312843476492,
"loss": 1.0924,
"step": 411
},
{
"epoch": 0.16751372230128075,
"grad_norm": 0.0986141785979271,
"learning_rate": 0.00018347242010991247,
"loss": 0.9468,
"step": 412
},
{
"epoch": 0.1679203090058955,
"grad_norm": 0.09429168701171875,
"learning_rate": 0.00018343171178506005,
"loss": 0.9706,
"step": 413
},
{
"epoch": 0.16832689571051027,
"grad_norm": 0.09704872965812683,
"learning_rate": 0.0001833910034602076,
"loss": 1.0692,
"step": 414
},
{
"epoch": 0.16873348241512504,
"grad_norm": 0.0980519950389862,
"learning_rate": 0.00018335029513535519,
"loss": 1.0218,
"step": 415
},
{
"epoch": 0.16914006911973978,
"grad_norm": 0.08980212360620499,
"learning_rate": 0.00018330958681050277,
"loss": 0.9243,
"step": 416
},
{
"epoch": 0.16954665582435455,
"grad_norm": 0.09630506485700607,
"learning_rate": 0.00018326887848565032,
"loss": 0.9599,
"step": 417
},
{
"epoch": 0.1699532425289693,
"grad_norm": 0.08608522266149521,
"learning_rate": 0.0001832281701607979,
"loss": 0.9577,
"step": 418
},
{
"epoch": 0.17035982923358406,
"grad_norm": 0.09151248633861542,
"learning_rate": 0.00018318746183594546,
"loss": 0.9956,
"step": 419
},
{
"epoch": 0.17076641593819883,
"grad_norm": 0.09689094871282578,
"learning_rate": 0.00018314675351109304,
"loss": 1.0999,
"step": 420
},
{
"epoch": 0.17117300264281357,
"grad_norm": 0.09316612035036087,
"learning_rate": 0.0001831060451862406,
"loss": 0.8572,
"step": 421
},
{
"epoch": 0.17157958934742834,
"grad_norm": 0.11449979990720749,
"learning_rate": 0.00018306533686138817,
"loss": 1.0328,
"step": 422
},
{
"epoch": 0.1719861760520431,
"grad_norm": 0.10802194476127625,
"learning_rate": 0.00018302462853653573,
"loss": 0.9785,
"step": 423
},
{
"epoch": 0.17239276275665785,
"grad_norm": 0.09997294098138809,
"learning_rate": 0.00018298392021168328,
"loss": 0.9778,
"step": 424
},
{
"epoch": 0.17279934946127262,
"grad_norm": 0.10244690626859665,
"learning_rate": 0.00018294321188683086,
"loss": 1.0874,
"step": 425
},
{
"epoch": 0.17320593616588736,
"grad_norm": 0.10659472644329071,
"learning_rate": 0.00018290250356197844,
"loss": 1.0196,
"step": 426
},
{
"epoch": 0.17361252287050213,
"grad_norm": 0.09812036156654358,
"learning_rate": 0.000182861795237126,
"loss": 0.9051,
"step": 427
},
{
"epoch": 0.1740191095751169,
"grad_norm": 0.845235288143158,
"learning_rate": 0.00018282108691227358,
"loss": 1.0531,
"step": 428
},
{
"epoch": 0.17442569627973165,
"grad_norm": 0.109995998442173,
"learning_rate": 0.00018278037858742113,
"loss": 1.001,
"step": 429
},
{
"epoch": 0.17483228298434642,
"grad_norm": 0.12578758597373962,
"learning_rate": 0.00018273967026256871,
"loss": 0.9513,
"step": 430
},
{
"epoch": 0.17523886968896116,
"grad_norm": 0.1585826873779297,
"learning_rate": 0.00018269896193771627,
"loss": 1.0091,
"step": 431
},
{
"epoch": 0.17564545639357593,
"grad_norm": 0.15150819718837738,
"learning_rate": 0.00018265825361286385,
"loss": 1.1045,
"step": 432
},
{
"epoch": 0.1760520430981907,
"grad_norm": 0.1110219806432724,
"learning_rate": 0.0001826175452880114,
"loss": 0.9877,
"step": 433
},
{
"epoch": 0.17645862980280544,
"grad_norm": 0.11296675354242325,
"learning_rate": 0.00018257683696315896,
"loss": 1.1317,
"step": 434
},
{
"epoch": 0.1768652165074202,
"grad_norm": 0.11464451253414154,
"learning_rate": 0.00018253612863830654,
"loss": 0.9485,
"step": 435
},
{
"epoch": 0.17727180321203498,
"grad_norm": 0.08836513012647629,
"learning_rate": 0.0001824954203134541,
"loss": 0.8667,
"step": 436
},
{
"epoch": 0.17767838991664972,
"grad_norm": 0.10697431862354279,
"learning_rate": 0.00018245471198860167,
"loss": 1.0692,
"step": 437
},
{
"epoch": 0.1780849766212645,
"grad_norm": 0.10565032064914703,
"learning_rate": 0.00018241400366374925,
"loss": 1.0723,
"step": 438
},
{
"epoch": 0.17849156332587923,
"grad_norm": 0.11343531310558319,
"learning_rate": 0.0001823732953388968,
"loss": 1.1038,
"step": 439
},
{
"epoch": 0.178898150030494,
"grad_norm": 0.10002034902572632,
"learning_rate": 0.0001823325870140444,
"loss": 0.9859,
"step": 440
},
{
"epoch": 0.17930473673510877,
"grad_norm": 0.10602378845214844,
"learning_rate": 0.00018229187868919194,
"loss": 1.1091,
"step": 441
},
{
"epoch": 0.1797113234397235,
"grad_norm": 0.09775001555681229,
"learning_rate": 0.00018225117036433953,
"loss": 1.0473,
"step": 442
},
{
"epoch": 0.18011791014433828,
"grad_norm": 0.09872320294380188,
"learning_rate": 0.00018221046203948708,
"loss": 1.0657,
"step": 443
},
{
"epoch": 0.18052449684895303,
"grad_norm": 0.0893816128373146,
"learning_rate": 0.00018216975371463466,
"loss": 0.915,
"step": 444
},
{
"epoch": 0.1809310835535678,
"grad_norm": 0.09870447218418121,
"learning_rate": 0.00018212904538978221,
"loss": 0.8847,
"step": 445
},
{
"epoch": 0.18133767025818257,
"grad_norm": 0.09775330871343613,
"learning_rate": 0.00018208833706492977,
"loss": 0.841,
"step": 446
},
{
"epoch": 0.1817442569627973,
"grad_norm": 0.10025996714830399,
"learning_rate": 0.00018204762874007735,
"loss": 0.9965,
"step": 447
},
{
"epoch": 0.18215084366741208,
"grad_norm": 0.09369905292987823,
"learning_rate": 0.0001820069204152249,
"loss": 0.9998,
"step": 448
},
{
"epoch": 0.18255743037202685,
"grad_norm": 0.09244808554649353,
"learning_rate": 0.0001819662120903725,
"loss": 0.9938,
"step": 449
},
{
"epoch": 0.1829640170766416,
"grad_norm": 0.12163155525922775,
"learning_rate": 0.00018192550376552007,
"loss": 1.1384,
"step": 450
},
{
"epoch": 0.18337060378125636,
"grad_norm": 0.08755457401275635,
"learning_rate": 0.00018188479544066762,
"loss": 0.9002,
"step": 451
},
{
"epoch": 0.1837771904858711,
"grad_norm": 0.0917607769370079,
"learning_rate": 0.0001818440871158152,
"loss": 0.9874,
"step": 452
},
{
"epoch": 0.18418377719048587,
"grad_norm": 0.09113719314336777,
"learning_rate": 0.00018180337879096276,
"loss": 1.0187,
"step": 453
},
{
"epoch": 0.18459036389510064,
"grad_norm": 0.08795943111181259,
"learning_rate": 0.00018176267046611034,
"loss": 0.902,
"step": 454
},
{
"epoch": 0.18499695059971538,
"grad_norm": 0.1016731783747673,
"learning_rate": 0.0001817219621412579,
"loss": 0.9933,
"step": 455
},
{
"epoch": 0.18540353730433015,
"grad_norm": 0.09413068741559982,
"learning_rate": 0.00018168125381640547,
"loss": 0.9448,
"step": 456
},
{
"epoch": 0.18581012400894492,
"grad_norm": 0.10015012323856354,
"learning_rate": 0.00018164054549155303,
"loss": 1.1458,
"step": 457
},
{
"epoch": 0.18621671071355966,
"grad_norm": 0.09086768329143524,
"learning_rate": 0.00018159983716670058,
"loss": 1.0543,
"step": 458
},
{
"epoch": 0.18662329741817443,
"grad_norm": 0.10910352319478989,
"learning_rate": 0.00018155912884184816,
"loss": 1.0078,
"step": 459
},
{
"epoch": 0.18702988412278918,
"grad_norm": 0.09674135595560074,
"learning_rate": 0.00018151842051699572,
"loss": 0.9758,
"step": 460
},
{
"epoch": 0.18743647082740394,
"grad_norm": 0.09108126163482666,
"learning_rate": 0.00018147771219214332,
"loss": 1.0038,
"step": 461
},
{
"epoch": 0.18784305753201871,
"grad_norm": 0.09710326045751572,
"learning_rate": 0.00018143700386729088,
"loss": 0.9693,
"step": 462
},
{
"epoch": 0.18824964423663346,
"grad_norm": 0.10069318860769272,
"learning_rate": 0.00018139629554243843,
"loss": 1.1005,
"step": 463
},
{
"epoch": 0.18865623094124823,
"grad_norm": 0.09434141218662262,
"learning_rate": 0.000181355587217586,
"loss": 1.0359,
"step": 464
},
{
"epoch": 0.18906281764586297,
"grad_norm": 0.09208261221647263,
"learning_rate": 0.00018131487889273357,
"loss": 1.0374,
"step": 465
},
{
"epoch": 0.18946940435047774,
"grad_norm": 0.09581121802330017,
"learning_rate": 0.00018127417056788115,
"loss": 1.0267,
"step": 466
},
{
"epoch": 0.1898759910550925,
"grad_norm": 0.09809669107198715,
"learning_rate": 0.0001812334622430287,
"loss": 1.0652,
"step": 467
},
{
"epoch": 0.19028257775970725,
"grad_norm": 0.08496394008398056,
"learning_rate": 0.00018119275391817628,
"loss": 0.9468,
"step": 468
},
{
"epoch": 0.19068916446432202,
"grad_norm": 0.09247399121522903,
"learning_rate": 0.00018115204559332384,
"loss": 1.0247,
"step": 469
},
{
"epoch": 0.1910957511689368,
"grad_norm": 0.10010971128940582,
"learning_rate": 0.0001811113372684714,
"loss": 0.9674,
"step": 470
},
{
"epoch": 0.19150233787355153,
"grad_norm": 0.09562191367149353,
"learning_rate": 0.00018107062894361897,
"loss": 0.9819,
"step": 471
},
{
"epoch": 0.1919089245781663,
"grad_norm": 0.09223975241184235,
"learning_rate": 0.00018102992061876655,
"loss": 1.0051,
"step": 472
},
{
"epoch": 0.19231551128278104,
"grad_norm": 0.09564565122127533,
"learning_rate": 0.00018098921229391414,
"loss": 0.908,
"step": 473
},
{
"epoch": 0.1927220979873958,
"grad_norm": 0.09371364116668701,
"learning_rate": 0.0001809485039690617,
"loss": 1.0195,
"step": 474
},
{
"epoch": 0.19312868469201058,
"grad_norm": 0.0895533412694931,
"learning_rate": 0.00018090779564420924,
"loss": 0.8912,
"step": 475
},
{
"epoch": 0.19353527139662532,
"grad_norm": 0.08874888718128204,
"learning_rate": 0.00018086708731935682,
"loss": 0.9941,
"step": 476
},
{
"epoch": 0.1939418581012401,
"grad_norm": 8989.1748046875,
"learning_rate": 0.00018082637899450438,
"loss": 1.0191,
"step": 477
},
{
"epoch": 0.19434844480585484,
"grad_norm": 0.09893982112407684,
"learning_rate": 0.00018078567066965196,
"loss": 1.1682,
"step": 478
},
{
"epoch": 0.1947550315104696,
"grad_norm": 0.09100797772407532,
"learning_rate": 0.00018074496234479951,
"loss": 0.9466,
"step": 479
},
{
"epoch": 0.19516161821508438,
"grad_norm": 0.10540256649255753,
"learning_rate": 0.0001807042540199471,
"loss": 1.0735,
"step": 480
},
{
"epoch": 0.19556820491969912,
"grad_norm": 0.09110235422849655,
"learning_rate": 0.00018066354569509465,
"loss": 1.0097,
"step": 481
},
{
"epoch": 0.1959747916243139,
"grad_norm": 0.10651825368404388,
"learning_rate": 0.0001806228373702422,
"loss": 1.014,
"step": 482
},
{
"epoch": 0.19638137832892866,
"grad_norm": 0.08685674518346786,
"learning_rate": 0.00018058212904538978,
"loss": 0.9755,
"step": 483
},
{
"epoch": 0.1967879650335434,
"grad_norm": 0.10092045366764069,
"learning_rate": 0.00018054142072053737,
"loss": 0.9397,
"step": 484
},
{
"epoch": 0.19719455173815817,
"grad_norm": 0.1056622639298439,
"learning_rate": 0.00018050071239568495,
"loss": 0.9864,
"step": 485
},
{
"epoch": 0.1976011384427729,
"grad_norm": 0.10525202006101608,
"learning_rate": 0.0001804600040708325,
"loss": 1.1085,
"step": 486
},
{
"epoch": 0.19800772514738768,
"grad_norm": 0.10073073953390121,
"learning_rate": 0.00018041929574598006,
"loss": 1.1264,
"step": 487
},
{
"epoch": 0.19841431185200245,
"grad_norm": 0.09659091383218765,
"learning_rate": 0.00018037858742112764,
"loss": 0.9848,
"step": 488
},
{
"epoch": 0.1988208985566172,
"grad_norm": 0.09986629337072372,
"learning_rate": 0.0001803378790962752,
"loss": 1.0732,
"step": 489
},
{
"epoch": 0.19922748526123196,
"grad_norm": 0.11215290427207947,
"learning_rate": 0.00018029717077142277,
"loss": 1.1259,
"step": 490
},
{
"epoch": 0.1996340719658467,
"grad_norm": 0.11136343330144882,
"learning_rate": 0.00018025646244657033,
"loss": 1.0857,
"step": 491
},
{
"epoch": 0.20004065867046147,
"grad_norm": 0.10452030599117279,
"learning_rate": 0.0001802157541217179,
"loss": 0.9997,
"step": 492
},
{
"epoch": 0.20044724537507624,
"grad_norm": 0.10394178330898285,
"learning_rate": 0.00018017504579686546,
"loss": 1.0852,
"step": 493
},
{
"epoch": 0.20085383207969099,
"grad_norm": 0.10206598043441772,
"learning_rate": 0.00018013433747201302,
"loss": 0.9629,
"step": 494
},
{
"epoch": 0.20126041878430576,
"grad_norm": 0.09365608543157578,
"learning_rate": 0.00018009362914716062,
"loss": 0.9504,
"step": 495
},
{
"epoch": 0.20166700548892053,
"grad_norm": 0.09425178170204163,
"learning_rate": 0.00018005292082230818,
"loss": 1.0038,
"step": 496
},
{
"epoch": 0.20207359219353527,
"grad_norm": 0.09562011808156967,
"learning_rate": 0.00018001221249745576,
"loss": 1.0877,
"step": 497
},
{
"epoch": 0.20248017889815004,
"grad_norm": 0.11452426016330719,
"learning_rate": 0.0001799715041726033,
"loss": 1.0688,
"step": 498
},
{
"epoch": 0.20288676560276478,
"grad_norm": 0.0930696651339531,
"learning_rate": 0.00017993079584775087,
"loss": 1.0255,
"step": 499
},
{
"epoch": 0.20329335230737955,
"grad_norm": 0.10522327572107315,
"learning_rate": 0.00017989008752289845,
"loss": 1.085,
"step": 500
},
{
"epoch": 0.20369993901199432,
"grad_norm": 0.08499190211296082,
"learning_rate": 0.000179849379198046,
"loss": 0.9235,
"step": 501
},
{
"epoch": 0.20410652571660906,
"grad_norm": 0.09169955551624298,
"learning_rate": 0.00017980867087319358,
"loss": 0.9836,
"step": 502
},
{
"epoch": 0.20451311242122383,
"grad_norm": 0.10331466048955917,
"learning_rate": 0.00017976796254834114,
"loss": 1.0255,
"step": 503
},
{
"epoch": 0.20491969912583857,
"grad_norm": 0.0900363028049469,
"learning_rate": 0.00017972725422348872,
"loss": 0.9691,
"step": 504
},
{
"epoch": 0.20532628583045334,
"grad_norm": 0.10095544904470444,
"learning_rate": 0.00017968654589863627,
"loss": 1.0289,
"step": 505
},
{
"epoch": 0.2057328725350681,
"grad_norm": 0.0992627814412117,
"learning_rate": 0.00017964583757378383,
"loss": 0.9785,
"step": 506
},
{
"epoch": 0.20613945923968285,
"grad_norm": 0.0954422652721405,
"learning_rate": 0.00017960512924893144,
"loss": 1.0105,
"step": 507
},
{
"epoch": 0.20654604594429762,
"grad_norm": 0.0994410440325737,
"learning_rate": 0.000179564420924079,
"loss": 1.0894,
"step": 508
},
{
"epoch": 0.2069526326489124,
"grad_norm": 0.08866444230079651,
"learning_rate": 0.00017952371259922654,
"loss": 0.9725,
"step": 509
},
{
"epoch": 0.20735921935352714,
"grad_norm": 0.09361348301172256,
"learning_rate": 0.00017948300427437412,
"loss": 1.0441,
"step": 510
},
{
"epoch": 0.2077658060581419,
"grad_norm": 0.08215323090553284,
"learning_rate": 0.00017944229594952168,
"loss": 0.9214,
"step": 511
},
{
"epoch": 0.20817239276275665,
"grad_norm": 0.09752262383699417,
"learning_rate": 0.00017940158762466926,
"loss": 0.9456,
"step": 512
},
{
"epoch": 0.20857897946737142,
"grad_norm": 0.10021419823169708,
"learning_rate": 0.00017936087929981681,
"loss": 1.1158,
"step": 513
},
{
"epoch": 0.2089855661719862,
"grad_norm": 0.09550227969884872,
"learning_rate": 0.0001793201709749644,
"loss": 0.9789,
"step": 514
},
{
"epoch": 0.20939215287660093,
"grad_norm": 0.09059977531433105,
"learning_rate": 0.00017927946265011195,
"loss": 0.9649,
"step": 515
},
{
"epoch": 0.2097987395812157,
"grad_norm": 0.09227627515792847,
"learning_rate": 0.00017923875432525953,
"loss": 0.9779,
"step": 516
},
{
"epoch": 0.21020532628583044,
"grad_norm": 0.09919798374176025,
"learning_rate": 0.00017919804600040708,
"loss": 1.0155,
"step": 517
},
{
"epoch": 0.2106119129904452,
"grad_norm": 0.09044051915407181,
"learning_rate": 0.00017915733767555464,
"loss": 0.9428,
"step": 518
},
{
"epoch": 0.21101849969505998,
"grad_norm": 0.09017504006624222,
"learning_rate": 0.00017911662935070225,
"loss": 0.9244,
"step": 519
},
{
"epoch": 0.21142508639967472,
"grad_norm": 0.09257036447525024,
"learning_rate": 0.0001790759210258498,
"loss": 1.0168,
"step": 520
},
{
"epoch": 0.2118316731042895,
"grad_norm": 0.0926235020160675,
"learning_rate": 0.00017903521270099735,
"loss": 0.9363,
"step": 521
},
{
"epoch": 0.21223825980890426,
"grad_norm": 0.08785069733858109,
"learning_rate": 0.00017899450437614494,
"loss": 0.9428,
"step": 522
},
{
"epoch": 0.212644846513519,
"grad_norm": 0.09824348986148834,
"learning_rate": 0.0001789537960512925,
"loss": 1.0378,
"step": 523
},
{
"epoch": 0.21305143321813377,
"grad_norm": 0.0915142148733139,
"learning_rate": 0.00017891308772644007,
"loss": 0.9603,
"step": 524
},
{
"epoch": 0.21345801992274852,
"grad_norm": 0.09466978907585144,
"learning_rate": 0.00017887237940158763,
"loss": 1.013,
"step": 525
},
{
"epoch": 0.21386460662736329,
"grad_norm": 0.09305880963802338,
"learning_rate": 0.0001788316710767352,
"loss": 0.9386,
"step": 526
},
{
"epoch": 0.21427119333197805,
"grad_norm": 0.09210691601037979,
"learning_rate": 0.00017879096275188276,
"loss": 0.9797,
"step": 527
},
{
"epoch": 0.2146777800365928,
"grad_norm": 0.10415366291999817,
"learning_rate": 0.00017875025442703031,
"loss": 1.0125,
"step": 528
},
{
"epoch": 0.21508436674120757,
"grad_norm": 0.10259640216827393,
"learning_rate": 0.0001787095461021779,
"loss": 1.0473,
"step": 529
},
{
"epoch": 0.2154909534458223,
"grad_norm": 0.09523239731788635,
"learning_rate": 0.00017866883777732548,
"loss": 0.9603,
"step": 530
},
{
"epoch": 0.21589754015043708,
"grad_norm": 0.10005185008049011,
"learning_rate": 0.00017862812945247306,
"loss": 1.0768,
"step": 531
},
{
"epoch": 0.21630412685505185,
"grad_norm": 0.09643250703811646,
"learning_rate": 0.0001785874211276206,
"loss": 1.0799,
"step": 532
},
{
"epoch": 0.2167107135596666,
"grad_norm": 0.09473159909248352,
"learning_rate": 0.00017854671280276817,
"loss": 1.0657,
"step": 533
},
{
"epoch": 0.21711730026428136,
"grad_norm": 0.09550385922193527,
"learning_rate": 0.00017850600447791575,
"loss": 1.0389,
"step": 534
},
{
"epoch": 0.21752388696889613,
"grad_norm": 0.09414463490247726,
"learning_rate": 0.0001784652961530633,
"loss": 1.0317,
"step": 535
},
{
"epoch": 0.21793047367351087,
"grad_norm": 0.090250164270401,
"learning_rate": 0.00017842458782821088,
"loss": 1.0212,
"step": 536
},
{
"epoch": 0.21833706037812564,
"grad_norm": 0.09635050594806671,
"learning_rate": 0.00017838387950335844,
"loss": 0.9473,
"step": 537
},
{
"epoch": 0.21874364708274038,
"grad_norm": 0.0985347330570221,
"learning_rate": 0.00017834317117850602,
"loss": 1.1372,
"step": 538
},
{
"epoch": 0.21915023378735515,
"grad_norm": 0.09789203107357025,
"learning_rate": 0.00017830246285365357,
"loss": 1.0369,
"step": 539
},
{
"epoch": 0.21955682049196992,
"grad_norm": 0.09777568280696869,
"learning_rate": 0.00017826175452880113,
"loss": 1.0746,
"step": 540
},
{
"epoch": 0.21996340719658466,
"grad_norm": 0.09013503789901733,
"learning_rate": 0.0001782210462039487,
"loss": 1.0124,
"step": 541
},
{
"epoch": 0.22036999390119943,
"grad_norm": 0.10604355484247208,
"learning_rate": 0.0001781803378790963,
"loss": 1.0158,
"step": 542
},
{
"epoch": 0.22077658060581418,
"grad_norm": 0.09194648265838623,
"learning_rate": 0.00017813962955424387,
"loss": 0.9544,
"step": 543
},
{
"epoch": 0.22118316731042895,
"grad_norm": 0.09223110228776932,
"learning_rate": 0.00017809892122939142,
"loss": 1.0094,
"step": 544
},
{
"epoch": 0.22158975401504372,
"grad_norm": 0.09049870073795319,
"learning_rate": 0.00017805821290453898,
"loss": 0.8829,
"step": 545
},
{
"epoch": 0.22199634071965846,
"grad_norm": 0.10157813131809235,
"learning_rate": 0.00017801750457968656,
"loss": 1.0904,
"step": 546
},
{
"epoch": 0.22240292742427323,
"grad_norm": 0.09934356063604355,
"learning_rate": 0.0001779767962548341,
"loss": 1.0708,
"step": 547
},
{
"epoch": 0.222809514128888,
"grad_norm": 0.09037156403064728,
"learning_rate": 0.0001779360879299817,
"loss": 0.916,
"step": 548
},
{
"epoch": 0.22321610083350274,
"grad_norm": 0.09347829967737198,
"learning_rate": 0.00017789537960512925,
"loss": 1.0328,
"step": 549
},
{
"epoch": 0.2236226875381175,
"grad_norm": 0.087796151638031,
"learning_rate": 0.00017785467128027683,
"loss": 0.9961,
"step": 550
},
{
"epoch": 0.22402927424273225,
"grad_norm": 0.09518422931432724,
"learning_rate": 0.00017781396295542438,
"loss": 0.9855,
"step": 551
},
{
"epoch": 0.22443586094734702,
"grad_norm": 0.09606748074293137,
"learning_rate": 0.00017777325463057194,
"loss": 0.954,
"step": 552
},
{
"epoch": 0.2248424476519618,
"grad_norm": 0.09338165074586868,
"learning_rate": 0.00017773254630571955,
"loss": 1.0876,
"step": 553
},
{
"epoch": 0.22524903435657653,
"grad_norm": 0.09242440015077591,
"learning_rate": 0.0001776918379808671,
"loss": 0.9418,
"step": 554
},
{
"epoch": 0.2256556210611913,
"grad_norm": 0.0990302637219429,
"learning_rate": 0.00017765112965601468,
"loss": 1.0641,
"step": 555
},
{
"epoch": 0.22606220776580604,
"grad_norm": 0.09444238990545273,
"learning_rate": 0.00017761042133116224,
"loss": 1.0315,
"step": 556
},
{
"epoch": 0.22646879447042081,
"grad_norm": 0.08771083503961563,
"learning_rate": 0.0001775697130063098,
"loss": 0.9898,
"step": 557
},
{
"epoch": 0.22687538117503558,
"grad_norm": 0.10041147470474243,
"learning_rate": 0.00017752900468145737,
"loss": 1.0478,
"step": 558
},
{
"epoch": 0.22728196787965033,
"grad_norm": 0.0933571383357048,
"learning_rate": 0.00017748829635660492,
"loss": 1.0002,
"step": 559
},
{
"epoch": 0.2276885545842651,
"grad_norm": 0.0912991389632225,
"learning_rate": 0.0001774475880317525,
"loss": 1.0807,
"step": 560
},
{
"epoch": 0.22809514128887987,
"grad_norm": 0.09350984543561935,
"learning_rate": 0.00017740687970690006,
"loss": 0.8962,
"step": 561
},
{
"epoch": 0.2285017279934946,
"grad_norm": 0.0978541299700737,
"learning_rate": 0.00017736617138204764,
"loss": 1.0339,
"step": 562
},
{
"epoch": 0.22890831469810938,
"grad_norm": 0.08964958041906357,
"learning_rate": 0.0001773254630571952,
"loss": 1.051,
"step": 563
},
{
"epoch": 0.22931490140272412,
"grad_norm": 0.09241898357868195,
"learning_rate": 0.00017728475473234275,
"loss": 0.903,
"step": 564
},
{
"epoch": 0.2297214881073389,
"grad_norm": 0.09366483986377716,
"learning_rate": 0.00017724404640749036,
"loss": 1.0055,
"step": 565
},
{
"epoch": 0.23012807481195366,
"grad_norm": 0.10184673964977264,
"learning_rate": 0.0001772033380826379,
"loss": 1.004,
"step": 566
},
{
"epoch": 0.2305346615165684,
"grad_norm": 0.09287306666374207,
"learning_rate": 0.0001771626297577855,
"loss": 0.9667,
"step": 567
},
{
"epoch": 0.23094124822118317,
"grad_norm": 0.08905091136693954,
"learning_rate": 0.00017712192143293305,
"loss": 0.9295,
"step": 568
},
{
"epoch": 0.2313478349257979,
"grad_norm": 0.0908786877989769,
"learning_rate": 0.0001770812131080806,
"loss": 0.8957,
"step": 569
},
{
"epoch": 0.23175442163041268,
"grad_norm": 0.10284281522035599,
"learning_rate": 0.00017704050478322818,
"loss": 1.1311,
"step": 570
},
{
"epoch": 0.23216100833502745,
"grad_norm": 0.09007006883621216,
"learning_rate": 0.00017699979645837574,
"loss": 0.9919,
"step": 571
},
{
"epoch": 0.2325675950396422,
"grad_norm": 0.09025272727012634,
"learning_rate": 0.00017695908813352332,
"loss": 0.9057,
"step": 572
},
{
"epoch": 0.23297418174425696,
"grad_norm": 0.0994710698723793,
"learning_rate": 0.00017691837980867087,
"loss": 1.1472,
"step": 573
},
{
"epoch": 0.23338076844887173,
"grad_norm": 0.09117428958415985,
"learning_rate": 0.00017687767148381845,
"loss": 0.9665,
"step": 574
},
{
"epoch": 0.23378735515348648,
"grad_norm": 0.0893009826540947,
"learning_rate": 0.000176836963158966,
"loss": 0.951,
"step": 575
},
{
"epoch": 0.23419394185810125,
"grad_norm": 0.08649599552154541,
"learning_rate": 0.0001767962548341136,
"loss": 0.925,
"step": 576
},
{
"epoch": 0.234600528562716,
"grad_norm": 0.0928448736667633,
"learning_rate": 0.00017675554650926117,
"loss": 0.9253,
"step": 577
},
{
"epoch": 0.23500711526733076,
"grad_norm": 0.10335158556699753,
"learning_rate": 0.00017671483818440872,
"loss": 1.1171,
"step": 578
},
{
"epoch": 0.23541370197194553,
"grad_norm": 0.09889842569828033,
"learning_rate": 0.0001766741298595563,
"loss": 1.0005,
"step": 579
},
{
"epoch": 0.23582028867656027,
"grad_norm": 0.09655506163835526,
"learning_rate": 0.00017663342153470386,
"loss": 1.0273,
"step": 580
},
{
"epoch": 0.23622687538117504,
"grad_norm": 0.09516560286283493,
"learning_rate": 0.0001765927132098514,
"loss": 1.024,
"step": 581
},
{
"epoch": 0.23663346208578978,
"grad_norm": 0.10024843364953995,
"learning_rate": 0.000176552004884999,
"loss": 1.0299,
"step": 582
},
{
"epoch": 0.23704004879040455,
"grad_norm": 0.10152596235275269,
"learning_rate": 0.00017651129656014655,
"loss": 0.9658,
"step": 583
},
{
"epoch": 0.23744663549501932,
"grad_norm": 0.09654249995946884,
"learning_rate": 0.00017647058823529413,
"loss": 1.0722,
"step": 584
},
{
"epoch": 0.23785322219963406,
"grad_norm": 0.09112072736024857,
"learning_rate": 0.00017642987991044168,
"loss": 0.9846,
"step": 585
},
{
"epoch": 0.23825980890424883,
"grad_norm": 0.09640034288167953,
"learning_rate": 0.00017638917158558926,
"loss": 1.0501,
"step": 586
},
{
"epoch": 0.2386663956088636,
"grad_norm": 0.09564584493637085,
"learning_rate": 0.00017634846326073682,
"loss": 0.955,
"step": 587
},
{
"epoch": 0.23907298231347834,
"grad_norm": 0.10815359652042389,
"learning_rate": 0.0001763077549358844,
"loss": 1.203,
"step": 588
},
{
"epoch": 0.2394795690180931,
"grad_norm": 0.09078256040811539,
"learning_rate": 0.00017626704661103198,
"loss": 0.9881,
"step": 589
},
{
"epoch": 0.23988615572270786,
"grad_norm": 0.09075487405061722,
"learning_rate": 0.00017622633828617954,
"loss": 0.984,
"step": 590
},
{
"epoch": 0.24029274242732263,
"grad_norm": 0.09048381447792053,
"learning_rate": 0.00017618562996132712,
"loss": 1.0235,
"step": 591
},
{
"epoch": 0.2406993291319374,
"grad_norm": 0.09820905327796936,
"learning_rate": 0.00017614492163647467,
"loss": 0.9763,
"step": 592
},
{
"epoch": 0.24110591583655214,
"grad_norm": 0.0961097925901413,
"learning_rate": 0.00017610421331162222,
"loss": 1.1035,
"step": 593
},
{
"epoch": 0.2415125025411669,
"grad_norm": 0.0877358540892601,
"learning_rate": 0.0001760635049867698,
"loss": 0.8962,
"step": 594
},
{
"epoch": 0.24191908924578168,
"grad_norm": 0.09730017930269241,
"learning_rate": 0.00017602279666191736,
"loss": 1.1232,
"step": 595
},
{
"epoch": 0.24232567595039642,
"grad_norm": 0.09486240148544312,
"learning_rate": 0.00017598208833706494,
"loss": 1.0566,
"step": 596
},
{
"epoch": 0.2427322626550112,
"grad_norm": 0.09367606788873672,
"learning_rate": 0.0001759413800122125,
"loss": 0.9934,
"step": 597
},
{
"epoch": 0.24313884935962593,
"grad_norm": 0.09046703577041626,
"learning_rate": 0.00017590067168736008,
"loss": 0.9137,
"step": 598
},
{
"epoch": 0.2435454360642407,
"grad_norm": 0.09512536972761154,
"learning_rate": 0.00017585996336250766,
"loss": 0.9733,
"step": 599
},
{
"epoch": 0.24395202276885547,
"grad_norm": 0.08619649708271027,
"learning_rate": 0.0001758192550376552,
"loss": 0.8777,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 4918,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.906257354398122e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}