{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 318, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0031446540880503146, "grad_norm": 0.06697794049978256, "learning_rate": 1.3333333333333333e-05, "loss": 1.3911, "step": 1 }, { "epoch": 0.006289308176100629, "grad_norm": 0.06842195242643356, "learning_rate": 2.6666666666666667e-05, "loss": 1.3973, "step": 2 }, { "epoch": 0.009433962264150943, "grad_norm": 0.06460106372833252, "learning_rate": 4e-05, "loss": 1.4096, "step": 3 }, { "epoch": 0.012578616352201259, "grad_norm": 0.0661928579211235, "learning_rate": 5.333333333333333e-05, "loss": 1.4273, "step": 4 }, { "epoch": 0.015723270440251572, "grad_norm": 0.06443281471729279, "learning_rate": 6.666666666666667e-05, "loss": 1.4056, "step": 5 }, { "epoch": 0.018867924528301886, "grad_norm": 0.06869488209486008, "learning_rate": 8e-05, "loss": 1.3854, "step": 6 }, { "epoch": 0.0220125786163522, "grad_norm": 0.06870010495185852, "learning_rate": 9.333333333333334e-05, "loss": 1.4054, "step": 7 }, { "epoch": 0.025157232704402517, "grad_norm": 0.072841115295887, "learning_rate": 0.00010666666666666667, "loss": 1.3904, "step": 8 }, { "epoch": 0.02830188679245283, "grad_norm": 0.06708419322967529, "learning_rate": 0.00012, "loss": 1.4035, "step": 9 }, { "epoch": 0.031446540880503145, "grad_norm": 0.06746525317430496, "learning_rate": 0.00013333333333333334, "loss": 1.3638, "step": 10 }, { "epoch": 0.03459119496855346, "grad_norm": 0.06512407958507538, "learning_rate": 0.00014666666666666666, "loss": 1.3506, "step": 11 }, { "epoch": 0.03773584905660377, "grad_norm": 0.05993957445025444, "learning_rate": 0.00016, "loss": 1.3772, "step": 12 }, { "epoch": 0.040880503144654086, "grad_norm": 0.059435583651065826, "learning_rate": 0.00017333333333333334, "loss": 1.3517, "step": 13 }, { "epoch": 0.0440251572327044, "grad_norm": 0.05579576641321182, "learning_rate": 0.0001866666666666667, "loss": 1.4105, "step": 14 }, { "epoch": 0.04716981132075472, "grad_norm": 0.04974433407187462, "learning_rate": 0.0002, "loss": 1.3162, "step": 15 }, { "epoch": 0.050314465408805034, "grad_norm": 0.04716808721423149, "learning_rate": 0.00019999462497359466, "loss": 1.3283, "step": 16 }, { "epoch": 0.05345911949685535, "grad_norm": 0.04468343406915665, "learning_rate": 0.0001999785004721968, "loss": 1.359, "step": 17 }, { "epoch": 0.05660377358490566, "grad_norm": 0.05088884010910988, "learning_rate": 0.00019995162822919883, "loss": 1.3484, "step": 18 }, { "epoch": 0.059748427672955975, "grad_norm": 0.05735902860760689, "learning_rate": 0.00019991401113338104, "loss": 1.3326, "step": 19 }, { "epoch": 0.06289308176100629, "grad_norm": 0.06404463201761246, "learning_rate": 0.00019986565322860115, "loss": 1.3403, "step": 20 }, { "epoch": 0.0660377358490566, "grad_norm": 0.0681706890463829, "learning_rate": 0.00019980655971335945, "loss": 1.3228, "step": 21 }, { "epoch": 0.06918238993710692, "grad_norm": 0.07040446251630783, "learning_rate": 0.00019973673694024, "loss": 1.3087, "step": 22 }, { "epoch": 0.07232704402515723, "grad_norm": 0.0640912875533104, "learning_rate": 0.0001996561924152278, "loss": 1.2798, "step": 23 }, { "epoch": 0.07547169811320754, "grad_norm": 0.05613941699266434, "learning_rate": 0.0001995649347969019, "loss": 1.2888, "step": 24 }, { "epoch": 0.07861635220125786, "grad_norm": 0.051170893013477325, "learning_rate": 0.00019946297389550433, "loss": 1.2697, "step": 25 }, { "epoch": 0.08176100628930817, "grad_norm": 0.044640567153692245, "learning_rate": 0.0001993503206718859, "loss": 1.2898, "step": 26 }, { "epoch": 0.08490566037735849, "grad_norm": 0.040906600654125214, "learning_rate": 0.00019922698723632767, "loss": 1.247, "step": 27 }, { "epoch": 0.0880503144654088, "grad_norm": 0.03780093416571617, "learning_rate": 0.00019909298684723904, "loss": 1.2751, "step": 28 }, { "epoch": 0.09119496855345911, "grad_norm": 0.03710748627781868, "learning_rate": 0.00019894833390973266, "loss": 1.287, "step": 29 }, { "epoch": 0.09433962264150944, "grad_norm": 0.03594716638326645, "learning_rate": 0.0001987930439740757, "loss": 1.2385, "step": 30 }, { "epoch": 0.09748427672955975, "grad_norm": 0.03679339960217476, "learning_rate": 0.0001986271337340182, "loss": 1.2415, "step": 31 }, { "epoch": 0.10062893081761007, "grad_norm": 0.03725181892514229, "learning_rate": 0.0001984506210249986, "loss": 1.2268, "step": 32 }, { "epoch": 0.10377358490566038, "grad_norm": 0.037984397262334824, "learning_rate": 0.00019826352482222638, "loss": 1.2402, "step": 33 }, { "epoch": 0.1069182389937107, "grad_norm": 0.037509895861148834, "learning_rate": 0.0001980658652386421, "loss": 1.2221, "step": 34 }, { "epoch": 0.11006289308176101, "grad_norm": 0.03687283396720886, "learning_rate": 0.00019785766352275542, "loss": 1.2386, "step": 35 }, { "epoch": 0.11320754716981132, "grad_norm": 0.03444783389568329, "learning_rate": 0.00019763894205636072, "loss": 1.2427, "step": 36 }, { "epoch": 0.11635220125786164, "grad_norm": 0.032733093947172165, "learning_rate": 0.00019740972435213115, "loss": 1.2309, "step": 37 }, { "epoch": 0.11949685534591195, "grad_norm": 0.029699521139264107, "learning_rate": 0.00019717003505109095, "loss": 1.2479, "step": 38 }, { "epoch": 0.12264150943396226, "grad_norm": 0.02603563852608204, "learning_rate": 0.00019691989991996663, "loss": 1.2196, "step": 39 }, { "epoch": 0.12578616352201258, "grad_norm": 0.02650611288845539, "learning_rate": 0.00019665934584841682, "loss": 1.2269, "step": 40 }, { "epoch": 0.1289308176100629, "grad_norm": 0.027458857744932175, "learning_rate": 0.00019638840084614182, "loss": 1.2625, "step": 41 }, { "epoch": 0.1320754716981132, "grad_norm": 0.027038419619202614, "learning_rate": 0.00019610709403987246, "loss": 1.231, "step": 42 }, { "epoch": 0.13522012578616352, "grad_norm": 0.02573474682867527, "learning_rate": 0.000195815455670239, "loss": 1.1967, "step": 43 }, { "epoch": 0.13836477987421383, "grad_norm": 0.026413045823574066, "learning_rate": 0.0001955135170885202, "loss": 1.1999, "step": 44 }, { "epoch": 0.14150943396226415, "grad_norm": 0.02456706203520298, "learning_rate": 0.00019520131075327298, "loss": 1.1724, "step": 45 }, { "epoch": 0.14465408805031446, "grad_norm": 0.02324003167450428, "learning_rate": 0.00019487887022684336, "loss": 1.1732, "step": 46 }, { "epoch": 0.14779874213836477, "grad_norm": 0.02492634579539299, "learning_rate": 0.00019454623017175812, "loss": 1.1922, "step": 47 }, { "epoch": 0.1509433962264151, "grad_norm": 0.026481660082936287, "learning_rate": 0.0001942034263469989, "loss": 1.1889, "step": 48 }, { "epoch": 0.1540880503144654, "grad_norm": 0.023594651371240616, "learning_rate": 0.00019385049560415794, "loss": 1.1819, "step": 49 }, { "epoch": 0.15723270440251572, "grad_norm": 0.024192512035369873, "learning_rate": 0.00019348747588347637, "loss": 1.1691, "step": 50 }, { "epoch": 0.16037735849056603, "grad_norm": 0.023232240229845047, "learning_rate": 0.00019311440620976597, "loss": 1.1819, "step": 51 }, { "epoch": 0.16352201257861634, "grad_norm": 0.02279943972826004, "learning_rate": 0.00019273132668821364, "loss": 1.2022, "step": 52 }, { "epoch": 0.16666666666666666, "grad_norm": 0.06585303694009781, "learning_rate": 0.00019233827850007027, "loss": 1.135, "step": 53 }, { "epoch": 0.16981132075471697, "grad_norm": 0.024625560268759727, "learning_rate": 0.00019193530389822363, "loss": 1.1774, "step": 54 }, { "epoch": 0.17295597484276728, "grad_norm": 0.024765564128756523, "learning_rate": 0.0001915224462026563, "loss": 1.1848, "step": 55 }, { "epoch": 0.1761006289308176, "grad_norm": 0.0233647171407938, "learning_rate": 0.0001910997497957885, "loss": 1.1821, "step": 56 }, { "epoch": 0.1792452830188679, "grad_norm": 0.02151089534163475, "learning_rate": 0.00019066726011770726, "loss": 1.1458, "step": 57 }, { "epoch": 0.18238993710691823, "grad_norm": 0.022214526310563087, "learning_rate": 0.00019022502366128135, "loss": 1.1492, "step": 58 }, { "epoch": 0.18553459119496854, "grad_norm": 0.0223999060690403, "learning_rate": 0.0001897730879671634, "loss": 1.1703, "step": 59 }, { "epoch": 0.18867924528301888, "grad_norm": 0.023374218493700027, "learning_rate": 0.00018931150161867916, "loss": 1.1797, "step": 60 }, { "epoch": 0.1918238993710692, "grad_norm": 0.02511228248476982, "learning_rate": 0.0001888403142366049, "loss": 1.2078, "step": 61 }, { "epoch": 0.1949685534591195, "grad_norm": 0.02414465881884098, "learning_rate": 0.00018835957647383303, "loss": 1.1902, "step": 62 }, { "epoch": 0.19811320754716982, "grad_norm": 0.02244570665061474, "learning_rate": 0.00018786934000992688, "loss": 1.1541, "step": 63 }, { "epoch": 0.20125786163522014, "grad_norm": 0.023515688255429268, "learning_rate": 0.00018736965754556528, "loss": 1.1401, "step": 64 }, { "epoch": 0.20440251572327045, "grad_norm": 0.02403687871992588, "learning_rate": 0.00018686058279687698, "loss": 1.1526, "step": 65 }, { "epoch": 0.20754716981132076, "grad_norm": 0.022151008248329163, "learning_rate": 0.00018634217048966637, "loss": 1.164, "step": 66 }, { "epoch": 0.21069182389937108, "grad_norm": 0.022764768451452255, "learning_rate": 0.0001858144763535302, "loss": 1.1572, "step": 67 }, { "epoch": 0.2138364779874214, "grad_norm": 0.024172818288207054, "learning_rate": 0.00018527755711586678, "loss": 1.1561, "step": 68 }, { "epoch": 0.2169811320754717, "grad_norm": 0.023120006546378136, "learning_rate": 0.00018473147049577774, "loss": 1.1264, "step": 69 }, { "epoch": 0.22012578616352202, "grad_norm": 0.022340824827551842, "learning_rate": 0.00018417627519786315, "loss": 1.1471, "step": 70 }, { "epoch": 0.22327044025157233, "grad_norm": 0.02570510096848011, "learning_rate": 0.00018361203090591071, "loss": 1.1302, "step": 71 }, { "epoch": 0.22641509433962265, "grad_norm": 0.02528996579349041, "learning_rate": 0.00018303879827647975, "loss": 1.1347, "step": 72 }, { "epoch": 0.22955974842767296, "grad_norm": 0.02298339456319809, "learning_rate": 0.00018245663893238075, "loss": 1.1202, "step": 73 }, { "epoch": 0.23270440251572327, "grad_norm": 0.023198647424578667, "learning_rate": 0.00018186561545605054, "loss": 1.1285, "step": 74 }, { "epoch": 0.2358490566037736, "grad_norm": 0.02332969196140766, "learning_rate": 0.00018126579138282503, "loss": 1.1382, "step": 75 }, { "epoch": 0.2389937106918239, "grad_norm": 0.023921016603708267, "learning_rate": 0.00018065723119410884, "loss": 1.1508, "step": 76 }, { "epoch": 0.24213836477987422, "grad_norm": 0.027694478631019592, "learning_rate": 0.0001800400003104436, "loss": 1.1209, "step": 77 }, { "epoch": 0.24528301886792453, "grad_norm": 0.02638174593448639, "learning_rate": 0.00017941416508447536, "loss": 1.1551, "step": 78 }, { "epoch": 0.24842767295597484, "grad_norm": 0.024342985823750496, "learning_rate": 0.00017877979279382135, "loss": 1.1033, "step": 79 }, { "epoch": 0.25157232704402516, "grad_norm": 0.0234859399497509, "learning_rate": 0.0001781369516338378, "loss": 1.0988, "step": 80 }, { "epoch": 0.25471698113207547, "grad_norm": 0.025128323584794998, "learning_rate": 0.000177485710710289, "loss": 1.1248, "step": 81 }, { "epoch": 0.2578616352201258, "grad_norm": 0.025595176964998245, "learning_rate": 0.00017682614003191807, "loss": 1.1429, "step": 82 }, { "epoch": 0.2610062893081761, "grad_norm": 0.02447207272052765, "learning_rate": 0.0001761583105029213, "loss": 1.0941, "step": 83 }, { "epoch": 0.2641509433962264, "grad_norm": 0.026511628180742264, "learning_rate": 0.00017548229391532572, "loss": 1.1529, "step": 84 }, { "epoch": 0.2672955974842767, "grad_norm": 0.026698730885982513, "learning_rate": 0.00017479816294127152, "loss": 1.0938, "step": 85 }, { "epoch": 0.27044025157232704, "grad_norm": 0.028718404471874237, "learning_rate": 0.0001741059911251997, "loss": 1.1071, "step": 86 }, { "epoch": 0.27358490566037735, "grad_norm": 0.02812567539513111, "learning_rate": 0.00017340585287594604, "loss": 1.1382, "step": 87 }, { "epoch": 0.27672955974842767, "grad_norm": 0.025351839140057564, "learning_rate": 0.00017269782345874203, "loss": 1.1061, "step": 88 }, { "epoch": 0.279874213836478, "grad_norm": 0.02537315897643566, "learning_rate": 0.00017198197898712404, "loss": 1.0935, "step": 89 }, { "epoch": 0.2830188679245283, "grad_norm": 0.027423014864325523, "learning_rate": 0.00017125839641475072, "loss": 1.0954, "step": 90 }, { "epoch": 0.2861635220125786, "grad_norm": 0.027652902528643608, "learning_rate": 0.00017052715352713075, "loss": 1.0975, "step": 91 }, { "epoch": 0.2893081761006289, "grad_norm": 0.029060475528240204, "learning_rate": 0.00016978832893326074, "loss": 1.1008, "step": 92 }, { "epoch": 0.29245283018867924, "grad_norm": 0.02606775052845478, "learning_rate": 0.0001690420020571747, "loss": 1.1125, "step": 93 }, { "epoch": 0.29559748427672955, "grad_norm": 0.025361906737089157, "learning_rate": 0.00016828825312940592, "loss": 1.1225, "step": 94 }, { "epoch": 0.29874213836477986, "grad_norm": 0.029504677280783653, "learning_rate": 0.00016752716317836229, "loss": 1.1281, "step": 95 }, { "epoch": 0.3018867924528302, "grad_norm": 0.027163010090589523, "learning_rate": 0.00016675881402161536, "loss": 1.0891, "step": 96 }, { "epoch": 0.3050314465408805, "grad_norm": 0.028238749131560326, "learning_rate": 0.00016598328825710533, "loss": 1.0732, "step": 97 }, { "epoch": 0.3081761006289308, "grad_norm": 0.02860194444656372, "learning_rate": 0.00016520066925426144, "loss": 1.1109, "step": 98 }, { "epoch": 0.3113207547169811, "grad_norm": 0.027443770319223404, "learning_rate": 0.0001644110411450398, "loss": 1.1037, "step": 99 }, { "epoch": 0.31446540880503143, "grad_norm": 0.02937367372214794, "learning_rate": 0.00016361448881487914, "loss": 1.1614, "step": 100 }, { "epoch": 0.31761006289308175, "grad_norm": 0.028245460242033005, "learning_rate": 0.0001628110978935756, "loss": 1.1193, "step": 101 }, { "epoch": 0.32075471698113206, "grad_norm": 0.03091912530362606, "learning_rate": 0.00016200095474607753, "loss": 1.0811, "step": 102 }, { "epoch": 0.3238993710691824, "grad_norm": 0.029428910464048386, "learning_rate": 0.0001611841464632011, "loss": 1.0946, "step": 103 }, { "epoch": 0.3270440251572327, "grad_norm": 0.02842988260090351, "learning_rate": 0.00016036076085226814, "loss": 1.0921, "step": 104 }, { "epoch": 0.330188679245283, "grad_norm": 0.028155898675322533, "learning_rate": 0.0001595308864276666, "loss": 1.0929, "step": 105 }, { "epoch": 0.3333333333333333, "grad_norm": 0.03080359846353531, "learning_rate": 0.0001586946124013354, "loss": 1.1039, "step": 106 }, { "epoch": 0.33647798742138363, "grad_norm": 0.03177150338888168, "learning_rate": 0.00015785202867317407, "loss": 1.0986, "step": 107 }, { "epoch": 0.33962264150943394, "grad_norm": 0.026763366535305977, "learning_rate": 0.00015700322582137827, "loss": 1.0686, "step": 108 }, { "epoch": 0.34276729559748426, "grad_norm": 0.027751443907618523, "learning_rate": 0.0001561482950927029, "loss": 1.1177, "step": 109 }, { "epoch": 0.34591194968553457, "grad_norm": 0.031205786392092705, "learning_rate": 0.00015528732839265272, "loss": 1.1045, "step": 110 }, { "epoch": 0.3490566037735849, "grad_norm": 0.029671067371964455, "learning_rate": 0.00015442041827560274, "loss": 1.0815, "step": 111 }, { "epoch": 0.3522012578616352, "grad_norm": 0.03158772736787796, "learning_rate": 0.00015354765793484834, "loss": 1.0811, "step": 112 }, { "epoch": 0.3553459119496855, "grad_norm": 0.03245990723371506, "learning_rate": 0.000152669141192587, "loss": 1.0923, "step": 113 }, { "epoch": 0.3584905660377358, "grad_norm": 0.030182786285877228, "learning_rate": 0.00015178496248983254, "loss": 1.0831, "step": 114 }, { "epoch": 0.36163522012578614, "grad_norm": 0.03249813988804817, "learning_rate": 0.00015089521687626243, "loss": 1.0955, "step": 115 }, { "epoch": 0.36477987421383645, "grad_norm": 0.029901932924985886, "learning_rate": 0.00015000000000000001, "loss": 1.0567, "step": 116 }, { "epoch": 0.36792452830188677, "grad_norm": 0.0314863882958889, "learning_rate": 0.00014909940809733222, "loss": 1.0759, "step": 117 }, { "epoch": 0.3710691823899371, "grad_norm": 0.03128151595592499, "learning_rate": 0.00014819353798236427, "loss": 1.0878, "step": 118 }, { "epoch": 0.3742138364779874, "grad_norm": 0.03267417103052139, "learning_rate": 0.00014728248703661182, "loss": 1.094, "step": 119 }, { "epoch": 0.37735849056603776, "grad_norm": 0.031713493168354034, "learning_rate": 0.00014636635319853275, "loss": 1.0845, "step": 120 }, { "epoch": 0.3805031446540881, "grad_norm": 0.029322847723960876, "learning_rate": 0.00014544523495299842, "loss": 1.0683, "step": 121 }, { "epoch": 0.3836477987421384, "grad_norm": 0.03310471028089523, "learning_rate": 0.0001445192313207067, "loss": 1.0798, "step": 122 }, { "epoch": 0.3867924528301887, "grad_norm": 0.03195233270525932, "learning_rate": 0.00014358844184753712, "loss": 1.0697, "step": 123 }, { "epoch": 0.389937106918239, "grad_norm": 0.034240156412124634, "learning_rate": 0.00014265296659384956, "loss": 1.0885, "step": 124 }, { "epoch": 0.39308176100628933, "grad_norm": 0.03241978958249092, "learning_rate": 0.0001417129061237278, "loss": 1.0647, "step": 125 }, { "epoch": 0.39622641509433965, "grad_norm": 0.03068430908024311, "learning_rate": 0.00014076836149416887, "loss": 1.0647, "step": 126 }, { "epoch": 0.39937106918238996, "grad_norm": 0.03438032045960426, "learning_rate": 0.00013981943424421932, "loss": 1.0939, "step": 127 }, { "epoch": 0.4025157232704403, "grad_norm": 0.031215351074934006, "learning_rate": 0.00013886622638405952, "loss": 1.0694, "step": 128 }, { "epoch": 0.4056603773584906, "grad_norm": 0.035429947078228, "learning_rate": 0.00013790884038403795, "loss": 1.1149, "step": 129 }, { "epoch": 0.4088050314465409, "grad_norm": 0.03237266466021538, "learning_rate": 0.00013694737916365517, "loss": 1.0778, "step": 130 }, { "epoch": 0.4119496855345912, "grad_norm": 0.034300774335861206, "learning_rate": 0.0001359819460805001, "loss": 1.0872, "step": 131 }, { "epoch": 0.41509433962264153, "grad_norm": 0.03338664770126343, "learning_rate": 0.00013501264491913906, "loss": 1.0809, "step": 132 }, { "epoch": 0.41823899371069184, "grad_norm": 0.03169442340731621, "learning_rate": 0.00013403957987995882, "loss": 1.0436, "step": 133 }, { "epoch": 0.42138364779874216, "grad_norm": 0.03536612167954445, "learning_rate": 0.00013306285556796495, "loss": 1.0643, "step": 134 }, { "epoch": 0.42452830188679247, "grad_norm": 0.03312570974230766, "learning_rate": 0.00013208257698153677, "loss": 1.0879, "step": 135 }, { "epoch": 0.4276729559748428, "grad_norm": 0.035323478281497955, "learning_rate": 0.00013109884950114007, "loss": 1.0946, "step": 136 }, { "epoch": 0.4308176100628931, "grad_norm": 0.03307751566171646, "learning_rate": 0.00013011177887799845, "loss": 1.0574, "step": 137 }, { "epoch": 0.4339622641509434, "grad_norm": 0.03208519518375397, "learning_rate": 0.00012912147122272523, "loss": 1.0563, "step": 138 }, { "epoch": 0.4371069182389937, "grad_norm": 0.03215700760483742, "learning_rate": 0.00012812803299391628, "loss": 1.0615, "step": 139 }, { "epoch": 0.44025157232704404, "grad_norm": 0.03540361300110817, "learning_rate": 0.0001271315709867059, "loss": 1.0903, "step": 140 }, { "epoch": 0.44339622641509435, "grad_norm": 0.03418035805225372, "learning_rate": 0.00012613219232128608, "loss": 1.0589, "step": 141 }, { "epoch": 0.44654088050314467, "grad_norm": 0.032720983028411865, "learning_rate": 0.00012513000443139112, "loss": 1.0394, "step": 142 }, { "epoch": 0.449685534591195, "grad_norm": 0.03251456469297409, "learning_rate": 0.00012412511505274844, "loss": 1.0459, "step": 143 }, { "epoch": 0.4528301886792453, "grad_norm": 0.03547577187418938, "learning_rate": 0.000123117632211497, "loss": 1.0919, "step": 144 }, { "epoch": 0.4559748427672956, "grad_norm": 0.03621995821595192, "learning_rate": 0.0001221076642125742, "loss": 1.0428, "step": 145 }, { "epoch": 0.4591194968553459, "grad_norm": 0.03383413329720497, "learning_rate": 0.00012109531962807332, "loss": 1.0704, "step": 146 }, { "epoch": 0.46226415094339623, "grad_norm": 0.031702034175395966, "learning_rate": 0.00012008070728557186, "loss": 1.0328, "step": 147 }, { "epoch": 0.46540880503144655, "grad_norm": 0.039653629064559937, "learning_rate": 0.00011906393625643244, "loss": 1.0568, "step": 148 }, { "epoch": 0.46855345911949686, "grad_norm": 0.037315912544727325, "learning_rate": 0.00011804511584407763, "loss": 1.0668, "step": 149 }, { "epoch": 0.4716981132075472, "grad_norm": 0.03531115874648094, "learning_rate": 0.00011702435557223987, "loss": 1.0827, "step": 150 }, { "epoch": 0.4748427672955975, "grad_norm": 0.03649010509252548, "learning_rate": 0.00011600176517318741, "loss": 1.0796, "step": 151 }, { "epoch": 0.4779874213836478, "grad_norm": 0.04164504259824753, "learning_rate": 0.00011497745457592816, "loss": 1.0314, "step": 152 }, { "epoch": 0.4811320754716981, "grad_norm": 0.037900954484939575, "learning_rate": 0.00011395153389439233, "loss": 1.0668, "step": 153 }, { "epoch": 0.48427672955974843, "grad_norm": 0.034743502736091614, "learning_rate": 0.0001129241134155949, "loss": 1.0575, "step": 154 }, { "epoch": 0.48742138364779874, "grad_norm": 0.05526720732450485, "learning_rate": 0.00011189530358778005, "loss": 1.0537, "step": 155 }, { "epoch": 0.49056603773584906, "grad_norm": 0.03674091398715973, "learning_rate": 0.00011086521500854745, "loss": 1.0612, "step": 156 }, { "epoch": 0.4937106918238994, "grad_norm": 0.03560490161180496, "learning_rate": 0.00010983395841296348, "loss": 1.0461, "step": 157 }, { "epoch": 0.4968553459119497, "grad_norm": 0.03683093190193176, "learning_rate": 0.00010880164466165674, "loss": 1.0489, "step": 158 }, { "epoch": 0.5, "grad_norm": 0.034947801381349564, "learning_rate": 0.00010776838472890065, "loss": 1.0908, "step": 159 }, { "epoch": 0.5031446540880503, "grad_norm": 0.034155167639255524, "learning_rate": 0.00010673428969068364, "loss": 1.0883, "step": 160 }, { "epoch": 0.5062893081761006, "grad_norm": 0.03542330116033554, "learning_rate": 0.00010569947071276847, "loss": 1.0629, "step": 161 }, { "epoch": 0.5094339622641509, "grad_norm": 0.0372898206114769, "learning_rate": 0.00010466403903874176, "loss": 1.0515, "step": 162 }, { "epoch": 0.5125786163522013, "grad_norm": 0.03636344522237778, "learning_rate": 0.00010362810597805526, "loss": 1.0905, "step": 163 }, { "epoch": 0.5157232704402516, "grad_norm": 0.035335466265678406, "learning_rate": 0.00010259178289406011, "loss": 1.0698, "step": 164 }, { "epoch": 0.5188679245283019, "grad_norm": 0.036180030554533005, "learning_rate": 0.0001015551811920351, "loss": 1.0487, "step": 165 }, { "epoch": 0.5220125786163522, "grad_norm": 0.03546663746237755, "learning_rate": 0.00010051841230721065, "loss": 1.0336, "step": 166 }, { "epoch": 0.5251572327044025, "grad_norm": 0.03683155030012131, "learning_rate": 9.948158769278939e-05, "loss": 1.0628, "step": 167 }, { "epoch": 0.5283018867924528, "grad_norm": 0.03633348271250725, "learning_rate": 9.844481880796491e-05, "loss": 1.0646, "step": 168 }, { "epoch": 0.5314465408805031, "grad_norm": 0.03651515021920204, "learning_rate": 9.740821710593989e-05, "loss": 1.0584, "step": 169 }, { "epoch": 0.5345911949685535, "grad_norm": 0.03433886170387268, "learning_rate": 9.637189402194476e-05, "loss": 1.0537, "step": 170 }, { "epoch": 0.5377358490566038, "grad_norm": 0.046192716807127, "learning_rate": 9.533596096125825e-05, "loss": 1.0409, "step": 171 }, { "epoch": 0.5408805031446541, "grad_norm": 0.03568156436085701, "learning_rate": 9.430052928723153e-05, "loss": 1.0278, "step": 172 }, { "epoch": 0.5440251572327044, "grad_norm": 0.040810681879520416, "learning_rate": 9.326571030931637e-05, "loss": 1.0405, "step": 173 }, { "epoch": 0.5471698113207547, "grad_norm": 0.03588728979229927, "learning_rate": 9.223161527109937e-05, "loss": 1.065, "step": 174 }, { "epoch": 0.550314465408805, "grad_norm": 0.03548993915319443, "learning_rate": 9.119835533834331e-05, "loss": 1.0065, "step": 175 }, { "epoch": 0.5534591194968553, "grad_norm": 0.04264102876186371, "learning_rate": 9.016604158703654e-05, "loss": 1.0668, "step": 176 }, { "epoch": 0.5566037735849056, "grad_norm": 0.03986184671521187, "learning_rate": 8.913478499145254e-05, "loss": 1.0512, "step": 177 }, { "epoch": 0.559748427672956, "grad_norm": 0.03871089220046997, "learning_rate": 8.810469641222001e-05, "loss": 1.0413, "step": 178 }, { "epoch": 0.5628930817610063, "grad_norm": 0.03574568033218384, "learning_rate": 8.707588658440511e-05, "loss": 1.0293, "step": 179 }, { "epoch": 0.5660377358490566, "grad_norm": 0.037175796926021576, "learning_rate": 8.604846610560771e-05, "loss": 1.0246, "step": 180 }, { "epoch": 0.5691823899371069, "grad_norm": 0.04187128692865372, "learning_rate": 8.502254542407186e-05, "loss": 1.023, "step": 181 }, { "epoch": 0.5723270440251572, "grad_norm": 0.04172036051750183, "learning_rate": 8.399823482681262e-05, "loss": 1.0455, "step": 182 }, { "epoch": 0.5754716981132075, "grad_norm": 0.03626122325658798, "learning_rate": 8.297564442776014e-05, "loss": 1.0457, "step": 183 }, { "epoch": 0.5786163522012578, "grad_norm": 0.03596337512135506, "learning_rate": 8.195488415592238e-05, "loss": 1.0521, "step": 184 }, { "epoch": 0.5817610062893082, "grad_norm": 0.03914599120616913, "learning_rate": 8.093606374356759e-05, "loss": 1.0645, "step": 185 }, { "epoch": 0.5849056603773585, "grad_norm": 0.044063687324523926, "learning_rate": 7.991929271442817e-05, "loss": 1.0677, "step": 186 }, { "epoch": 0.5880503144654088, "grad_norm": 0.04163552075624466, "learning_rate": 7.89046803719267e-05, "loss": 1.0568, "step": 187 }, { "epoch": 0.5911949685534591, "grad_norm": 0.036366574466228485, "learning_rate": 7.789233578742582e-05, "loss": 1.0038, "step": 188 }, { "epoch": 0.5943396226415094, "grad_norm": 0.04061400517821312, "learning_rate": 7.688236778850306e-05, "loss": 1.0462, "step": 189 }, { "epoch": 0.5974842767295597, "grad_norm": 0.03604275360703468, "learning_rate": 7.587488494725157e-05, "loss": 1.0275, "step": 190 }, { "epoch": 0.60062893081761, "grad_norm": 0.03972569853067398, "learning_rate": 7.48699955686089e-05, "loss": 1.0402, "step": 191 }, { "epoch": 0.6037735849056604, "grad_norm": 0.04172028228640556, "learning_rate": 7.386780767871397e-05, "loss": 1.0416, "step": 192 }, { "epoch": 0.6069182389937107, "grad_norm": 0.03570333123207092, "learning_rate": 7.286842901329412e-05, "loss": 1.0459, "step": 193 }, { "epoch": 0.610062893081761, "grad_norm": 0.037412162870168686, "learning_rate": 7.187196700608373e-05, "loss": 1.0556, "step": 194 }, { "epoch": 0.6132075471698113, "grad_norm": 0.038102056831121445, "learning_rate": 7.087852877727481e-05, "loss": 1.0301, "step": 195 }, { "epoch": 0.6163522012578616, "grad_norm": 0.037487804889678955, "learning_rate": 6.988822112200156e-05, "loss": 1.0494, "step": 196 }, { "epoch": 0.6194968553459119, "grad_norm": 0.03777475655078888, "learning_rate": 6.890115049885994e-05, "loss": 0.9972, "step": 197 }, { "epoch": 0.6226415094339622, "grad_norm": 0.04026506096124649, "learning_rate": 6.791742301846326e-05, "loss": 1.0068, "step": 198 }, { "epoch": 0.6257861635220126, "grad_norm": 0.03857170045375824, "learning_rate": 6.693714443203507e-05, "loss": 1.0468, "step": 199 }, { "epoch": 0.6289308176100629, "grad_norm": 0.038687944412231445, "learning_rate": 6.59604201200412e-05, "loss": 1.021, "step": 200 }, { "epoch": 0.6320754716981132, "grad_norm": 0.03843434900045395, "learning_rate": 6.498735508086093e-05, "loss": 1.0443, "step": 201 }, { "epoch": 0.6352201257861635, "grad_norm": 0.03765735775232315, "learning_rate": 6.40180539194999e-05, "loss": 1.0068, "step": 202 }, { "epoch": 0.6383647798742138, "grad_norm": 0.038186896592378616, "learning_rate": 6.305262083634488e-05, "loss": 1.0368, "step": 203 }, { "epoch": 0.6415094339622641, "grad_norm": 0.03744081035256386, "learning_rate": 6.209115961596208e-05, "loss": 1.0035, "step": 204 }, { "epoch": 0.6446540880503144, "grad_norm": 0.03738857060670853, "learning_rate": 6.113377361594049e-05, "loss": 1.0343, "step": 205 }, { "epoch": 0.6477987421383647, "grad_norm": 0.03938114643096924, "learning_rate": 6.018056575578075e-05, "loss": 1.041, "step": 206 }, { "epoch": 0.6509433962264151, "grad_norm": 0.0429544560611248, "learning_rate": 5.923163850583113e-05, "loss": 1.0455, "step": 207 }, { "epoch": 0.6540880503144654, "grad_norm": 0.03791610524058342, "learning_rate": 5.828709387627218e-05, "loss": 1.0284, "step": 208 }, { "epoch": 0.6572327044025157, "grad_norm": 0.038352545350790024, "learning_rate": 5.73470334061505e-05, "loss": 1.0279, "step": 209 }, { "epoch": 0.660377358490566, "grad_norm": 0.03907958045601845, "learning_rate": 5.6411558152462894e-05, "loss": 1.0711, "step": 210 }, { "epoch": 0.6635220125786163, "grad_norm": 0.03748472407460213, "learning_rate": 5.54807686792933e-05, "loss": 1.0187, "step": 211 }, { "epoch": 0.6666666666666666, "grad_norm": 0.13872897624969482, "learning_rate": 5.4554765047001613e-05, "loss": 1.0482, "step": 212 }, { "epoch": 0.6698113207547169, "grad_norm": 0.04007211700081825, "learning_rate": 5.363364680146725e-05, "loss": 1.0525, "step": 213 }, { "epoch": 0.6729559748427673, "grad_norm": 0.038152776658535004, "learning_rate": 5.271751296338823e-05, "loss": 1.0222, "step": 214 }, { "epoch": 0.6761006289308176, "grad_norm": 0.03928610309958458, "learning_rate": 5.180646201763577e-05, "loss": 1.06, "step": 215 }, { "epoch": 0.6792452830188679, "grad_norm": 0.03823390603065491, "learning_rate": 5.090059190266779e-05, "loss": 1.006, "step": 216 }, { "epoch": 0.6823899371069182, "grad_norm": 0.03753795474767685, "learning_rate": 5.000000000000002e-05, "loss": 1.0471, "step": 217 }, { "epoch": 0.6855345911949685, "grad_norm": 0.03927240148186684, "learning_rate": 4.9104783123737566e-05, "loss": 1.0211, "step": 218 }, { "epoch": 0.6886792452830188, "grad_norm": 0.038637347519397736, "learning_rate": 4.821503751016746e-05, "loss": 1.0393, "step": 219 }, { "epoch": 0.6918238993710691, "grad_norm": 0.04003263637423515, "learning_rate": 4.733085880741301e-05, "loss": 1.0387, "step": 220 }, { "epoch": 0.6949685534591195, "grad_norm": 0.037788983434438705, "learning_rate": 4.645234206515171e-05, "loss": 1.0395, "step": 221 }, { "epoch": 0.6981132075471698, "grad_norm": 0.037437207996845245, "learning_rate": 4.5579581724397255e-05, "loss": 1.002, "step": 222 }, { "epoch": 0.7012578616352201, "grad_norm": 0.03973449021577835, "learning_rate": 4.471267160734731e-05, "loss": 1.0101, "step": 223 }, { "epoch": 0.7044025157232704, "grad_norm": 0.04157485440373421, "learning_rate": 4.385170490729712e-05, "loss": 1.0547, "step": 224 }, { "epoch": 0.7075471698113207, "grad_norm": 0.03971412032842636, "learning_rate": 4.2996774178621736e-05, "loss": 1.0327, "step": 225 }, { "epoch": 0.710691823899371, "grad_norm": 0.042363688349723816, "learning_rate": 4.2147971326825966e-05, "loss": 1.0115, "step": 226 }, { "epoch": 0.7138364779874213, "grad_norm": 0.03927742689847946, "learning_rate": 4.130538759866457e-05, "loss": 1.037, "step": 227 }, { "epoch": 0.7169811320754716, "grad_norm": 0.04383242875337601, "learning_rate": 4.046911357233343e-05, "loss": 1.0336, "step": 228 }, { "epoch": 0.720125786163522, "grad_norm": 0.041160885244607925, "learning_rate": 3.963923914773187e-05, "loss": 1.0453, "step": 229 }, { "epoch": 0.7232704402515723, "grad_norm": 0.038153354078531265, "learning_rate": 3.8815853536798904e-05, "loss": 1.0438, "step": 230 }, { "epoch": 0.7264150943396226, "grad_norm": 0.039117470383644104, "learning_rate": 3.79990452539225e-05, "loss": 1.0131, "step": 231 }, { "epoch": 0.7295597484276729, "grad_norm": 0.037614606320858, "learning_rate": 3.7188902106424416e-05, "loss": 1.0308, "step": 232 }, { "epoch": 0.7327044025157232, "grad_norm": 0.03742281720042229, "learning_rate": 3.638551118512089e-05, "loss": 1.0343, "step": 233 }, { "epoch": 0.7358490566037735, "grad_norm": 0.040659379214048386, "learning_rate": 3.558895885496023e-05, "loss": 1.0206, "step": 234 }, { "epoch": 0.7389937106918238, "grad_norm": 0.039581410586833954, "learning_rate": 3.479933074573858e-05, "loss": 1.0209, "step": 235 }, { "epoch": 0.7421383647798742, "grad_norm": 0.03877450153231621, "learning_rate": 3.401671174289469e-05, "loss": 1.0242, "step": 236 }, { "epoch": 0.7452830188679245, "grad_norm": 0.03689349815249443, "learning_rate": 3.324118597838464e-05, "loss": 1.0064, "step": 237 }, { "epoch": 0.7484276729559748, "grad_norm": 0.039353396743535995, "learning_rate": 3.2472836821637744e-05, "loss": 1.0392, "step": 238 }, { "epoch": 0.7515723270440252, "grad_norm": 0.04024632275104523, "learning_rate": 3.1711746870594086e-05, "loss": 1.0398, "step": 239 }, { "epoch": 0.7547169811320755, "grad_norm": 0.0384189747273922, "learning_rate": 3.0957997942825336e-05, "loss": 1.0508, "step": 240 }, { "epoch": 0.7578616352201258, "grad_norm": 0.038072239607572556, "learning_rate": 3.021167106673928e-05, "loss": 1.0274, "step": 241 }, { "epoch": 0.7610062893081762, "grad_norm": 0.03652197867631912, "learning_rate": 2.9472846472869298e-05, "loss": 1.0091, "step": 242 }, { "epoch": 0.7641509433962265, "grad_norm": 0.04008382558822632, "learning_rate": 2.874160358524931e-05, "loss": 1.0118, "step": 243 }, { "epoch": 0.7672955974842768, "grad_norm": 0.038193073123693466, "learning_rate": 2.8018021012875994e-05, "loss": 1.0492, "step": 244 }, { "epoch": 0.7704402515723271, "grad_norm": 0.04008280113339424, "learning_rate": 2.7302176541257986e-05, "loss": 1.0087, "step": 245 }, { "epoch": 0.7735849056603774, "grad_norm": 0.040726155042648315, "learning_rate": 2.659414712405398e-05, "loss": 1.0427, "step": 246 }, { "epoch": 0.7767295597484277, "grad_norm": 0.03964506462216377, "learning_rate": 2.5894008874800325e-05, "loss": 1.0377, "step": 247 }, { "epoch": 0.779874213836478, "grad_norm": 0.03894224017858505, "learning_rate": 2.5201837058728505e-05, "loss": 1.0362, "step": 248 }, { "epoch": 0.7830188679245284, "grad_norm": 0.038798924535512924, "learning_rate": 2.451770608467432e-05, "loss": 1.0383, "step": 249 }, { "epoch": 0.7861635220125787, "grad_norm": 0.03763001784682274, "learning_rate": 2.3841689497078746e-05, "loss": 1.0488, "step": 250 }, { "epoch": 0.789308176100629, "grad_norm": 0.04090484231710434, "learning_rate": 2.3173859968081944e-05, "loss": 1.0297, "step": 251 }, { "epoch": 0.7924528301886793, "grad_norm": 0.039545051753520966, "learning_rate": 2.251428928971102e-05, "loss": 1.0396, "step": 252 }, { "epoch": 0.7955974842767296, "grad_norm": 0.037017423659563065, "learning_rate": 2.1863048366162208e-05, "loss": 1.0178, "step": 253 }, { "epoch": 0.7987421383647799, "grad_norm": 0.03963112458586693, "learning_rate": 2.1220207206178688e-05, "loss": 1.025, "step": 254 }, { "epoch": 0.8018867924528302, "grad_norm": 0.03978583589196205, "learning_rate": 2.058583491552465e-05, "loss": 1.0226, "step": 255 }, { "epoch": 0.8050314465408805, "grad_norm": 0.03923904895782471, "learning_rate": 1.995999968955641e-05, "loss": 1.0291, "step": 256 }, { "epoch": 0.8081761006289309, "grad_norm": 0.03717755898833275, "learning_rate": 1.9342768805891178e-05, "loss": 1.0262, "step": 257 }, { "epoch": 0.8113207547169812, "grad_norm": 0.03690655902028084, "learning_rate": 1.8734208617174988e-05, "loss": 1.0263, "step": 258 }, { "epoch": 0.8144654088050315, "grad_norm": 0.038003891706466675, "learning_rate": 1.8134384543949478e-05, "loss": 1.0279, "step": 259 }, { "epoch": 0.8176100628930818, "grad_norm": 0.037383392453193665, "learning_rate": 1.754336106761927e-05, "loss": 1.0184, "step": 260 }, { "epoch": 0.8207547169811321, "grad_norm": 0.038551997393369675, "learning_rate": 1.696120172352025e-05, "loss": 1.055, "step": 261 }, { "epoch": 0.8238993710691824, "grad_norm": 0.03848763927817345, "learning_rate": 1.6387969094089316e-05, "loss": 1.0276, "step": 262 }, { "epoch": 0.8270440251572327, "grad_norm": 0.03697813302278519, "learning_rate": 1.5823724802136865e-05, "loss": 1.0107, "step": 263 }, { "epoch": 0.8301886792452831, "grad_norm": 0.039934322237968445, "learning_rate": 1.526852950422226e-05, "loss": 1.0184, "step": 264 }, { "epoch": 0.8333333333333334, "grad_norm": 0.04363315552473068, "learning_rate": 1.4722442884133214e-05, "loss": 0.9912, "step": 265 }, { "epoch": 0.8364779874213837, "grad_norm": 0.04497281834483147, "learning_rate": 1.4185523646469822e-05, "loss": 1.0578, "step": 266 }, { "epoch": 0.839622641509434, "grad_norm": 0.03638835996389389, "learning_rate": 1.3657829510333654e-05, "loss": 1.0259, "step": 267 }, { "epoch": 0.8427672955974843, "grad_norm": 0.0390971377491951, "learning_rate": 1.3139417203123027e-05, "loss": 1.0188, "step": 268 }, { "epoch": 0.8459119496855346, "grad_norm": 0.036897242069244385, "learning_rate": 1.263034245443473e-05, "loss": 1.0333, "step": 269 }, { "epoch": 0.8490566037735849, "grad_norm": 0.037717305123806, "learning_rate": 1.2130659990073146e-05, "loss": 1.0319, "step": 270 }, { "epoch": 0.8522012578616353, "grad_norm": 0.038260139524936676, "learning_rate": 1.1640423526166988e-05, "loss": 1.0063, "step": 271 }, { "epoch": 0.8553459119496856, "grad_norm": 0.04040497913956642, "learning_rate": 1.1159685763395111e-05, "loss": 1.01, "step": 272 }, { "epoch": 0.8584905660377359, "grad_norm": 0.036462146788835526, "learning_rate": 1.0688498381320855e-05, "loss": 1.0137, "step": 273 }, { "epoch": 0.8616352201257862, "grad_norm": 0.03783508017659187, "learning_rate": 1.0226912032836611e-05, "loss": 1.01, "step": 274 }, { "epoch": 0.8647798742138365, "grad_norm": 0.036553751677274704, "learning_rate": 9.774976338718677e-06, "loss": 1.035, "step": 275 }, { "epoch": 0.8679245283018868, "grad_norm": 0.038083869963884354, "learning_rate": 9.332739882292752e-06, "loss": 1.0514, "step": 276 }, { "epoch": 0.8710691823899371, "grad_norm": 0.036774635314941406, "learning_rate": 8.900250204211514e-06, "loss": 1.0211, "step": 277 }, { "epoch": 0.8742138364779874, "grad_norm": 0.038534294813871384, "learning_rate": 8.47755379734373e-06, "loss": 1.011, "step": 278 }, { "epoch": 0.8773584905660378, "grad_norm": 0.036409780383110046, "learning_rate": 8.064696101776358e-06, "loss": 1.0247, "step": 279 }, { "epoch": 0.8805031446540881, "grad_norm": 0.04032037407159805, "learning_rate": 7.661721499929753e-06, "loss": 1.0205, "step": 280 }, { "epoch": 0.8836477987421384, "grad_norm": 0.03601597249507904, "learning_rate": 7.2686733117863784e-06, "loss": 1.0278, "step": 281 }, { "epoch": 0.8867924528301887, "grad_norm": 0.03768506646156311, "learning_rate": 6.8855937902340576e-06, "loss": 1.0256, "step": 282 }, { "epoch": 0.889937106918239, "grad_norm": 0.0377877801656723, "learning_rate": 6.512524116523633e-06, "loss": 1.0238, "step": 283 }, { "epoch": 0.8930817610062893, "grad_norm": 0.038199830800294876, "learning_rate": 6.149504395842087e-06, "loss": 1.0335, "step": 284 }, { "epoch": 0.8962264150943396, "grad_norm": 0.03672681748867035, "learning_rate": 5.7965736530010916e-06, "loss": 1.0089, "step": 285 }, { "epoch": 0.89937106918239, "grad_norm": 0.03878109157085419, "learning_rate": 5.453769828241872e-06, "loss": 1.0007, "step": 286 }, { "epoch": 0.9025157232704403, "grad_norm": 0.03794073313474655, "learning_rate": 5.121129773156663e-06, "loss": 1.0507, "step": 287 }, { "epoch": 0.9056603773584906, "grad_norm": 0.03933648765087128, "learning_rate": 4.798689246727006e-06, "loss": 1.0266, "step": 288 }, { "epoch": 0.9088050314465409, "grad_norm": 0.03670027107000351, "learning_rate": 4.486482911479839e-06, "loss": 1.0367, "step": 289 }, { "epoch": 0.9119496855345912, "grad_norm": 0.037638451904058456, "learning_rate": 4.184544329761009e-06, "loss": 1.0401, "step": 290 }, { "epoch": 0.9150943396226415, "grad_norm": 0.03804009407758713, "learning_rate": 3.892905960127546e-06, "loss": 0.9959, "step": 291 }, { "epoch": 0.9182389937106918, "grad_norm": 0.04068181291222572, "learning_rate": 3.611599153858214e-06, "loss": 1.0631, "step": 292 }, { "epoch": 0.9213836477987422, "grad_norm": 0.036831971257925034, "learning_rate": 3.3406541515832003e-06, "loss": 1.0072, "step": 293 }, { "epoch": 0.9245283018867925, "grad_norm": 0.03732535243034363, "learning_rate": 3.0801000800333877e-06, "loss": 1.0091, "step": 294 }, { "epoch": 0.9276729559748428, "grad_norm": 0.03764468804001808, "learning_rate": 2.8299649489090475e-06, "loss": 1.03, "step": 295 }, { "epoch": 0.9308176100628931, "grad_norm": 0.03870733082294464, "learning_rate": 2.590275647868867e-06, "loss": 1.0281, "step": 296 }, { "epoch": 0.9339622641509434, "grad_norm": 0.03789420798420906, "learning_rate": 2.3610579436393e-06, "loss": 1.0479, "step": 297 }, { "epoch": 0.9371069182389937, "grad_norm": 0.039737775921821594, "learning_rate": 2.1423364772445887e-06, "loss": 1.0584, "step": 298 }, { "epoch": 0.940251572327044, "grad_norm": 0.03715479001402855, "learning_rate": 1.9341347613579087e-06, "loss": 1.0168, "step": 299 }, { "epoch": 0.9433962264150944, "grad_norm": 0.03719014674425125, "learning_rate": 1.7364751777736332e-06, "loss": 0.9956, "step": 300 }, { "epoch": 0.9465408805031447, "grad_norm": 0.03842631354928017, "learning_rate": 1.5493789750014031e-06, "loss": 1.0437, "step": 301 }, { "epoch": 0.949685534591195, "grad_norm": 0.038147032260894775, "learning_rate": 1.3728662659818204e-06, "loss": 1.033, "step": 302 }, { "epoch": 0.9528301886792453, "grad_norm": 0.038387030363082886, "learning_rate": 1.2069560259243328e-06, "loss": 1.0002, "step": 303 }, { "epoch": 0.9559748427672956, "grad_norm": 0.03752262517809868, "learning_rate": 1.0516660902673448e-06, "loss": 1.0258, "step": 304 }, { "epoch": 0.9591194968553459, "grad_norm": 0.03641341254115105, "learning_rate": 9.070131527609604e-07, "loss": 1.0259, "step": 305 }, { "epoch": 0.9622641509433962, "grad_norm": 0.03814227133989334, "learning_rate": 7.730127636723539e-07, "loss": 0.9939, "step": 306 }, { "epoch": 0.9654088050314465, "grad_norm": 0.03757226839661598, "learning_rate": 6.496793281141056e-07, "loss": 0.988, "step": 307 }, { "epoch": 0.9685534591194969, "grad_norm": 0.03574439138174057, "learning_rate": 5.370261044956971e-07, "loss": 1.0257, "step": 308 }, { "epoch": 0.9716981132075472, "grad_norm": 0.03881550952792168, "learning_rate": 4.3506520309813947e-07, "loss": 1.0399, "step": 309 }, { "epoch": 0.9748427672955975, "grad_norm": 0.03827887400984764, "learning_rate": 3.4380758477219333e-07, "loss": 1.0163, "step": 310 }, { "epoch": 0.9779874213836478, "grad_norm": 0.03611140325665474, "learning_rate": 2.6326305976001055e-07, "loss": 1.0014, "step": 311 }, { "epoch": 0.9811320754716981, "grad_norm": 0.037388019263744354, "learning_rate": 1.9344028664056713e-07, "loss": 1.0223, "step": 312 }, { "epoch": 0.9842767295597484, "grad_norm": 0.041428446769714355, "learning_rate": 1.3434677139885222e-07, "loss": 1.0327, "step": 313 }, { "epoch": 0.9874213836477987, "grad_norm": 0.036151085048913956, "learning_rate": 8.598886661895788e-08, "loss": 1.0306, "step": 314 }, { "epoch": 0.9905660377358491, "grad_norm": 0.03780834376811981, "learning_rate": 4.837177080119215e-08, "loss": 1.0196, "step": 315 }, { "epoch": 0.9937106918238994, "grad_norm": 0.039815668016672134, "learning_rate": 2.1499527803214846e-08, "loss": 1.045, "step": 316 }, { "epoch": 0.9968553459119497, "grad_norm": 0.03660481423139572, "learning_rate": 5.375026405352035e-09, "loss": 1.016, "step": 317 }, { "epoch": 1.0, "grad_norm": 0.03775152564048767, "learning_rate": 0.0, "loss": 1.015, "step": 318 }, { "epoch": 1.0, "eval_loss": 1.0362061262130737, "eval_runtime": 856.4472, "eval_samples_per_second": 29.006, "eval_steps_per_second": 3.627, "step": 318 } ], "logging_steps": 1, "max_steps": 318, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9003005260988416e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }