{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20329335230737955, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004065867046147591, "grad_norm": 0.22144322097301483, "learning_rate": 0.0, "loss": 1.3598, "step": 1 }, { "epoch": 0.0008131734092295182, "grad_norm": 0.199473574757576, "learning_rate": 4e-05, "loss": 1.405, "step": 2 }, { "epoch": 0.0012197601138442774, "grad_norm": 0.20758001506328583, "learning_rate": 8e-05, "loss": 1.2815, "step": 3 }, { "epoch": 0.0016263468184590363, "grad_norm": 0.21362783014774323, "learning_rate": 0.00012, "loss": 1.245, "step": 4 }, { "epoch": 0.0020329335230737954, "grad_norm": 0.24631692469120026, "learning_rate": 0.00016, "loss": 1.3086, "step": 5 }, { "epoch": 0.002439520227688555, "grad_norm": 0.20009225606918335, "learning_rate": 0.0002, "loss": 1.2443, "step": 6 }, { "epoch": 0.0028461069323033137, "grad_norm": 0.1735246330499649, "learning_rate": 0.00019995929167514756, "loss": 1.1878, "step": 7 }, { "epoch": 0.0032526936369180726, "grad_norm": 0.18904437124729156, "learning_rate": 0.00019991858335029514, "loss": 1.2478, "step": 8 }, { "epoch": 0.003659280341532832, "grad_norm": 0.1645248979330063, "learning_rate": 0.0001998778750254427, "loss": 1.2098, "step": 9 }, { "epoch": 0.004065867046147591, "grad_norm": 0.22034819424152374, "learning_rate": 0.00019983716670059028, "loss": 1.1183, "step": 10 }, { "epoch": 0.00447245375076235, "grad_norm": 0.3233634829521179, "learning_rate": 0.00019979645837573783, "loss": 1.0974, "step": 11 }, { "epoch": 0.00487904045537711, "grad_norm": 0.2592090368270874, "learning_rate": 0.00019975575005088542, "loss": 1.1611, "step": 12 }, { "epoch": 0.005285627159991868, "grad_norm": 0.14754348993301392, "learning_rate": 0.000199715041726033, "loss": 1.1932, "step": 13 }, { "epoch": 0.005692213864606627, "grad_norm": 0.09341374039649963, "learning_rate": 0.00019967433340118055, "loss": 1.348, "step": 14 }, { "epoch": 0.006098800569221387, "grad_norm": 0.10229193419218063, "learning_rate": 0.00019963362507632813, "loss": 1.0927, "step": 15 }, { "epoch": 0.006505387273836145, "grad_norm": 0.14015386998653412, "learning_rate": 0.00019959291675147569, "loss": 1.2263, "step": 16 }, { "epoch": 0.006911973978450905, "grad_norm": 0.17507047951221466, "learning_rate": 0.00019955220842662327, "loss": 1.1951, "step": 17 }, { "epoch": 0.007318560683065664, "grad_norm": 0.17176274955272675, "learning_rate": 0.00019951150010177082, "loss": 1.1895, "step": 18 }, { "epoch": 0.007725147387680423, "grad_norm": 0.13839803636074066, "learning_rate": 0.00019947079177691838, "loss": 0.9549, "step": 19 }, { "epoch": 0.008131734092295182, "grad_norm": 0.0970696285367012, "learning_rate": 0.00019943008345206596, "loss": 1.0867, "step": 20 }, { "epoch": 0.008538320796909941, "grad_norm": 0.08836886286735535, "learning_rate": 0.0001993893751272135, "loss": 1.155, "step": 21 }, { "epoch": 0.0089449075015247, "grad_norm": 0.11885025352239609, "learning_rate": 0.0001993486668023611, "loss": 1.1231, "step": 22 }, { "epoch": 0.00935149420613946, "grad_norm": 0.15120816230773926, "learning_rate": 0.00019930795847750865, "loss": 1.1078, "step": 23 }, { "epoch": 0.00975808091075422, "grad_norm": 0.16326424479484558, "learning_rate": 0.00019926725015265623, "loss": 1.079, "step": 24 }, { "epoch": 0.010164667615368977, "grad_norm": 0.1179085448384285, "learning_rate": 0.0001992265418278038, "loss": 0.932, "step": 25 }, { "epoch": 0.010571254319983736, "grad_norm": 0.10621985793113708, "learning_rate": 0.00019918583350295136, "loss": 1.1386, "step": 26 }, { "epoch": 0.010977841024598495, "grad_norm": 0.08408638089895248, "learning_rate": 0.00019914512517809894, "loss": 1.0987, "step": 27 }, { "epoch": 0.011384427729213255, "grad_norm": 0.08222135156393051, "learning_rate": 0.0001991044168532465, "loss": 1.0378, "step": 28 }, { "epoch": 0.011791014433828014, "grad_norm": 0.08763129264116287, "learning_rate": 0.00019906370852839408, "loss": 0.983, "step": 29 }, { "epoch": 0.012197601138442773, "grad_norm": 0.10638878494501114, "learning_rate": 0.00019902300020354163, "loss": 1.0258, "step": 30 }, { "epoch": 0.012604187843057533, "grad_norm": 0.10155023634433746, "learning_rate": 0.0001989822918786892, "loss": 0.9579, "step": 31 }, { "epoch": 0.01301077454767229, "grad_norm": 0.08844579011201859, "learning_rate": 0.00019894158355383677, "loss": 1.1007, "step": 32 }, { "epoch": 0.01341736125228705, "grad_norm": 0.10394158959388733, "learning_rate": 0.00019890087522898432, "loss": 1.0459, "step": 33 }, { "epoch": 0.01382394795690181, "grad_norm": 0.08938682824373245, "learning_rate": 0.0001988601669041319, "loss": 1.0985, "step": 34 }, { "epoch": 0.014230534661516568, "grad_norm": 0.08639086782932281, "learning_rate": 0.00019881945857927948, "loss": 1.0712, "step": 35 }, { "epoch": 0.014637121366131328, "grad_norm": 0.08568435162305832, "learning_rate": 0.00019877875025442704, "loss": 1.0549, "step": 36 }, { "epoch": 0.015043708070746087, "grad_norm": 0.0859316885471344, "learning_rate": 0.00019873804192957462, "loss": 1.1042, "step": 37 }, { "epoch": 0.015450294775360847, "grad_norm": 0.09534381330013275, "learning_rate": 0.00019869733360472217, "loss": 1.0127, "step": 38 }, { "epoch": 0.015856881479975604, "grad_norm": 0.09103580564260483, "learning_rate": 0.00019865662527986976, "loss": 0.9347, "step": 39 }, { "epoch": 0.016263468184590364, "grad_norm": 0.0928095132112503, "learning_rate": 0.0001986159169550173, "loss": 1.0559, "step": 40 }, { "epoch": 0.016670054889205123, "grad_norm": 0.09370871633291245, "learning_rate": 0.0001985752086301649, "loss": 1.1473, "step": 41 }, { "epoch": 0.017076641593819882, "grad_norm": 0.07691123336553574, "learning_rate": 0.00019853450030531244, "loss": 1.0128, "step": 42 }, { "epoch": 0.01748322829843464, "grad_norm": 0.09201047569513321, "learning_rate": 0.00019849379198046, "loss": 1.1296, "step": 43 }, { "epoch": 0.0178898150030494, "grad_norm": 0.08490074425935745, "learning_rate": 0.00019845308365560758, "loss": 1.0444, "step": 44 }, { "epoch": 0.01829640170766416, "grad_norm": 0.08623114228248596, "learning_rate": 0.00019841237533075513, "loss": 1.066, "step": 45 }, { "epoch": 0.01870298841227892, "grad_norm": 0.09486474096775055, "learning_rate": 0.00019837166700590271, "loss": 1.0788, "step": 46 }, { "epoch": 0.01910957511689368, "grad_norm": 0.08024484664201736, "learning_rate": 0.0001983309586810503, "loss": 1.0262, "step": 47 }, { "epoch": 0.01951616182150844, "grad_norm": 0.09256327897310257, "learning_rate": 0.00019829025035619785, "loss": 1.107, "step": 48 }, { "epoch": 0.019922748526123194, "grad_norm": 0.09877921640872955, "learning_rate": 0.00019824954203134543, "loss": 1.1731, "step": 49 }, { "epoch": 0.020329335230737954, "grad_norm": 0.08699575811624527, "learning_rate": 0.00019820883370649299, "loss": 1.0809, "step": 50 }, { "epoch": 0.020735921935352713, "grad_norm": 0.089649498462677, "learning_rate": 0.00019816812538164057, "loss": 1.1564, "step": 51 }, { "epoch": 0.021142508639967472, "grad_norm": 0.08757214993238449, "learning_rate": 0.00019812741705678812, "loss": 1.0272, "step": 52 }, { "epoch": 0.02154909534458223, "grad_norm": 0.08320939540863037, "learning_rate": 0.0001980867087319357, "loss": 0.9931, "step": 53 }, { "epoch": 0.02195568204919699, "grad_norm": 0.08898070454597473, "learning_rate": 0.00019804600040708326, "loss": 0.9421, "step": 54 }, { "epoch": 0.02236226875381175, "grad_norm": 0.08072236180305481, "learning_rate": 0.0001980052920822308, "loss": 1.0304, "step": 55 }, { "epoch": 0.02276885545842651, "grad_norm": 0.09354112297296524, "learning_rate": 0.0001979645837573784, "loss": 1.1041, "step": 56 }, { "epoch": 0.02317544216304127, "grad_norm": 0.09214304387569427, "learning_rate": 0.00019792387543252595, "loss": 1.0666, "step": 57 }, { "epoch": 0.02358202886765603, "grad_norm": 0.08546210825443268, "learning_rate": 0.00019788316710767353, "loss": 1.0795, "step": 58 }, { "epoch": 0.023988615572270788, "grad_norm": 0.09029046446084976, "learning_rate": 0.0001978424587828211, "loss": 1.199, "step": 59 }, { "epoch": 0.024395202276885547, "grad_norm": 0.08200937509536743, "learning_rate": 0.00019780175045796866, "loss": 0.9853, "step": 60 }, { "epoch": 0.024801788981500306, "grad_norm": 0.08928566426038742, "learning_rate": 0.00019776104213311624, "loss": 0.9948, "step": 61 }, { "epoch": 0.025208375686115066, "grad_norm": 0.08067034929990768, "learning_rate": 0.0001977203338082638, "loss": 0.9824, "step": 62 }, { "epoch": 0.02561496239072982, "grad_norm": 0.07509499788284302, "learning_rate": 0.00019767962548341138, "loss": 0.9166, "step": 63 }, { "epoch": 0.02602154909534458, "grad_norm": 0.10127029567956924, "learning_rate": 0.00019763891715855893, "loss": 0.978, "step": 64 }, { "epoch": 0.02642813579995934, "grad_norm": 0.08480218052864075, "learning_rate": 0.0001975982088337065, "loss": 1.0019, "step": 65 }, { "epoch": 0.0268347225045741, "grad_norm": 0.0922696441411972, "learning_rate": 0.00019755750050885407, "loss": 1.0213, "step": 66 }, { "epoch": 0.02724130920918886, "grad_norm": 0.0819278433918953, "learning_rate": 0.00019751679218400162, "loss": 0.9792, "step": 67 }, { "epoch": 0.02764789591380362, "grad_norm": 0.09971120208501816, "learning_rate": 0.0001974760838591492, "loss": 0.9605, "step": 68 }, { "epoch": 0.028054482618418378, "grad_norm": 0.09195531904697418, "learning_rate": 0.00019743537553429676, "loss": 1.1203, "step": 69 }, { "epoch": 0.028461069323033137, "grad_norm": 0.09179981052875519, "learning_rate": 0.00019739466720944434, "loss": 1.0586, "step": 70 }, { "epoch": 0.028867656027647896, "grad_norm": 0.0866156816482544, "learning_rate": 0.00019735395888459192, "loss": 1.0558, "step": 71 }, { "epoch": 0.029274242732262656, "grad_norm": 0.09198956191539764, "learning_rate": 0.00019731325055973947, "loss": 1.117, "step": 72 }, { "epoch": 0.029680829436877415, "grad_norm": 0.0912180244922638, "learning_rate": 0.00019727254223488705, "loss": 1.0235, "step": 73 }, { "epoch": 0.030087416141492174, "grad_norm": 0.092186838388443, "learning_rate": 0.0001972318339100346, "loss": 1.0119, "step": 74 }, { "epoch": 0.030494002846106934, "grad_norm": 0.091013602912426, "learning_rate": 0.0001971911255851822, "loss": 1.0523, "step": 75 }, { "epoch": 0.030900589550721693, "grad_norm": 0.0932595282793045, "learning_rate": 0.00019715041726032974, "loss": 1.0471, "step": 76 }, { "epoch": 0.03130717625533645, "grad_norm": 0.089345782995224, "learning_rate": 0.0001971097089354773, "loss": 1.0214, "step": 77 }, { "epoch": 0.03171376295995121, "grad_norm": 0.09476006776094437, "learning_rate": 0.00019706900061062488, "loss": 0.9888, "step": 78 }, { "epoch": 0.03212034966456597, "grad_norm": 0.09379832446575165, "learning_rate": 0.00019702829228577243, "loss": 1.1039, "step": 79 }, { "epoch": 0.03252693636918073, "grad_norm": 0.10659569501876831, "learning_rate": 0.00019698758396092001, "loss": 1.1377, "step": 80 }, { "epoch": 0.03293352307379549, "grad_norm": 0.09652398526668549, "learning_rate": 0.0001969468756360676, "loss": 1.0194, "step": 81 }, { "epoch": 0.033340109778410246, "grad_norm": 0.08641666918992996, "learning_rate": 0.00019690616731121515, "loss": 1.0239, "step": 82 }, { "epoch": 0.03374669648302501, "grad_norm": 0.0956072062253952, "learning_rate": 0.00019686545898636273, "loss": 1.032, "step": 83 }, { "epoch": 0.034153283187639764, "grad_norm": 0.08402691036462784, "learning_rate": 0.00019682475066151029, "loss": 0.9802, "step": 84 }, { "epoch": 0.03455986989225452, "grad_norm": 0.08827648311853409, "learning_rate": 0.00019678404233665787, "loss": 1.1805, "step": 85 }, { "epoch": 0.03496645659686928, "grad_norm": 0.08757660537958145, "learning_rate": 0.00019674333401180542, "loss": 0.952, "step": 86 }, { "epoch": 0.03537304330148404, "grad_norm": 0.09728538244962692, "learning_rate": 0.000196702625686953, "loss": 1.0875, "step": 87 }, { "epoch": 0.0357796300060988, "grad_norm": 0.08561044931411743, "learning_rate": 0.00019666191736210056, "loss": 0.9818, "step": 88 }, { "epoch": 0.03618621671071356, "grad_norm": 0.08389468491077423, "learning_rate": 0.0001966212090372481, "loss": 0.9962, "step": 89 }, { "epoch": 0.03659280341532832, "grad_norm": 0.08847957849502563, "learning_rate": 0.0001965805007123957, "loss": 1.0138, "step": 90 }, { "epoch": 0.036999390119943076, "grad_norm": 0.08515489101409912, "learning_rate": 0.00019653979238754324, "loss": 1.0119, "step": 91 }, { "epoch": 0.03740597682455784, "grad_norm": 0.09340325742959976, "learning_rate": 0.00019649908406269083, "loss": 1.0635, "step": 92 }, { "epoch": 0.037812563529172595, "grad_norm": 0.09383916854858398, "learning_rate": 0.0001964583757378384, "loss": 1.0999, "step": 93 }, { "epoch": 0.03821915023378736, "grad_norm": 0.09956547617912292, "learning_rate": 0.00019641766741298596, "loss": 1.0186, "step": 94 }, { "epoch": 0.038625736938402114, "grad_norm": 0.09809234738349915, "learning_rate": 0.00019637695908813354, "loss": 1.0641, "step": 95 }, { "epoch": 0.03903232364301688, "grad_norm": 0.08520065993070602, "learning_rate": 0.0001963362507632811, "loss": 0.9255, "step": 96 }, { "epoch": 0.03943891034763163, "grad_norm": 0.09007880836725235, "learning_rate": 0.00019629554243842868, "loss": 1.0963, "step": 97 }, { "epoch": 0.03984549705224639, "grad_norm": 0.08900373429059982, "learning_rate": 0.00019625483411357623, "loss": 0.9908, "step": 98 }, { "epoch": 0.04025208375686115, "grad_norm": 0.09613076597452164, "learning_rate": 0.0001962141257887238, "loss": 0.9729, "step": 99 }, { "epoch": 0.04065867046147591, "grad_norm": 0.09987878054380417, "learning_rate": 0.00019617341746387137, "loss": 1.0554, "step": 100 }, { "epoch": 0.04106525716609067, "grad_norm": 0.10209144651889801, "learning_rate": 0.00019613270913901892, "loss": 1.1162, "step": 101 }, { "epoch": 0.041471843870705426, "grad_norm": 0.10085388273000717, "learning_rate": 0.0001960920008141665, "loss": 1.1355, "step": 102 }, { "epoch": 0.04187843057532019, "grad_norm": 0.08966121822595596, "learning_rate": 0.00019605129248931406, "loss": 0.9275, "step": 103 }, { "epoch": 0.042285017279934944, "grad_norm": 0.10507562756538391, "learning_rate": 0.00019601058416446166, "loss": 1.081, "step": 104 }, { "epoch": 0.04269160398454971, "grad_norm": 0.09719648957252502, "learning_rate": 0.00019596987583960922, "loss": 1.0884, "step": 105 }, { "epoch": 0.04309819068916446, "grad_norm": 0.09457529336214066, "learning_rate": 0.00019592916751475677, "loss": 1.0413, "step": 106 }, { "epoch": 0.043504777393779226, "grad_norm": 0.11330179125070572, "learning_rate": 0.00019588845918990435, "loss": 1.0937, "step": 107 }, { "epoch": 0.04391136409839398, "grad_norm": 0.09778840839862823, "learning_rate": 0.0001958477508650519, "loss": 1.1316, "step": 108 }, { "epoch": 0.044317950803008745, "grad_norm": 0.09848835319280624, "learning_rate": 0.0001958070425401995, "loss": 1.1244, "step": 109 }, { "epoch": 0.0447245375076235, "grad_norm": 0.0965428277850151, "learning_rate": 0.00019576633421534704, "loss": 0.9952, "step": 110 }, { "epoch": 0.045131124212238256, "grad_norm": 0.0857444629073143, "learning_rate": 0.00019572562589049462, "loss": 0.9822, "step": 111 }, { "epoch": 0.04553771091685302, "grad_norm": 0.10461942851543427, "learning_rate": 0.00019568491756564218, "loss": 1.1463, "step": 112 }, { "epoch": 0.045944297621467775, "grad_norm": 0.08575154095888138, "learning_rate": 0.00019564420924078973, "loss": 0.8976, "step": 113 }, { "epoch": 0.04635088432608254, "grad_norm": 0.0948256254196167, "learning_rate": 0.00019560350091593731, "loss": 1.1205, "step": 114 }, { "epoch": 0.046757471030697294, "grad_norm": 0.09214090555906296, "learning_rate": 0.00019556279259108487, "loss": 1.1416, "step": 115 }, { "epoch": 0.04716405773531206, "grad_norm": 0.09885852038860321, "learning_rate": 0.00019552208426623248, "loss": 1.079, "step": 116 }, { "epoch": 0.04757064443992681, "grad_norm": 0.09071148931980133, "learning_rate": 0.00019548137594138003, "loss": 1.0128, "step": 117 }, { "epoch": 0.047977231144541575, "grad_norm": 0.09190430492162704, "learning_rate": 0.00019544066761652758, "loss": 0.9631, "step": 118 }, { "epoch": 0.04838381784915633, "grad_norm": 0.08024870604276657, "learning_rate": 0.00019539995929167517, "loss": 0.9086, "step": 119 }, { "epoch": 0.048790404553771094, "grad_norm": 0.09223239868879318, "learning_rate": 0.00019535925096682272, "loss": 1.0255, "step": 120 }, { "epoch": 0.04919699125838585, "grad_norm": 0.09259685128927231, "learning_rate": 0.0001953185426419703, "loss": 1.0221, "step": 121 }, { "epoch": 0.04960357796300061, "grad_norm": 0.08371948450803757, "learning_rate": 0.00019527783431711786, "loss": 0.966, "step": 122 }, { "epoch": 0.05001016466761537, "grad_norm": 0.0957912728190422, "learning_rate": 0.00019523712599226544, "loss": 1.0919, "step": 123 }, { "epoch": 0.05041675137223013, "grad_norm": 0.09397678077220917, "learning_rate": 0.000195196417667413, "loss": 0.9666, "step": 124 }, { "epoch": 0.05082333807684489, "grad_norm": 0.1014254167675972, "learning_rate": 0.00019515570934256054, "loss": 0.9321, "step": 125 }, { "epoch": 0.05122992478145964, "grad_norm": 0.09339801222085953, "learning_rate": 0.00019511500101770813, "loss": 1.0487, "step": 126 }, { "epoch": 0.051636511486074406, "grad_norm": 0.08642175793647766, "learning_rate": 0.0001950742926928557, "loss": 1.0606, "step": 127 }, { "epoch": 0.05204309819068916, "grad_norm": 0.09092641621828079, "learning_rate": 0.0001950335843680033, "loss": 0.904, "step": 128 }, { "epoch": 0.052449684895303925, "grad_norm": 0.09896791726350784, "learning_rate": 0.00019499287604315084, "loss": 1.0325, "step": 129 }, { "epoch": 0.05285627159991868, "grad_norm": 0.08731307834386826, "learning_rate": 0.0001949521677182984, "loss": 0.9258, "step": 130 }, { "epoch": 0.05326285830453344, "grad_norm": 0.09673330187797546, "learning_rate": 0.00019491145939344598, "loss": 1.1198, "step": 131 }, { "epoch": 0.0536694450091482, "grad_norm": 0.09038975089788437, "learning_rate": 0.00019487075106859353, "loss": 1.0295, "step": 132 }, { "epoch": 0.05407603171376296, "grad_norm": 0.0918399840593338, "learning_rate": 0.0001948300427437411, "loss": 1.0127, "step": 133 }, { "epoch": 0.05448261841837772, "grad_norm": 0.08970967680215836, "learning_rate": 0.00019478933441888867, "loss": 1.0238, "step": 134 }, { "epoch": 0.05488920512299248, "grad_norm": 0.09728217124938965, "learning_rate": 0.00019474862609403625, "loss": 1.069, "step": 135 }, { "epoch": 0.05529579182760724, "grad_norm": 0.10240956395864487, "learning_rate": 0.0001947079177691838, "loss": 1.1467, "step": 136 }, { "epoch": 0.055702378532222, "grad_norm": 0.10397852212190628, "learning_rate": 0.00019466720944433136, "loss": 1.0415, "step": 137 }, { "epoch": 0.056108965236836755, "grad_norm": 0.10451675951480865, "learning_rate": 0.00019462650111947894, "loss": 1.0309, "step": 138 }, { "epoch": 0.05651555194145151, "grad_norm": 0.09685720503330231, "learning_rate": 0.00019458579279462652, "loss": 1.11, "step": 139 }, { "epoch": 0.056922138646066274, "grad_norm": 0.09885822236537933, "learning_rate": 0.00019454508446977407, "loss": 0.993, "step": 140 }, { "epoch": 0.05732872535068103, "grad_norm": 0.10943586379289627, "learning_rate": 0.00019450437614492165, "loss": 0.9749, "step": 141 }, { "epoch": 0.05773531205529579, "grad_norm": 0.10964591801166534, "learning_rate": 0.0001944636678200692, "loss": 1.1108, "step": 142 }, { "epoch": 0.05814189875991055, "grad_norm": 0.10109028965234756, "learning_rate": 0.0001944229594952168, "loss": 1.0897, "step": 143 }, { "epoch": 0.05854848546452531, "grad_norm": 0.11243695765733719, "learning_rate": 0.00019438225117036434, "loss": 1.0338, "step": 144 }, { "epoch": 0.05895507216914007, "grad_norm": 0.1047658622264862, "learning_rate": 0.00019434154284551192, "loss": 0.9566, "step": 145 }, { "epoch": 0.05936165887375483, "grad_norm": 0.09534204006195068, "learning_rate": 0.00019430083452065948, "loss": 1.0313, "step": 146 }, { "epoch": 0.059768245578369586, "grad_norm": 0.10418044775724411, "learning_rate": 0.00019426012619580706, "loss": 0.9759, "step": 147 }, { "epoch": 0.06017483228298435, "grad_norm": 0.10020595043897629, "learning_rate": 0.00019421941787095461, "loss": 0.9368, "step": 148 }, { "epoch": 0.060581418987599105, "grad_norm": 0.09832129627466202, "learning_rate": 0.00019417870954610217, "loss": 1.0494, "step": 149 }, { "epoch": 0.06098800569221387, "grad_norm": 0.09458506107330322, "learning_rate": 0.00019413800122124978, "loss": 0.9631, "step": 150 }, { "epoch": 0.06139459239682862, "grad_norm": 0.10380101203918457, "learning_rate": 0.00019409729289639733, "loss": 1.1003, "step": 151 }, { "epoch": 0.061801179101443386, "grad_norm": 0.107131227850914, "learning_rate": 0.00019405658457154488, "loss": 1.0819, "step": 152 }, { "epoch": 0.06220776580605814, "grad_norm": 0.10330741852521896, "learning_rate": 0.00019401587624669247, "loss": 1.128, "step": 153 }, { "epoch": 0.0626143525106729, "grad_norm": 0.08829359710216522, "learning_rate": 0.00019397516792184002, "loss": 0.8754, "step": 154 }, { "epoch": 0.06302093921528766, "grad_norm": 0.10422427207231522, "learning_rate": 0.0001939344595969876, "loss": 0.9633, "step": 155 }, { "epoch": 0.06342752591990242, "grad_norm": 0.11499015986919403, "learning_rate": 0.00019389375127213515, "loss": 0.9735, "step": 156 }, { "epoch": 0.06383411262451717, "grad_norm": 0.0938427522778511, "learning_rate": 0.00019385304294728274, "loss": 0.9219, "step": 157 }, { "epoch": 0.06424069932913194, "grad_norm": 0.1080261766910553, "learning_rate": 0.0001938123346224303, "loss": 0.9678, "step": 158 }, { "epoch": 0.0646472860337467, "grad_norm": 0.10001271218061447, "learning_rate": 0.00019377162629757784, "loss": 1.0854, "step": 159 }, { "epoch": 0.06505387273836145, "grad_norm": 0.10731212794780731, "learning_rate": 0.00019373091797272543, "loss": 1.0108, "step": 160 }, { "epoch": 0.06546045944297621, "grad_norm": 0.10019373893737793, "learning_rate": 0.00019369020964787298, "loss": 1.0315, "step": 161 }, { "epoch": 0.06586704614759098, "grad_norm": 0.0947297066450119, "learning_rate": 0.0001936495013230206, "loss": 1.0634, "step": 162 }, { "epoch": 0.06627363285220574, "grad_norm": 0.12204254418611526, "learning_rate": 0.00019360879299816814, "loss": 1.0635, "step": 163 }, { "epoch": 0.06668021955682049, "grad_norm": 0.10462553054094315, "learning_rate": 0.0001935680846733157, "loss": 1.0248, "step": 164 }, { "epoch": 0.06708680626143525, "grad_norm": 0.09576130658388138, "learning_rate": 0.00019352737634846328, "loss": 0.9671, "step": 165 }, { "epoch": 0.06749339296605002, "grad_norm": 0.10027123987674713, "learning_rate": 0.00019348666802361083, "loss": 0.9317, "step": 166 }, { "epoch": 0.06789997967066477, "grad_norm": 0.10674256086349487, "learning_rate": 0.0001934459596987584, "loss": 1.0058, "step": 167 }, { "epoch": 0.06830656637527953, "grad_norm": 0.12352320551872253, "learning_rate": 0.00019340525137390597, "loss": 1.0926, "step": 168 }, { "epoch": 0.06871315307989428, "grad_norm": 0.09426864236593246, "learning_rate": 0.00019336454304905355, "loss": 1.0876, "step": 169 }, { "epoch": 0.06911973978450904, "grad_norm": 0.09280996024608612, "learning_rate": 0.0001933238347242011, "loss": 0.977, "step": 170 }, { "epoch": 0.06952632648912381, "grad_norm": 0.11547420918941498, "learning_rate": 0.00019328312639934866, "loss": 1.0598, "step": 171 }, { "epoch": 0.06993291319373857, "grad_norm": 0.12538915872573853, "learning_rate": 0.00019324241807449624, "loss": 1.0996, "step": 172 }, { "epoch": 0.07033949989835332, "grad_norm": 0.08110898733139038, "learning_rate": 0.00019320170974964382, "loss": 0.8776, "step": 173 }, { "epoch": 0.07074608660296808, "grad_norm": 0.10475198924541473, "learning_rate": 0.0001931610014247914, "loss": 1.0876, "step": 174 }, { "epoch": 0.07115267330758285, "grad_norm": 0.1095360517501831, "learning_rate": 0.00019312029309993895, "loss": 1.054, "step": 175 }, { "epoch": 0.0715592600121976, "grad_norm": 0.09516473114490509, "learning_rate": 0.0001930795847750865, "loss": 1.0558, "step": 176 }, { "epoch": 0.07196584671681236, "grad_norm": 0.09316466003656387, "learning_rate": 0.0001930388764502341, "loss": 0.9467, "step": 177 }, { "epoch": 0.07237243342142712, "grad_norm": 0.11777061969041824, "learning_rate": 0.00019299816812538164, "loss": 1.1441, "step": 178 }, { "epoch": 0.07277902012604189, "grad_norm": 0.09438811987638474, "learning_rate": 0.00019295745980052922, "loss": 0.9521, "step": 179 }, { "epoch": 0.07318560683065664, "grad_norm": 0.08892639726400375, "learning_rate": 0.00019291675147567678, "loss": 0.9804, "step": 180 }, { "epoch": 0.0735921935352714, "grad_norm": 0.08963356912136078, "learning_rate": 0.00019287604315082436, "loss": 1.0427, "step": 181 }, { "epoch": 0.07399878023988615, "grad_norm": 0.09870661795139313, "learning_rate": 0.0001928353348259719, "loss": 1.051, "step": 182 }, { "epoch": 0.07440536694450091, "grad_norm": 0.11843609809875488, "learning_rate": 0.00019279462650111947, "loss": 1.0109, "step": 183 }, { "epoch": 0.07481195364911568, "grad_norm": 0.08860404789447784, "learning_rate": 0.00019275391817626705, "loss": 1.0035, "step": 184 }, { "epoch": 0.07521854035373043, "grad_norm": 0.09085170924663544, "learning_rate": 0.00019271320985141463, "loss": 0.9461, "step": 185 }, { "epoch": 0.07562512705834519, "grad_norm": 0.09071815758943558, "learning_rate": 0.0001926725015265622, "loss": 0.9542, "step": 186 }, { "epoch": 0.07603171376295995, "grad_norm": 0.09566846489906311, "learning_rate": 0.00019263179320170976, "loss": 0.9958, "step": 187 }, { "epoch": 0.07643830046757472, "grad_norm": 0.11846338212490082, "learning_rate": 0.00019259108487685732, "loss": 1.0737, "step": 188 }, { "epoch": 0.07684488717218947, "grad_norm": 0.09295649081468582, "learning_rate": 0.0001925503765520049, "loss": 1.0162, "step": 189 }, { "epoch": 0.07725147387680423, "grad_norm": 0.0917876660823822, "learning_rate": 0.00019250966822715245, "loss": 1.0432, "step": 190 }, { "epoch": 0.07765806058141898, "grad_norm": 0.10864109545946121, "learning_rate": 0.00019246895990230004, "loss": 1.1107, "step": 191 }, { "epoch": 0.07806464728603375, "grad_norm": 0.09689877927303314, "learning_rate": 0.0001924282515774476, "loss": 1.0421, "step": 192 }, { "epoch": 0.07847123399064851, "grad_norm": 0.09406042098999023, "learning_rate": 0.00019238754325259517, "loss": 1.1042, "step": 193 }, { "epoch": 0.07887782069526326, "grad_norm": 0.08346063643693924, "learning_rate": 0.00019234683492774272, "loss": 0.9554, "step": 194 }, { "epoch": 0.07928440739987802, "grad_norm": 0.10317754745483398, "learning_rate": 0.00019230612660289028, "loss": 1.0835, "step": 195 }, { "epoch": 0.07969099410449278, "grad_norm": 0.08712919056415558, "learning_rate": 0.0001922654182780379, "loss": 0.9799, "step": 196 }, { "epoch": 0.08009758080910755, "grad_norm": 0.0860556811094284, "learning_rate": 0.00019222470995318544, "loss": 0.8661, "step": 197 }, { "epoch": 0.0805041675137223, "grad_norm": 0.07940655201673508, "learning_rate": 0.00019218400162833302, "loss": 0.8305, "step": 198 }, { "epoch": 0.08091075421833706, "grad_norm": 0.09200199693441391, "learning_rate": 0.00019214329330348058, "loss": 0.9774, "step": 199 }, { "epoch": 0.08131734092295181, "grad_norm": 0.09980164468288422, "learning_rate": 0.00019210258497862813, "loss": 0.9791, "step": 200 }, { "epoch": 0.08172392762756658, "grad_norm": 0.09660688042640686, "learning_rate": 0.0001920618766537757, "loss": 1.027, "step": 201 }, { "epoch": 0.08213051433218134, "grad_norm": 0.09518909454345703, "learning_rate": 0.00019202116832892327, "loss": 0.9939, "step": 202 }, { "epoch": 0.0825371010367961, "grad_norm": 0.0886114165186882, "learning_rate": 0.00019198046000407085, "loss": 0.985, "step": 203 }, { "epoch": 0.08294368774141085, "grad_norm": 0.09820783883333206, "learning_rate": 0.0001919397516792184, "loss": 1.0064, "step": 204 }, { "epoch": 0.08335027444602562, "grad_norm": 0.0957496389746666, "learning_rate": 0.00019189904335436598, "loss": 1.1126, "step": 205 }, { "epoch": 0.08375686115064038, "grad_norm": 0.09990067780017853, "learning_rate": 0.00019185833502951354, "loss": 1.1517, "step": 206 }, { "epoch": 0.08416344785525513, "grad_norm": 0.0953991562128067, "learning_rate": 0.0001918176267046611, "loss": 1.087, "step": 207 }, { "epoch": 0.08457003455986989, "grad_norm": 0.10291532427072525, "learning_rate": 0.0001917769183798087, "loss": 1.0366, "step": 208 }, { "epoch": 0.08497662126448464, "grad_norm": 0.09986121207475662, "learning_rate": 0.00019173621005495625, "loss": 0.9581, "step": 209 }, { "epoch": 0.08538320796909941, "grad_norm": 0.09369988739490509, "learning_rate": 0.00019169550173010383, "loss": 1.0048, "step": 210 }, { "epoch": 0.08578979467371417, "grad_norm": 0.0968063622713089, "learning_rate": 0.0001916547934052514, "loss": 1.0005, "step": 211 }, { "epoch": 0.08619638137832893, "grad_norm": 0.11241315305233002, "learning_rate": 0.00019161408508039894, "loss": 1.0316, "step": 212 }, { "epoch": 0.08660296808294368, "grad_norm": 0.09230878949165344, "learning_rate": 0.00019157337675554652, "loss": 0.917, "step": 213 }, { "epoch": 0.08700955478755845, "grad_norm": 0.08461520820856094, "learning_rate": 0.00019153266843069408, "loss": 0.9144, "step": 214 }, { "epoch": 0.08741614149217321, "grad_norm": 0.09011861681938171, "learning_rate": 0.00019149196010584166, "loss": 1.0092, "step": 215 }, { "epoch": 0.08782272819678796, "grad_norm": 0.09200841188430786, "learning_rate": 0.0001914512517809892, "loss": 1.0552, "step": 216 }, { "epoch": 0.08822931490140272, "grad_norm": 0.09052886068820953, "learning_rate": 0.0001914105434561368, "loss": 0.9067, "step": 217 }, { "epoch": 0.08863590160601749, "grad_norm": 0.08740741014480591, "learning_rate": 0.00019136983513128435, "loss": 0.9182, "step": 218 }, { "epoch": 0.08904248831063225, "grad_norm": 0.08494284749031067, "learning_rate": 0.00019132912680643193, "loss": 0.8321, "step": 219 }, { "epoch": 0.089449075015247, "grad_norm": 0.0890796035528183, "learning_rate": 0.0001912884184815795, "loss": 0.9801, "step": 220 }, { "epoch": 0.08985566171986176, "grad_norm": 0.094822458922863, "learning_rate": 0.00019124771015672706, "loss": 0.9779, "step": 221 }, { "epoch": 0.09026224842447651, "grad_norm": 0.09756983071565628, "learning_rate": 0.00019120700183187465, "loss": 1.0385, "step": 222 }, { "epoch": 0.09066883512909128, "grad_norm": 0.09434107691049576, "learning_rate": 0.0001911662935070222, "loss": 1.063, "step": 223 }, { "epoch": 0.09107542183370604, "grad_norm": 0.0925639271736145, "learning_rate": 0.00019112558518216975, "loss": 0.9061, "step": 224 }, { "epoch": 0.0914820085383208, "grad_norm": 0.10531201958656311, "learning_rate": 0.00019108487685731734, "loss": 1.1593, "step": 225 }, { "epoch": 0.09188859524293555, "grad_norm": 0.08259832113981247, "learning_rate": 0.0001910441685324649, "loss": 0.8463, "step": 226 }, { "epoch": 0.09229518194755032, "grad_norm": 431.5063171386719, "learning_rate": 0.00019100346020761247, "loss": 1.0632, "step": 227 }, { "epoch": 0.09270176865216508, "grad_norm": 0.10764740407466888, "learning_rate": 0.00019096275188276002, "loss": 1.0083, "step": 228 }, { "epoch": 0.09310835535677983, "grad_norm": 0.08872029185295105, "learning_rate": 0.0001909220435579076, "loss": 0.9301, "step": 229 }, { "epoch": 0.09351494206139459, "grad_norm": 0.1006346270442009, "learning_rate": 0.00019088133523305516, "loss": 1.0103, "step": 230 }, { "epoch": 0.09392152876600936, "grad_norm": 0.0970514565706253, "learning_rate": 0.00019084062690820274, "loss": 1.0522, "step": 231 }, { "epoch": 0.09432811547062411, "grad_norm": 0.09807727485895157, "learning_rate": 0.00019079991858335032, "loss": 1.0498, "step": 232 }, { "epoch": 0.09473470217523887, "grad_norm": 0.09828022867441177, "learning_rate": 0.00019075921025849788, "loss": 0.9871, "step": 233 }, { "epoch": 0.09514128887985362, "grad_norm": 0.10089042782783508, "learning_rate": 0.00019071850193364543, "loss": 0.977, "step": 234 }, { "epoch": 0.0955478755844684, "grad_norm": 0.09905245155096054, "learning_rate": 0.000190677793608793, "loss": 1.0135, "step": 235 }, { "epoch": 0.09595446228908315, "grad_norm": 0.1002473533153534, "learning_rate": 0.00019063708528394057, "loss": 1.0219, "step": 236 }, { "epoch": 0.0963610489936979, "grad_norm": 0.09028339385986328, "learning_rate": 0.00019059637695908815, "loss": 0.909, "step": 237 }, { "epoch": 0.09676763569831266, "grad_norm": 0.0950377881526947, "learning_rate": 0.0001905556686342357, "loss": 0.9749, "step": 238 }, { "epoch": 0.09717422240292742, "grad_norm": 0.09866049885749817, "learning_rate": 0.00019051496030938328, "loss": 1.0927, "step": 239 }, { "epoch": 0.09758080910754219, "grad_norm": 0.09754758328199387, "learning_rate": 0.00019047425198453084, "loss": 1.059, "step": 240 }, { "epoch": 0.09798739581215694, "grad_norm": 0.09261766821146011, "learning_rate": 0.00019043354365967842, "loss": 1.0912, "step": 241 }, { "epoch": 0.0983939825167717, "grad_norm": 0.08637125045061111, "learning_rate": 0.000190392835334826, "loss": 0.8925, "step": 242 }, { "epoch": 0.09880056922138646, "grad_norm": 0.0962812602519989, "learning_rate": 0.00019035212700997355, "loss": 1.0435, "step": 243 }, { "epoch": 0.09920715592600123, "grad_norm": 0.09047430753707886, "learning_rate": 0.00019031141868512113, "loss": 1.0787, "step": 244 }, { "epoch": 0.09961374263061598, "grad_norm": 0.09183438867330551, "learning_rate": 0.0001902707103602687, "loss": 0.9338, "step": 245 }, { "epoch": 0.10002032933523074, "grad_norm": 0.09977632761001587, "learning_rate": 0.00019023000203541624, "loss": 1.1605, "step": 246 }, { "epoch": 0.10042691603984549, "grad_norm": 0.10386580228805542, "learning_rate": 0.00019018929371056382, "loss": 1.0493, "step": 247 }, { "epoch": 0.10083350274446026, "grad_norm": 0.09106533974409103, "learning_rate": 0.00019014858538571138, "loss": 0.9891, "step": 248 }, { "epoch": 0.10124008944907502, "grad_norm": 0.09407884627580643, "learning_rate": 0.00019010787706085896, "loss": 1.0367, "step": 249 }, { "epoch": 0.10164667615368977, "grad_norm": 0.10133463889360428, "learning_rate": 0.0001900671687360065, "loss": 1.0743, "step": 250 }, { "epoch": 0.10205326285830453, "grad_norm": 0.11877205967903137, "learning_rate": 0.0001900264604111541, "loss": 1.1572, "step": 251 }, { "epoch": 0.10245984956291929, "grad_norm": 0.10216309130191803, "learning_rate": 0.00018998575208630165, "loss": 1.0687, "step": 252 }, { "epoch": 0.10286643626753406, "grad_norm": 0.09023922681808472, "learning_rate": 0.0001899450437614492, "loss": 0.9153, "step": 253 }, { "epoch": 0.10327302297214881, "grad_norm": 0.09972742944955826, "learning_rate": 0.0001899043354365968, "loss": 0.9059, "step": 254 }, { "epoch": 0.10367960967676357, "grad_norm": 0.1175752505660057, "learning_rate": 0.00018986362711174436, "loss": 1.0659, "step": 255 }, { "epoch": 0.10408619638137832, "grad_norm": 0.09030337631702423, "learning_rate": 0.00018982291878689195, "loss": 0.9577, "step": 256 }, { "epoch": 0.1044927830859931, "grad_norm": 0.08850797265768051, "learning_rate": 0.0001897822104620395, "loss": 0.9193, "step": 257 }, { "epoch": 0.10489936979060785, "grad_norm": 1767.7669677734375, "learning_rate": 0.00018974150213718705, "loss": 0.9977, "step": 258 }, { "epoch": 0.1053059564952226, "grad_norm": 0.11435185372829437, "learning_rate": 0.00018970079381233463, "loss": 1.0468, "step": 259 }, { "epoch": 0.10571254319983736, "grad_norm": 0.10342080891132355, "learning_rate": 0.0001896600854874822, "loss": 1.0119, "step": 260 }, { "epoch": 0.10611912990445213, "grad_norm": 0.11568263173103333, "learning_rate": 0.00018961937716262977, "loss": 1.025, "step": 261 }, { "epoch": 0.10652571660906689, "grad_norm": 0.12752321362495422, "learning_rate": 0.00018957866883777732, "loss": 1.1283, "step": 262 }, { "epoch": 0.10693230331368164, "grad_norm": 0.10688795894384384, "learning_rate": 0.0001895379605129249, "loss": 0.9052, "step": 263 }, { "epoch": 0.1073388900182964, "grad_norm": 0.10426552593708038, "learning_rate": 0.00018949725218807246, "loss": 0.9556, "step": 264 }, { "epoch": 0.10774547672291115, "grad_norm": 0.09953362494707108, "learning_rate": 0.00018945654386322004, "loss": 1.0734, "step": 265 }, { "epoch": 0.10815206342752592, "grad_norm": 0.09143470227718353, "learning_rate": 0.00018941583553836762, "loss": 1.0063, "step": 266 }, { "epoch": 0.10855865013214068, "grad_norm": 0.10831563919782639, "learning_rate": 0.00018937512721351518, "loss": 1.011, "step": 267 }, { "epoch": 0.10896523683675544, "grad_norm": 0.10352573543787003, "learning_rate": 0.00018933441888866276, "loss": 1.0625, "step": 268 }, { "epoch": 0.10937182354137019, "grad_norm": 0.09499429166316986, "learning_rate": 0.0001892937105638103, "loss": 0.8775, "step": 269 }, { "epoch": 0.10977841024598496, "grad_norm": 0.10296636819839478, "learning_rate": 0.00018925300223895787, "loss": 0.985, "step": 270 }, { "epoch": 0.11018499695059972, "grad_norm": 0.10464894771575928, "learning_rate": 0.00018921229391410545, "loss": 1.0051, "step": 271 }, { "epoch": 0.11059158365521447, "grad_norm": 0.09429532289505005, "learning_rate": 0.000189171585589253, "loss": 0.9793, "step": 272 }, { "epoch": 0.11099817035982923, "grad_norm": 0.09751992672681808, "learning_rate": 0.00018913087726440058, "loss": 1.0756, "step": 273 }, { "epoch": 0.111404757064444, "grad_norm": 0.11418993026018143, "learning_rate": 0.00018909016893954814, "loss": 1.0742, "step": 274 }, { "epoch": 0.11181134376905875, "grad_norm": 0.10320629924535751, "learning_rate": 0.00018904946061469572, "loss": 1.036, "step": 275 }, { "epoch": 0.11221793047367351, "grad_norm": 0.09697311371564865, "learning_rate": 0.00018900875228984327, "loss": 1.0317, "step": 276 }, { "epoch": 0.11262451717828827, "grad_norm": 0.09579788893461227, "learning_rate": 0.00018896804396499085, "loss": 0.9621, "step": 277 }, { "epoch": 0.11303110388290302, "grad_norm": 0.09918879717588425, "learning_rate": 0.00018892733564013843, "loss": 1.0292, "step": 278 }, { "epoch": 0.11343769058751779, "grad_norm": 0.0923212468624115, "learning_rate": 0.000188886627315286, "loss": 1.0611, "step": 279 }, { "epoch": 0.11384427729213255, "grad_norm": 0.09480055421590805, "learning_rate": 0.00018884591899043357, "loss": 0.9809, "step": 280 }, { "epoch": 0.1142508639967473, "grad_norm": 0.09431526064872742, "learning_rate": 0.00018880521066558112, "loss": 1.0326, "step": 281 }, { "epoch": 0.11465745070136206, "grad_norm": 0.09080514311790466, "learning_rate": 0.00018876450234072868, "loss": 0.9115, "step": 282 }, { "epoch": 0.11506403740597683, "grad_norm": 0.10855970531702042, "learning_rate": 0.00018872379401587626, "loss": 1.0422, "step": 283 }, { "epoch": 0.11547062411059159, "grad_norm": 0.0941060334444046, "learning_rate": 0.0001886830856910238, "loss": 1.0352, "step": 284 }, { "epoch": 0.11587721081520634, "grad_norm": 0.08903583139181137, "learning_rate": 0.0001886423773661714, "loss": 0.964, "step": 285 }, { "epoch": 0.1162837975198211, "grad_norm": 0.08521820604801178, "learning_rate": 0.00018860166904131895, "loss": 0.917, "step": 286 }, { "epoch": 0.11669038422443587, "grad_norm": 0.1058691143989563, "learning_rate": 0.00018856096071646653, "loss": 1.0375, "step": 287 }, { "epoch": 0.11709697092905062, "grad_norm": 0.09435714781284332, "learning_rate": 0.0001885202523916141, "loss": 0.9766, "step": 288 }, { "epoch": 0.11750355763366538, "grad_norm": 0.09868729114532471, "learning_rate": 0.00018847954406676166, "loss": 1.1059, "step": 289 }, { "epoch": 0.11791014433828013, "grad_norm": 0.08855635672807693, "learning_rate": 0.00018843883574190924, "loss": 0.9424, "step": 290 }, { "epoch": 0.11831673104289489, "grad_norm": 0.09142837673425674, "learning_rate": 0.0001883981274170568, "loss": 1.0425, "step": 291 }, { "epoch": 0.11872331774750966, "grad_norm": 0.0971277505159378, "learning_rate": 0.00018835741909220438, "loss": 1.108, "step": 292 }, { "epoch": 0.11912990445212442, "grad_norm": 0.09940122812986374, "learning_rate": 0.00018831671076735193, "loss": 1.0172, "step": 293 }, { "epoch": 0.11953649115673917, "grad_norm": 0.10263317078351974, "learning_rate": 0.0001882760024424995, "loss": 1.0956, "step": 294 }, { "epoch": 0.11994307786135393, "grad_norm": 0.1092846542596817, "learning_rate": 0.00018823529411764707, "loss": 0.9454, "step": 295 }, { "epoch": 0.1203496645659687, "grad_norm": 0.10364726930856705, "learning_rate": 0.00018819458579279462, "loss": 0.8884, "step": 296 }, { "epoch": 0.12075625127058345, "grad_norm": 0.0889100730419159, "learning_rate": 0.0001881538774679422, "loss": 0.9922, "step": 297 }, { "epoch": 0.12116283797519821, "grad_norm": 0.09209653735160828, "learning_rate": 0.00018811316914308976, "loss": 0.977, "step": 298 }, { "epoch": 0.12156942467981297, "grad_norm": 0.11542046815156937, "learning_rate": 0.00018807246081823734, "loss": 1.0694, "step": 299 }, { "epoch": 0.12197601138442773, "grad_norm": 0.10896503180265427, "learning_rate": 0.00018803175249338492, "loss": 1.0508, "step": 300 }, { "epoch": 0.12238259808904249, "grad_norm": 0.09302002936601639, "learning_rate": 0.00018799104416853248, "loss": 1.0512, "step": 301 }, { "epoch": 0.12278918479365725, "grad_norm": 0.09081271290779114, "learning_rate": 0.00018795033584368006, "loss": 0.9688, "step": 302 }, { "epoch": 0.123195771498272, "grad_norm": 0.1059931218624115, "learning_rate": 0.0001879096275188276, "loss": 1.0483, "step": 303 }, { "epoch": 0.12360235820288677, "grad_norm": 0.1018669605255127, "learning_rate": 0.0001878689191939752, "loss": 1.019, "step": 304 }, { "epoch": 0.12400894490750153, "grad_norm": 0.1040007546544075, "learning_rate": 0.00018782821086912275, "loss": 1.037, "step": 305 }, { "epoch": 0.12441553161211628, "grad_norm": 0.10204601287841797, "learning_rate": 0.0001877875025442703, "loss": 0.9816, "step": 306 }, { "epoch": 0.12482211831673104, "grad_norm": 0.10591764748096466, "learning_rate": 0.00018774679421941788, "loss": 1.0939, "step": 307 }, { "epoch": 0.1252287050213458, "grad_norm": 0.09306305646896362, "learning_rate": 0.00018770608589456544, "loss": 1.0476, "step": 308 }, { "epoch": 0.12563529172596055, "grad_norm": 11.22681713104248, "learning_rate": 0.00018766537756971302, "loss": 1.0573, "step": 309 }, { "epoch": 0.12604187843057532, "grad_norm": 0.09422402083873749, "learning_rate": 0.00018762466924486057, "loss": 0.9993, "step": 310 }, { "epoch": 0.1264484651351901, "grad_norm": 0.0982229933142662, "learning_rate": 0.00018758396092000815, "loss": 0.9159, "step": 311 }, { "epoch": 0.12685505183980483, "grad_norm": 0.12579265236854553, "learning_rate": 0.00018754325259515573, "loss": 1.0935, "step": 312 }, { "epoch": 0.1272616385444196, "grad_norm": 0.10069390386343002, "learning_rate": 0.0001875025442703033, "loss": 1.0127, "step": 313 }, { "epoch": 0.12766822524903434, "grad_norm": 0.10948827862739563, "learning_rate": 0.00018746183594545087, "loss": 1.0576, "step": 314 }, { "epoch": 0.12807481195364911, "grad_norm": 0.09232445061206818, "learning_rate": 0.00018742112762059842, "loss": 0.9856, "step": 315 }, { "epoch": 0.12848139865826388, "grad_norm": 0.08319563418626785, "learning_rate": 0.000187380419295746, "loss": 0.9172, "step": 316 }, { "epoch": 0.12888798536287863, "grad_norm": 0.09697309136390686, "learning_rate": 0.00018733971097089356, "loss": 1.0567, "step": 317 }, { "epoch": 0.1292945720674934, "grad_norm": 0.09254255145788193, "learning_rate": 0.0001872990026460411, "loss": 1.0177, "step": 318 }, { "epoch": 0.12970115877210814, "grad_norm": 0.09254108369350433, "learning_rate": 0.0001872582943211887, "loss": 1.0079, "step": 319 }, { "epoch": 0.1301077454767229, "grad_norm": 0.09095866233110428, "learning_rate": 0.00018721758599633625, "loss": 1.0633, "step": 320 }, { "epoch": 0.13051433218133768, "grad_norm": 0.09073010087013245, "learning_rate": 0.00018717687767148383, "loss": 0.9059, "step": 321 }, { "epoch": 0.13092091888595242, "grad_norm": 0.09842764586210251, "learning_rate": 0.00018713616934663138, "loss": 1.0766, "step": 322 }, { "epoch": 0.1313275055905672, "grad_norm": 0.09325529634952545, "learning_rate": 0.00018709546102177896, "loss": 1.066, "step": 323 }, { "epoch": 0.13173409229518196, "grad_norm": 0.09692969918251038, "learning_rate": 0.00018705475269692654, "loss": 0.9743, "step": 324 }, { "epoch": 0.1321406789997967, "grad_norm": 0.09432708472013474, "learning_rate": 0.0001870140443720741, "loss": 1.0141, "step": 325 }, { "epoch": 0.13254726570441147, "grad_norm": 0.09226994961500168, "learning_rate": 0.00018697333604722168, "loss": 0.9837, "step": 326 }, { "epoch": 0.1329538524090262, "grad_norm": 0.10843974351882935, "learning_rate": 0.00018693262772236923, "loss": 1.0248, "step": 327 }, { "epoch": 0.13336043911364098, "grad_norm": 0.09324774891138077, "learning_rate": 0.00018689191939751681, "loss": 1.0642, "step": 328 }, { "epoch": 0.13376702581825575, "grad_norm": 0.08934729546308517, "learning_rate": 0.00018685121107266437, "loss": 0.9792, "step": 329 }, { "epoch": 0.1341736125228705, "grad_norm": 0.09125274419784546, "learning_rate": 0.00018681050274781192, "loss": 1.0093, "step": 330 }, { "epoch": 0.13458019922748526, "grad_norm": 0.09645108133554459, "learning_rate": 0.0001867697944229595, "loss": 0.9503, "step": 331 }, { "epoch": 0.13498678593210003, "grad_norm": 0.09900861978530884, "learning_rate": 0.00018672908609810706, "loss": 0.9966, "step": 332 }, { "epoch": 0.13539337263671478, "grad_norm": 0.09018311649560928, "learning_rate": 0.00018668837777325464, "loss": 0.965, "step": 333 }, { "epoch": 0.13579995934132955, "grad_norm": 0.10296136885881424, "learning_rate": 0.00018664766944840222, "loss": 1.1011, "step": 334 }, { "epoch": 0.1362065460459443, "grad_norm": 0.09104129672050476, "learning_rate": 0.00018660696112354977, "loss": 0.9814, "step": 335 }, { "epoch": 0.13661313275055906, "grad_norm": 0.09881450235843658, "learning_rate": 0.00018656625279869736, "loss": 1.0989, "step": 336 }, { "epoch": 0.13701971945517383, "grad_norm": 0.09691241383552551, "learning_rate": 0.0001865255444738449, "loss": 1.0967, "step": 337 }, { "epoch": 0.13742630615978857, "grad_norm": 0.10152243077754974, "learning_rate": 0.0001864848361489925, "loss": 1.0951, "step": 338 }, { "epoch": 0.13783289286440334, "grad_norm": 0.10802541673183441, "learning_rate": 0.00018644412782414005, "loss": 0.8742, "step": 339 }, { "epoch": 0.13823947956901808, "grad_norm": 0.09942565858364105, "learning_rate": 0.0001864034194992876, "loss": 0.9961, "step": 340 }, { "epoch": 0.13864606627363285, "grad_norm": 0.08618199825286865, "learning_rate": 0.00018636271117443518, "loss": 0.9645, "step": 341 }, { "epoch": 0.13905265297824762, "grad_norm": 0.1056099608540535, "learning_rate": 0.00018632200284958273, "loss": 0.9885, "step": 342 }, { "epoch": 0.13945923968286236, "grad_norm": 0.08862382173538208, "learning_rate": 0.00018628129452473032, "loss": 0.9316, "step": 343 }, { "epoch": 0.13986582638747713, "grad_norm": 0.09923135489225388, "learning_rate": 0.00018624058619987787, "loss": 0.9959, "step": 344 }, { "epoch": 0.1402724130920919, "grad_norm": 0.09120538830757141, "learning_rate": 0.00018619987787502545, "loss": 0.968, "step": 345 }, { "epoch": 0.14067899979670664, "grad_norm": 0.09669141471385956, "learning_rate": 0.00018615916955017303, "loss": 1.085, "step": 346 }, { "epoch": 0.1410855865013214, "grad_norm": 0.08598754554986954, "learning_rate": 0.00018611846122532059, "loss": 0.9504, "step": 347 }, { "epoch": 0.14149217320593616, "grad_norm": 0.09238371253013611, "learning_rate": 0.00018607775290046817, "loss": 0.9742, "step": 348 }, { "epoch": 0.14189875991055093, "grad_norm": 0.091258205473423, "learning_rate": 0.00018603704457561572, "loss": 0.9341, "step": 349 }, { "epoch": 0.1423053466151657, "grad_norm": 0.10129548609256744, "learning_rate": 0.0001859963362507633, "loss": 1.0814, "step": 350 }, { "epoch": 0.14271193331978044, "grad_norm": 0.09523019194602966, "learning_rate": 0.00018595562792591086, "loss": 0.9848, "step": 351 }, { "epoch": 0.1431185200243952, "grad_norm": 0.09485248476266861, "learning_rate": 0.0001859149196010584, "loss": 0.9828, "step": 352 }, { "epoch": 0.14352510672900995, "grad_norm": 0.09963666647672653, "learning_rate": 0.000185874211276206, "loss": 1.1075, "step": 353 }, { "epoch": 0.14393169343362472, "grad_norm": 0.09067155420780182, "learning_rate": 0.00018583350295135355, "loss": 0.971, "step": 354 }, { "epoch": 0.1443382801382395, "grad_norm": 0.09153544157743454, "learning_rate": 0.00018579279462650113, "loss": 0.9405, "step": 355 }, { "epoch": 0.14474486684285423, "grad_norm": 0.1024472787976265, "learning_rate": 0.00018575208630164868, "loss": 0.9967, "step": 356 }, { "epoch": 0.145151453547469, "grad_norm": 0.09804495424032211, "learning_rate": 0.00018571137797679626, "loss": 0.9578, "step": 357 }, { "epoch": 0.14555804025208377, "grad_norm": 0.099054716527462, "learning_rate": 0.00018567066965194384, "loss": 0.9999, "step": 358 }, { "epoch": 0.1459646269566985, "grad_norm": 0.09781336784362793, "learning_rate": 0.0001856299613270914, "loss": 1.09, "step": 359 }, { "epoch": 0.14637121366131328, "grad_norm": 0.08993211388587952, "learning_rate": 0.00018558925300223898, "loss": 1.0719, "step": 360 }, { "epoch": 0.14677780036592802, "grad_norm": 0.09146003425121307, "learning_rate": 0.00018554854467738653, "loss": 1.0008, "step": 361 }, { "epoch": 0.1471843870705428, "grad_norm": 0.09643495827913284, "learning_rate": 0.00018550783635253411, "loss": 1.0791, "step": 362 }, { "epoch": 0.14759097377515756, "grad_norm": 0.09078676998615265, "learning_rate": 0.00018546712802768167, "loss": 0.8641, "step": 363 }, { "epoch": 0.1479975604797723, "grad_norm": 0.08719085901975632, "learning_rate": 0.00018542641970282922, "loss": 0.985, "step": 364 }, { "epoch": 0.14840414718438708, "grad_norm": 0.09189736843109131, "learning_rate": 0.0001853857113779768, "loss": 0.9638, "step": 365 }, { "epoch": 0.14881073388900182, "grad_norm": 0.09381456673145294, "learning_rate": 0.00018534500305312436, "loss": 1.0036, "step": 366 }, { "epoch": 0.1492173205936166, "grad_norm": 0.0922684445977211, "learning_rate": 0.00018530429472827194, "loss": 1.0391, "step": 367 }, { "epoch": 0.14962390729823136, "grad_norm": 0.09465248882770538, "learning_rate": 0.0001852635864034195, "loss": 0.8874, "step": 368 }, { "epoch": 0.1500304940028461, "grad_norm": 0.0938408225774765, "learning_rate": 0.00018522287807856707, "loss": 1.0269, "step": 369 }, { "epoch": 0.15043708070746087, "grad_norm": 0.09377933293581009, "learning_rate": 0.00018518216975371466, "loss": 1.0142, "step": 370 }, { "epoch": 0.15084366741207564, "grad_norm": 0.1117277517914772, "learning_rate": 0.0001851414614288622, "loss": 1.0371, "step": 371 }, { "epoch": 0.15125025411669038, "grad_norm": 0.10293183475732803, "learning_rate": 0.0001851007531040098, "loss": 1.0, "step": 372 }, { "epoch": 0.15165684082130515, "grad_norm": 0.09216313809156418, "learning_rate": 0.00018506004477915734, "loss": 0.9703, "step": 373 }, { "epoch": 0.1520634275259199, "grad_norm": 0.09088669717311859, "learning_rate": 0.00018501933645430493, "loss": 0.8766, "step": 374 }, { "epoch": 0.15247001423053466, "grad_norm": 0.09916643798351288, "learning_rate": 0.00018497862812945248, "loss": 1.0958, "step": 375 }, { "epoch": 0.15287660093514943, "grad_norm": 0.08404985070228577, "learning_rate": 0.00018493791980460003, "loss": 0.9602, "step": 376 }, { "epoch": 0.15328318763976417, "grad_norm": 0.10011377185583115, "learning_rate": 0.00018489721147974762, "loss": 1.0377, "step": 377 }, { "epoch": 0.15368977434437894, "grad_norm": 0.09958089143037796, "learning_rate": 0.00018485650315489517, "loss": 1.0213, "step": 378 }, { "epoch": 0.15409636104899369, "grad_norm": 0.09488838911056519, "learning_rate": 0.00018481579483004275, "loss": 0.941, "step": 379 }, { "epoch": 0.15450294775360846, "grad_norm": 0.09099314361810684, "learning_rate": 0.00018477508650519033, "loss": 0.8913, "step": 380 }, { "epoch": 0.15490953445822322, "grad_norm": 0.0956854447722435, "learning_rate": 0.00018473437818033789, "loss": 1.1478, "step": 381 }, { "epoch": 0.15531612116283797, "grad_norm": 0.11225584149360657, "learning_rate": 0.00018469366985548547, "loss": 1.0795, "step": 382 }, { "epoch": 0.15572270786745274, "grad_norm": 0.11592987924814224, "learning_rate": 0.00018465296153063302, "loss": 1.0863, "step": 383 }, { "epoch": 0.1561292945720675, "grad_norm": 0.09232570976018906, "learning_rate": 0.0001846122532057806, "loss": 0.9551, "step": 384 }, { "epoch": 0.15653588127668225, "grad_norm": 0.08860056847333908, "learning_rate": 0.00018457154488092816, "loss": 1.0206, "step": 385 }, { "epoch": 0.15694246798129702, "grad_norm": 0.10788331180810928, "learning_rate": 0.00018453083655607574, "loss": 0.9378, "step": 386 }, { "epoch": 0.15734905468591176, "grad_norm": 0.10758615285158157, "learning_rate": 0.0001844901282312233, "loss": 1.1149, "step": 387 }, { "epoch": 0.15775564139052653, "grad_norm": 0.10551386326551437, "learning_rate": 0.00018444941990637085, "loss": 1.0729, "step": 388 }, { "epoch": 0.1581622280951413, "grad_norm": 0.08733198046684265, "learning_rate": 0.00018440871158151843, "loss": 1.0058, "step": 389 }, { "epoch": 0.15856881479975604, "grad_norm": 0.1095399409532547, "learning_rate": 0.00018436800325666598, "loss": 1.0566, "step": 390 }, { "epoch": 0.1589754015043708, "grad_norm": 0.12356330454349518, "learning_rate": 0.00018432729493181356, "loss": 1.0173, "step": 391 }, { "epoch": 0.15938198820898555, "grad_norm": 0.09934639930725098, "learning_rate": 0.00018428658660696114, "loss": 1.1237, "step": 392 }, { "epoch": 0.15978857491360032, "grad_norm": 0.09402013570070267, "learning_rate": 0.0001842458782821087, "loss": 1.0018, "step": 393 }, { "epoch": 0.1601951616182151, "grad_norm": 0.10511749237775803, "learning_rate": 0.00018420516995725628, "loss": 0.9844, "step": 394 }, { "epoch": 0.16060174832282983, "grad_norm": 0.11193688213825226, "learning_rate": 0.00018416446163240383, "loss": 0.9888, "step": 395 }, { "epoch": 0.1610083350274446, "grad_norm": 0.09895443916320801, "learning_rate": 0.00018412375330755141, "loss": 1.1045, "step": 396 }, { "epoch": 0.16141492173205937, "grad_norm": 0.09660319238901138, "learning_rate": 0.00018408304498269897, "loss": 1.0457, "step": 397 }, { "epoch": 0.16182150843667412, "grad_norm": 0.1339186728000641, "learning_rate": 0.00018404233665784655, "loss": 1.1266, "step": 398 }, { "epoch": 0.16222809514128889, "grad_norm": 0.1154564693570137, "learning_rate": 0.0001840016283329941, "loss": 1.0299, "step": 399 }, { "epoch": 0.16263468184590363, "grad_norm": 0.09698904305696487, "learning_rate": 0.00018396092000814166, "loss": 1.1101, "step": 400 }, { "epoch": 0.1630412685505184, "grad_norm": 0.09455164521932602, "learning_rate": 0.00018392021168328924, "loss": 0.9928, "step": 401 }, { "epoch": 0.16344785525513317, "grad_norm": 0.09728690981864929, "learning_rate": 0.0001838795033584368, "loss": 1.0603, "step": 402 }, { "epoch": 0.1638544419597479, "grad_norm": 0.10577269643545151, "learning_rate": 0.0001838387950335844, "loss": 0.9922, "step": 403 }, { "epoch": 0.16426102866436268, "grad_norm": 0.08850935101509094, "learning_rate": 0.00018379808670873196, "loss": 0.9758, "step": 404 }, { "epoch": 0.16466761536897742, "grad_norm": 0.09496256709098816, "learning_rate": 0.0001837573783838795, "loss": 1.0949, "step": 405 }, { "epoch": 0.1650742020735922, "grad_norm": 0.09768050909042358, "learning_rate": 0.0001837166700590271, "loss": 1.0054, "step": 406 }, { "epoch": 0.16548078877820696, "grad_norm": 0.09913921356201172, "learning_rate": 0.00018367596173417464, "loss": 1.0272, "step": 407 }, { "epoch": 0.1658873754828217, "grad_norm": 0.0901927724480629, "learning_rate": 0.00018363525340932223, "loss": 1.0264, "step": 408 }, { "epoch": 0.16629396218743647, "grad_norm": 0.09796515852212906, "learning_rate": 0.00018359454508446978, "loss": 1.0338, "step": 409 }, { "epoch": 0.16670054889205124, "grad_norm": 0.1018638014793396, "learning_rate": 0.00018355383675961736, "loss": 1.0409, "step": 410 }, { "epoch": 0.16710713559666598, "grad_norm": 0.10666611790657043, "learning_rate": 0.00018351312843476492, "loss": 1.0924, "step": 411 }, { "epoch": 0.16751372230128075, "grad_norm": 0.0986141785979271, "learning_rate": 0.00018347242010991247, "loss": 0.9468, "step": 412 }, { "epoch": 0.1679203090058955, "grad_norm": 0.09429168701171875, "learning_rate": 0.00018343171178506005, "loss": 0.9706, "step": 413 }, { "epoch": 0.16832689571051027, "grad_norm": 0.09704872965812683, "learning_rate": 0.0001833910034602076, "loss": 1.0692, "step": 414 }, { "epoch": 0.16873348241512504, "grad_norm": 0.0980519950389862, "learning_rate": 0.00018335029513535519, "loss": 1.0218, "step": 415 }, { "epoch": 0.16914006911973978, "grad_norm": 0.08980212360620499, "learning_rate": 0.00018330958681050277, "loss": 0.9243, "step": 416 }, { "epoch": 0.16954665582435455, "grad_norm": 0.09630506485700607, "learning_rate": 0.00018326887848565032, "loss": 0.9599, "step": 417 }, { "epoch": 0.1699532425289693, "grad_norm": 0.08608522266149521, "learning_rate": 0.0001832281701607979, "loss": 0.9577, "step": 418 }, { "epoch": 0.17035982923358406, "grad_norm": 0.09151248633861542, "learning_rate": 0.00018318746183594546, "loss": 0.9956, "step": 419 }, { "epoch": 0.17076641593819883, "grad_norm": 0.09689094871282578, "learning_rate": 0.00018314675351109304, "loss": 1.0999, "step": 420 }, { "epoch": 0.17117300264281357, "grad_norm": 0.09316612035036087, "learning_rate": 0.0001831060451862406, "loss": 0.8572, "step": 421 }, { "epoch": 0.17157958934742834, "grad_norm": 0.11449979990720749, "learning_rate": 0.00018306533686138817, "loss": 1.0328, "step": 422 }, { "epoch": 0.1719861760520431, "grad_norm": 0.10802194476127625, "learning_rate": 0.00018302462853653573, "loss": 0.9785, "step": 423 }, { "epoch": 0.17239276275665785, "grad_norm": 0.09997294098138809, "learning_rate": 0.00018298392021168328, "loss": 0.9778, "step": 424 }, { "epoch": 0.17279934946127262, "grad_norm": 0.10244690626859665, "learning_rate": 0.00018294321188683086, "loss": 1.0874, "step": 425 }, { "epoch": 0.17320593616588736, "grad_norm": 0.10659472644329071, "learning_rate": 0.00018290250356197844, "loss": 1.0196, "step": 426 }, { "epoch": 0.17361252287050213, "grad_norm": 0.09812036156654358, "learning_rate": 0.000182861795237126, "loss": 0.9051, "step": 427 }, { "epoch": 0.1740191095751169, "grad_norm": 0.845235288143158, "learning_rate": 0.00018282108691227358, "loss": 1.0531, "step": 428 }, { "epoch": 0.17442569627973165, "grad_norm": 0.109995998442173, "learning_rate": 0.00018278037858742113, "loss": 1.001, "step": 429 }, { "epoch": 0.17483228298434642, "grad_norm": 0.12578758597373962, "learning_rate": 0.00018273967026256871, "loss": 0.9513, "step": 430 }, { "epoch": 0.17523886968896116, "grad_norm": 0.1585826873779297, "learning_rate": 0.00018269896193771627, "loss": 1.0091, "step": 431 }, { "epoch": 0.17564545639357593, "grad_norm": 0.15150819718837738, "learning_rate": 0.00018265825361286385, "loss": 1.1045, "step": 432 }, { "epoch": 0.1760520430981907, "grad_norm": 0.1110219806432724, "learning_rate": 0.0001826175452880114, "loss": 0.9877, "step": 433 }, { "epoch": 0.17645862980280544, "grad_norm": 0.11296675354242325, "learning_rate": 0.00018257683696315896, "loss": 1.1317, "step": 434 }, { "epoch": 0.1768652165074202, "grad_norm": 0.11464451253414154, "learning_rate": 0.00018253612863830654, "loss": 0.9485, "step": 435 }, { "epoch": 0.17727180321203498, "grad_norm": 0.08836513012647629, "learning_rate": 0.0001824954203134541, "loss": 0.8667, "step": 436 }, { "epoch": 0.17767838991664972, "grad_norm": 0.10697431862354279, "learning_rate": 0.00018245471198860167, "loss": 1.0692, "step": 437 }, { "epoch": 0.1780849766212645, "grad_norm": 0.10565032064914703, "learning_rate": 0.00018241400366374925, "loss": 1.0723, "step": 438 }, { "epoch": 0.17849156332587923, "grad_norm": 0.11343531310558319, "learning_rate": 0.0001823732953388968, "loss": 1.1038, "step": 439 }, { "epoch": 0.178898150030494, "grad_norm": 0.10002034902572632, "learning_rate": 0.0001823325870140444, "loss": 0.9859, "step": 440 }, { "epoch": 0.17930473673510877, "grad_norm": 0.10602378845214844, "learning_rate": 0.00018229187868919194, "loss": 1.1091, "step": 441 }, { "epoch": 0.1797113234397235, "grad_norm": 0.09775001555681229, "learning_rate": 0.00018225117036433953, "loss": 1.0473, "step": 442 }, { "epoch": 0.18011791014433828, "grad_norm": 0.09872320294380188, "learning_rate": 0.00018221046203948708, "loss": 1.0657, "step": 443 }, { "epoch": 0.18052449684895303, "grad_norm": 0.0893816128373146, "learning_rate": 0.00018216975371463466, "loss": 0.915, "step": 444 }, { "epoch": 0.1809310835535678, "grad_norm": 0.09870447218418121, "learning_rate": 0.00018212904538978221, "loss": 0.8847, "step": 445 }, { "epoch": 0.18133767025818257, "grad_norm": 0.09775330871343613, "learning_rate": 0.00018208833706492977, "loss": 0.841, "step": 446 }, { "epoch": 0.1817442569627973, "grad_norm": 0.10025996714830399, "learning_rate": 0.00018204762874007735, "loss": 0.9965, "step": 447 }, { "epoch": 0.18215084366741208, "grad_norm": 0.09369905292987823, "learning_rate": 0.0001820069204152249, "loss": 0.9998, "step": 448 }, { "epoch": 0.18255743037202685, "grad_norm": 0.09244808554649353, "learning_rate": 0.0001819662120903725, "loss": 0.9938, "step": 449 }, { "epoch": 0.1829640170766416, "grad_norm": 0.12163155525922775, "learning_rate": 0.00018192550376552007, "loss": 1.1384, "step": 450 }, { "epoch": 0.18337060378125636, "grad_norm": 0.08755457401275635, "learning_rate": 0.00018188479544066762, "loss": 0.9002, "step": 451 }, { "epoch": 0.1837771904858711, "grad_norm": 0.0917607769370079, "learning_rate": 0.0001818440871158152, "loss": 0.9874, "step": 452 }, { "epoch": 0.18418377719048587, "grad_norm": 0.09113719314336777, "learning_rate": 0.00018180337879096276, "loss": 1.0187, "step": 453 }, { "epoch": 0.18459036389510064, "grad_norm": 0.08795943111181259, "learning_rate": 0.00018176267046611034, "loss": 0.902, "step": 454 }, { "epoch": 0.18499695059971538, "grad_norm": 0.1016731783747673, "learning_rate": 0.0001817219621412579, "loss": 0.9933, "step": 455 }, { "epoch": 0.18540353730433015, "grad_norm": 0.09413068741559982, "learning_rate": 0.00018168125381640547, "loss": 0.9448, "step": 456 }, { "epoch": 0.18581012400894492, "grad_norm": 0.10015012323856354, "learning_rate": 0.00018164054549155303, "loss": 1.1458, "step": 457 }, { "epoch": 0.18621671071355966, "grad_norm": 0.09086768329143524, "learning_rate": 0.00018159983716670058, "loss": 1.0543, "step": 458 }, { "epoch": 0.18662329741817443, "grad_norm": 0.10910352319478989, "learning_rate": 0.00018155912884184816, "loss": 1.0078, "step": 459 }, { "epoch": 0.18702988412278918, "grad_norm": 0.09674135595560074, "learning_rate": 0.00018151842051699572, "loss": 0.9758, "step": 460 }, { "epoch": 0.18743647082740394, "grad_norm": 0.09108126163482666, "learning_rate": 0.00018147771219214332, "loss": 1.0038, "step": 461 }, { "epoch": 0.18784305753201871, "grad_norm": 0.09710326045751572, "learning_rate": 0.00018143700386729088, "loss": 0.9693, "step": 462 }, { "epoch": 0.18824964423663346, "grad_norm": 0.10069318860769272, "learning_rate": 0.00018139629554243843, "loss": 1.1005, "step": 463 }, { "epoch": 0.18865623094124823, "grad_norm": 0.09434141218662262, "learning_rate": 0.000181355587217586, "loss": 1.0359, "step": 464 }, { "epoch": 0.18906281764586297, "grad_norm": 0.09208261221647263, "learning_rate": 0.00018131487889273357, "loss": 1.0374, "step": 465 }, { "epoch": 0.18946940435047774, "grad_norm": 0.09581121802330017, "learning_rate": 0.00018127417056788115, "loss": 1.0267, "step": 466 }, { "epoch": 0.1898759910550925, "grad_norm": 0.09809669107198715, "learning_rate": 0.0001812334622430287, "loss": 1.0652, "step": 467 }, { "epoch": 0.19028257775970725, "grad_norm": 0.08496394008398056, "learning_rate": 0.00018119275391817628, "loss": 0.9468, "step": 468 }, { "epoch": 0.19068916446432202, "grad_norm": 0.09247399121522903, "learning_rate": 0.00018115204559332384, "loss": 1.0247, "step": 469 }, { "epoch": 0.1910957511689368, "grad_norm": 0.10010971128940582, "learning_rate": 0.0001811113372684714, "loss": 0.9674, "step": 470 }, { "epoch": 0.19150233787355153, "grad_norm": 0.09562191367149353, "learning_rate": 0.00018107062894361897, "loss": 0.9819, "step": 471 }, { "epoch": 0.1919089245781663, "grad_norm": 0.09223975241184235, "learning_rate": 0.00018102992061876655, "loss": 1.0051, "step": 472 }, { "epoch": 0.19231551128278104, "grad_norm": 0.09564565122127533, "learning_rate": 0.00018098921229391414, "loss": 0.908, "step": 473 }, { "epoch": 0.1927220979873958, "grad_norm": 0.09371364116668701, "learning_rate": 0.0001809485039690617, "loss": 1.0195, "step": 474 }, { "epoch": 0.19312868469201058, "grad_norm": 0.0895533412694931, "learning_rate": 0.00018090779564420924, "loss": 0.8912, "step": 475 }, { "epoch": 0.19353527139662532, "grad_norm": 0.08874888718128204, "learning_rate": 0.00018086708731935682, "loss": 0.9941, "step": 476 }, { "epoch": 0.1939418581012401, "grad_norm": 8989.1748046875, "learning_rate": 0.00018082637899450438, "loss": 1.0191, "step": 477 }, { "epoch": 0.19434844480585484, "grad_norm": 0.09893982112407684, "learning_rate": 0.00018078567066965196, "loss": 1.1682, "step": 478 }, { "epoch": 0.1947550315104696, "grad_norm": 0.09100797772407532, "learning_rate": 0.00018074496234479951, "loss": 0.9466, "step": 479 }, { "epoch": 0.19516161821508438, "grad_norm": 0.10540256649255753, "learning_rate": 0.0001807042540199471, "loss": 1.0735, "step": 480 }, { "epoch": 0.19556820491969912, "grad_norm": 0.09110235422849655, "learning_rate": 0.00018066354569509465, "loss": 1.0097, "step": 481 }, { "epoch": 0.1959747916243139, "grad_norm": 0.10651825368404388, "learning_rate": 0.0001806228373702422, "loss": 1.014, "step": 482 }, { "epoch": 0.19638137832892866, "grad_norm": 0.08685674518346786, "learning_rate": 0.00018058212904538978, "loss": 0.9755, "step": 483 }, { "epoch": 0.1967879650335434, "grad_norm": 0.10092045366764069, "learning_rate": 0.00018054142072053737, "loss": 0.9397, "step": 484 }, { "epoch": 0.19719455173815817, "grad_norm": 0.1056622639298439, "learning_rate": 0.00018050071239568495, "loss": 0.9864, "step": 485 }, { "epoch": 0.1976011384427729, "grad_norm": 0.10525202006101608, "learning_rate": 0.0001804600040708325, "loss": 1.1085, "step": 486 }, { "epoch": 0.19800772514738768, "grad_norm": 0.10073073953390121, "learning_rate": 0.00018041929574598006, "loss": 1.1264, "step": 487 }, { "epoch": 0.19841431185200245, "grad_norm": 0.09659091383218765, "learning_rate": 0.00018037858742112764, "loss": 0.9848, "step": 488 }, { "epoch": 0.1988208985566172, "grad_norm": 0.09986629337072372, "learning_rate": 0.0001803378790962752, "loss": 1.0732, "step": 489 }, { "epoch": 0.19922748526123196, "grad_norm": 0.11215290427207947, "learning_rate": 0.00018029717077142277, "loss": 1.1259, "step": 490 }, { "epoch": 0.1996340719658467, "grad_norm": 0.11136343330144882, "learning_rate": 0.00018025646244657033, "loss": 1.0857, "step": 491 }, { "epoch": 0.20004065867046147, "grad_norm": 0.10452030599117279, "learning_rate": 0.0001802157541217179, "loss": 0.9997, "step": 492 }, { "epoch": 0.20044724537507624, "grad_norm": 0.10394178330898285, "learning_rate": 0.00018017504579686546, "loss": 1.0852, "step": 493 }, { "epoch": 0.20085383207969099, "grad_norm": 0.10206598043441772, "learning_rate": 0.00018013433747201302, "loss": 0.9629, "step": 494 }, { "epoch": 0.20126041878430576, "grad_norm": 0.09365608543157578, "learning_rate": 0.00018009362914716062, "loss": 0.9504, "step": 495 }, { "epoch": 0.20166700548892053, "grad_norm": 0.09425178170204163, "learning_rate": 0.00018005292082230818, "loss": 1.0038, "step": 496 }, { "epoch": 0.20207359219353527, "grad_norm": 0.09562011808156967, "learning_rate": 0.00018001221249745576, "loss": 1.0877, "step": 497 }, { "epoch": 0.20248017889815004, "grad_norm": 0.11452426016330719, "learning_rate": 0.0001799715041726033, "loss": 1.0688, "step": 498 }, { "epoch": 0.20288676560276478, "grad_norm": 0.0930696651339531, "learning_rate": 0.00017993079584775087, "loss": 1.0255, "step": 499 }, { "epoch": 0.20329335230737955, "grad_norm": 0.10522327572107315, "learning_rate": 0.00017989008752289845, "loss": 1.085, "step": 500 } ], "logging_steps": 1, "max_steps": 4918, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5824065174102671e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }