{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2094, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004775549188156638, "grad_norm": 6.101418972015381, "learning_rate": 0.0, "loss": 1.9495, "step": 1 }, { "epoch": 0.0009551098376313276, "grad_norm": 5.78735876083374, "learning_rate": 7.936507936507937e-08, "loss": 1.5757, "step": 2 }, { "epoch": 0.0014326647564469914, "grad_norm": 5.225668430328369, "learning_rate": 1.5873015873015874e-07, "loss": 1.7423, "step": 3 }, { "epoch": 0.0019102196752626551, "grad_norm": 5.929269313812256, "learning_rate": 2.3809523809523811e-07, "loss": 1.9347, "step": 4 }, { "epoch": 0.002387774594078319, "grad_norm": 5.780717372894287, "learning_rate": 3.174603174603175e-07, "loss": 1.7121, "step": 5 }, { "epoch": 0.0028653295128939827, "grad_norm": 5.32647180557251, "learning_rate": 3.9682539682539683e-07, "loss": 1.2609, "step": 6 }, { "epoch": 0.0033428844317096467, "grad_norm": 5.921090602874756, "learning_rate": 4.7619047619047623e-07, "loss": 2.1124, "step": 7 }, { "epoch": 0.0038204393505253103, "grad_norm": 6.8896484375, "learning_rate": 5.555555555555555e-07, "loss": 2.102, "step": 8 }, { "epoch": 0.004297994269340974, "grad_norm": 6.178328514099121, "learning_rate": 6.34920634920635e-07, "loss": 1.5818, "step": 9 }, { "epoch": 0.004775549188156638, "grad_norm": 6.984008312225342, "learning_rate": 7.142857142857143e-07, "loss": 1.0676, "step": 10 }, { "epoch": 0.0052531041069723014, "grad_norm": 5.935889720916748, "learning_rate": 7.936507936507937e-07, "loss": 1.9521, "step": 11 }, { "epoch": 0.0057306590257879654, "grad_norm": 6.256581783294678, "learning_rate": 8.73015873015873e-07, "loss": 2.1447, "step": 12 }, { "epoch": 0.0062082139446036294, "grad_norm": 4.7052178382873535, "learning_rate": 9.523809523809525e-07, "loss": 2.2073, "step": 13 }, { "epoch": 
0.0066857688634192934, "grad_norm": 5.936465740203857, "learning_rate": 1.0317460317460317e-06, "loss": 1.3949, "step": 14 }, { "epoch": 0.0071633237822349575, "grad_norm": 5.717200756072998, "learning_rate": 1.111111111111111e-06, "loss": 1.7237, "step": 15 }, { "epoch": 0.007640878701050621, "grad_norm": 4.9925971031188965, "learning_rate": 1.1904761904761906e-06, "loss": 1.8604, "step": 16 }, { "epoch": 0.008118433619866285, "grad_norm": 5.326063632965088, "learning_rate": 1.26984126984127e-06, "loss": 1.135, "step": 17 }, { "epoch": 0.008595988538681949, "grad_norm": 6.284627914428711, "learning_rate": 1.3492063492063493e-06, "loss": 1.2629, "step": 18 }, { "epoch": 0.009073543457497613, "grad_norm": 6.064401626586914, "learning_rate": 1.4285714285714286e-06, "loss": 1.421, "step": 19 }, { "epoch": 0.009551098376313277, "grad_norm": 5.221789836883545, "learning_rate": 1.507936507936508e-06, "loss": 2.1131, "step": 20 }, { "epoch": 0.01002865329512894, "grad_norm": 6.2354230880737305, "learning_rate": 1.5873015873015873e-06, "loss": 0.9273, "step": 21 }, { "epoch": 0.010506208213944603, "grad_norm": 5.157545566558838, "learning_rate": 1.6666666666666667e-06, "loss": 1.4774, "step": 22 }, { "epoch": 0.010983763132760267, "grad_norm": 5.837510585784912, "learning_rate": 1.746031746031746e-06, "loss": 1.6264, "step": 23 }, { "epoch": 0.011461318051575931, "grad_norm": 6.118061542510986, "learning_rate": 1.8253968253968254e-06, "loss": 1.769, "step": 24 }, { "epoch": 0.011938872970391595, "grad_norm": 5.913543224334717, "learning_rate": 1.904761904761905e-06, "loss": 1.3777, "step": 25 }, { "epoch": 0.012416427889207259, "grad_norm": 5.211728572845459, "learning_rate": 1.984126984126984e-06, "loss": 2.3669, "step": 26 }, { "epoch": 0.012893982808022923, "grad_norm": 4.883838176727295, "learning_rate": 2.0634920634920634e-06, "loss": 1.1752, "step": 27 }, { "epoch": 0.013371537726838587, "grad_norm": 5.117195129394531, "learning_rate": 2.1428571428571427e-06, "loss": 
1.4877, "step": 28 }, { "epoch": 0.013849092645654251, "grad_norm": 4.2314348220825195, "learning_rate": 2.222222222222222e-06, "loss": 1.3085, "step": 29 }, { "epoch": 0.014326647564469915, "grad_norm": 5.82399845123291, "learning_rate": 2.301587301587302e-06, "loss": 1.5445, "step": 30 }, { "epoch": 0.014804202483285577, "grad_norm": 5.519792079925537, "learning_rate": 2.380952380952381e-06, "loss": 1.5183, "step": 31 }, { "epoch": 0.015281757402101241, "grad_norm": 5.116567611694336, "learning_rate": 2.4603174603174605e-06, "loss": 2.6434, "step": 32 }, { "epoch": 0.015759312320916905, "grad_norm": 6.218582630157471, "learning_rate": 2.53968253968254e-06, "loss": 2.2786, "step": 33 }, { "epoch": 0.01623686723973257, "grad_norm": 4.389199733734131, "learning_rate": 2.6190476190476192e-06, "loss": 0.984, "step": 34 }, { "epoch": 0.016714422158548233, "grad_norm": 5.397778034210205, "learning_rate": 2.6984126984126986e-06, "loss": 1.8271, "step": 35 }, { "epoch": 0.017191977077363897, "grad_norm": 4.634466171264648, "learning_rate": 2.7777777777777783e-06, "loss": 1.803, "step": 36 }, { "epoch": 0.01766953199617956, "grad_norm": 5.947023391723633, "learning_rate": 2.8571428571428573e-06, "loss": 1.2389, "step": 37 }, { "epoch": 0.018147086914995225, "grad_norm": 4.787606239318848, "learning_rate": 2.936507936507937e-06, "loss": 1.7263, "step": 38 }, { "epoch": 0.01862464183381089, "grad_norm": 5.475992679595947, "learning_rate": 3.015873015873016e-06, "loss": 1.8315, "step": 39 }, { "epoch": 0.019102196752626553, "grad_norm": 4.8906636238098145, "learning_rate": 3.0952380952380957e-06, "loss": 1.0534, "step": 40 }, { "epoch": 0.019579751671442217, "grad_norm": 5.312828540802002, "learning_rate": 3.1746031746031746e-06, "loss": 1.5511, "step": 41 }, { "epoch": 0.02005730659025788, "grad_norm": 6.301405906677246, "learning_rate": 3.2539682539682544e-06, "loss": 1.2797, "step": 42 }, { "epoch": 0.020534861509073542, "grad_norm": 4.789735317230225, "learning_rate": 
3.3333333333333333e-06, "loss": 1.6392, "step": 43 }, { "epoch": 0.021012416427889206, "grad_norm": 5.52883768081665, "learning_rate": 3.412698412698413e-06, "loss": 1.823, "step": 44 }, { "epoch": 0.02148997134670487, "grad_norm": 4.99170446395874, "learning_rate": 3.492063492063492e-06, "loss": 1.9603, "step": 45 }, { "epoch": 0.021967526265520534, "grad_norm": 4.0474853515625, "learning_rate": 3.5714285714285718e-06, "loss": 1.0749, "step": 46 }, { "epoch": 0.022445081184336198, "grad_norm": 5.696434497833252, "learning_rate": 3.6507936507936507e-06, "loss": 2.0317, "step": 47 }, { "epoch": 0.022922636103151862, "grad_norm": 5.886493682861328, "learning_rate": 3.7301587301587305e-06, "loss": 1.6794, "step": 48 }, { "epoch": 0.023400191021967526, "grad_norm": 5.608858108520508, "learning_rate": 3.80952380952381e-06, "loss": 2.1166, "step": 49 }, { "epoch": 0.02387774594078319, "grad_norm": 4.358663558959961, "learning_rate": 3.88888888888889e-06, "loss": 1.251, "step": 50 }, { "epoch": 0.024355300859598854, "grad_norm": 5.156405448913574, "learning_rate": 3.968253968253968e-06, "loss": 1.2668, "step": 51 }, { "epoch": 0.024832855778414518, "grad_norm": 5.062716484069824, "learning_rate": 4.047619047619048e-06, "loss": 1.6216, "step": 52 }, { "epoch": 0.025310410697230182, "grad_norm": 4.701766014099121, "learning_rate": 4.126984126984127e-06, "loss": 0.9103, "step": 53 }, { "epoch": 0.025787965616045846, "grad_norm": 4.0987868309021, "learning_rate": 4.206349206349207e-06, "loss": 0.8618, "step": 54 }, { "epoch": 0.02626552053486151, "grad_norm": 5.392373561859131, "learning_rate": 4.2857142857142855e-06, "loss": 2.0938, "step": 55 }, { "epoch": 0.026743075453677174, "grad_norm": 4.447094917297363, "learning_rate": 4.365079365079366e-06, "loss": 0.8346, "step": 56 }, { "epoch": 0.027220630372492838, "grad_norm": 4.576540946960449, "learning_rate": 4.444444444444444e-06, "loss": 1.257, "step": 57 }, { "epoch": 0.027698185291308502, "grad_norm": 5.824233531951904, 
"learning_rate": 4.523809523809524e-06, "loss": 1.8751, "step": 58 }, { "epoch": 0.028175740210124166, "grad_norm": 4.178110122680664, "learning_rate": 4.603174603174604e-06, "loss": 1.7, "step": 59 }, { "epoch": 0.02865329512893983, "grad_norm": 4.734243392944336, "learning_rate": 4.682539682539683e-06, "loss": 1.345, "step": 60 }, { "epoch": 0.02913085004775549, "grad_norm": 5.72996711730957, "learning_rate": 4.761904761904762e-06, "loss": 1.6658, "step": 61 }, { "epoch": 0.029608404966571154, "grad_norm": 4.523313522338867, "learning_rate": 4.841269841269842e-06, "loss": 1.6422, "step": 62 }, { "epoch": 0.03008595988538682, "grad_norm": 5.694447040557861, "learning_rate": 4.920634920634921e-06, "loss": 1.3574, "step": 63 }, { "epoch": 0.030563514804202482, "grad_norm": 4.136141300201416, "learning_rate": 5e-06, "loss": 1.345, "step": 64 }, { "epoch": 0.031041069723018146, "grad_norm": 5.671857833862305, "learning_rate": 4.999997009183107e-06, "loss": 1.0629, "step": 65 }, { "epoch": 0.03151862464183381, "grad_norm": 5.126098155975342, "learning_rate": 4.999988036739585e-06, "loss": 1.4113, "step": 66 }, { "epoch": 0.03199617956064948, "grad_norm": 4.482530117034912, "learning_rate": 4.9999730826909e-06, "loss": 1.3937, "step": 67 }, { "epoch": 0.03247373447946514, "grad_norm": 4.283955097198486, "learning_rate": 4.999952147072833e-06, "loss": 1.715, "step": 68 }, { "epoch": 0.0329512893982808, "grad_norm": 5.417228698730469, "learning_rate": 4.999925229935476e-06, "loss": 1.4776, "step": 69 }, { "epoch": 0.033428844317096466, "grad_norm": 6.831479072570801, "learning_rate": 4.999892331343231e-06, "loss": 1.4479, "step": 70 }, { "epoch": 0.03390639923591213, "grad_norm": 4.864286422729492, "learning_rate": 4.9998534513748155e-06, "loss": 1.1168, "step": 71 }, { "epoch": 0.034383954154727794, "grad_norm": 4.8280110359191895, "learning_rate": 4.999808590123253e-06, "loss": 1.9099, "step": 72 }, { "epoch": 0.034861509073543455, "grad_norm": 4.651793003082275, 
"learning_rate": 4.9997577476958826e-06, "loss": 1.2463, "step": 73 }, { "epoch": 0.03533906399235912, "grad_norm": 5.205795764923096, "learning_rate": 4.999700924214352e-06, "loss": 1.5833, "step": 74 }, { "epoch": 0.03581661891117478, "grad_norm": 4.962429046630859, "learning_rate": 4.999638119814621e-06, "loss": 1.0308, "step": 75 }, { "epoch": 0.03629417382999045, "grad_norm": 4.903221607208252, "learning_rate": 4.9995693346469565e-06, "loss": 1.1609, "step": 76 }, { "epoch": 0.03677172874880611, "grad_norm": 4.809661865234375, "learning_rate": 4.99949456887594e-06, "loss": 1.8421, "step": 77 }, { "epoch": 0.03724928366762178, "grad_norm": 5.848609924316406, "learning_rate": 4.999413822680459e-06, "loss": 1.7753, "step": 78 }, { "epoch": 0.03772683858643744, "grad_norm": 4.257289886474609, "learning_rate": 4.999327096253711e-06, "loss": 1.466, "step": 79 }, { "epoch": 0.038204393505253106, "grad_norm": 4.985309600830078, "learning_rate": 4.999234389803204e-06, "loss": 1.1358, "step": 80 }, { "epoch": 0.03868194842406877, "grad_norm": 5.229691982269287, "learning_rate": 4.999135703550749e-06, "loss": 0.9994, "step": 81 }, { "epoch": 0.039159503342884434, "grad_norm": 5.800382614135742, "learning_rate": 4.9990310377324715e-06, "loss": 1.3964, "step": 82 }, { "epoch": 0.039637058261700095, "grad_norm": 5.156551361083984, "learning_rate": 4.998920392598799e-06, "loss": 1.2042, "step": 83 }, { "epoch": 0.04011461318051576, "grad_norm": 5.556703090667725, "learning_rate": 4.998803768414467e-06, "loss": 1.5144, "step": 84 }, { "epoch": 0.04059216809933142, "grad_norm": 5.32941198348999, "learning_rate": 4.998681165458516e-06, "loss": 0.9003, "step": 85 }, { "epoch": 0.041069723018147083, "grad_norm": 5.097962379455566, "learning_rate": 4.9985525840242944e-06, "loss": 1.5078, "step": 86 }, { "epoch": 0.04154727793696275, "grad_norm": 5.125922203063965, "learning_rate": 4.998418024419451e-06, "loss": 1.7832, "step": 87 }, { "epoch": 0.04202483285577841, "grad_norm": 
5.344782829284668, "learning_rate": 4.998277486965942e-06, "loss": 1.6419, "step": 88 }, { "epoch": 0.04250238777459408, "grad_norm": 4.729647159576416, "learning_rate": 4.9981309720000235e-06, "loss": 1.1209, "step": 89 }, { "epoch": 0.04297994269340974, "grad_norm": 4.224547863006592, "learning_rate": 4.997978479872255e-06, "loss": 1.2076, "step": 90 }, { "epoch": 0.04345749761222541, "grad_norm": 4.8836445808410645, "learning_rate": 4.997820010947498e-06, "loss": 0.8808, "step": 91 }, { "epoch": 0.04393505253104107, "grad_norm": 5.1383490562438965, "learning_rate": 4.997655565604914e-06, "loss": 2.236, "step": 92 }, { "epoch": 0.044412607449856735, "grad_norm": 5.865267753601074, "learning_rate": 4.997485144237963e-06, "loss": 0.9929, "step": 93 }, { "epoch": 0.044890162368672396, "grad_norm": 4.785399913787842, "learning_rate": 4.9973087472544056e-06, "loss": 1.1141, "step": 94 }, { "epoch": 0.04536771728748806, "grad_norm": 5.55067777633667, "learning_rate": 4.997126375076297e-06, "loss": 1.3387, "step": 95 }, { "epoch": 0.045845272206303724, "grad_norm": 4.873939037322998, "learning_rate": 4.9969380281399915e-06, "loss": 1.0762, "step": 96 }, { "epoch": 0.04632282712511939, "grad_norm": 5.75584602355957, "learning_rate": 4.996743706896138e-06, "loss": 1.5257, "step": 97 }, { "epoch": 0.04680038204393505, "grad_norm": 4.390905857086182, "learning_rate": 4.996543411809679e-06, "loss": 1.7533, "step": 98 }, { "epoch": 0.04727793696275072, "grad_norm": 5.087616920471191, "learning_rate": 4.9963371433598514e-06, "loss": 1.5164, "step": 99 }, { "epoch": 0.04775549188156638, "grad_norm": 3.521540641784668, "learning_rate": 4.996124902040186e-06, "loss": 0.6797, "step": 100 }, { "epoch": 0.04823304680038205, "grad_norm": 7.4073052406311035, "learning_rate": 4.995906688358502e-06, "loss": 1.4551, "step": 101 }, { "epoch": 0.04871060171919771, "grad_norm": 4.6319260597229, "learning_rate": 4.995682502836907e-06, "loss": 1.9687, "step": 102 }, { "epoch": 
0.04918815663801337, "grad_norm": 4.386704921722412, "learning_rate": 4.995452346011802e-06, "loss": 1.708, "step": 103 }, { "epoch": 0.049665711556829036, "grad_norm": 4.961048603057861, "learning_rate": 4.995216218433871e-06, "loss": 1.6005, "step": 104 }, { "epoch": 0.050143266475644696, "grad_norm": 4.643105506896973, "learning_rate": 4.994974120668086e-06, "loss": 1.3241, "step": 105 }, { "epoch": 0.050620821394460364, "grad_norm": 4.699398994445801, "learning_rate": 4.994726053293703e-06, "loss": 1.7865, "step": 106 }, { "epoch": 0.051098376313276024, "grad_norm": 5.065433979034424, "learning_rate": 4.994472016904261e-06, "loss": 1.7091, "step": 107 }, { "epoch": 0.05157593123209169, "grad_norm": 4.726767063140869, "learning_rate": 4.994212012107581e-06, "loss": 1.9314, "step": 108 }, { "epoch": 0.05205348615090735, "grad_norm": 3.989816904067993, "learning_rate": 4.993946039525766e-06, "loss": 0.8678, "step": 109 }, { "epoch": 0.05253104106972302, "grad_norm": 4.186660289764404, "learning_rate": 4.993674099795194e-06, "loss": 0.8135, "step": 110 }, { "epoch": 0.05300859598853868, "grad_norm": 4.623104572296143, "learning_rate": 4.993396193566523e-06, "loss": 1.3998, "step": 111 }, { "epoch": 0.05348615090735435, "grad_norm": 4.699170112609863, "learning_rate": 4.9931123215046874e-06, "loss": 1.3495, "step": 112 }, { "epoch": 0.05396370582617001, "grad_norm": 5.039900779724121, "learning_rate": 4.992822484288895e-06, "loss": 1.6361, "step": 113 }, { "epoch": 0.054441260744985676, "grad_norm": 4.3773603439331055, "learning_rate": 4.9925266826126234e-06, "loss": 1.3456, "step": 114 }, { "epoch": 0.054918815663801336, "grad_norm": 3.65704345703125, "learning_rate": 4.992224917183626e-06, "loss": 0.9858, "step": 115 }, { "epoch": 0.055396370582617004, "grad_norm": 4.058018684387207, "learning_rate": 4.9919171887239215e-06, "loss": 1.742, "step": 116 }, { "epoch": 0.055873925501432664, "grad_norm": 4.8775529861450195, "learning_rate": 4.991603497969798e-06, 
"loss": 0.826, "step": 117 }, { "epoch": 0.05635148042024833, "grad_norm": 4.442937850952148, "learning_rate": 4.991283845671809e-06, "loss": 1.6756, "step": 118 }, { "epoch": 0.05682903533906399, "grad_norm": 5.879728317260742, "learning_rate": 4.990958232594771e-06, "loss": 1.0554, "step": 119 }, { "epoch": 0.05730659025787966, "grad_norm": 4.598890781402588, "learning_rate": 4.990626659517764e-06, "loss": 1.4488, "step": 120 }, { "epoch": 0.05778414517669532, "grad_norm": 5.324604034423828, "learning_rate": 4.990289127234126e-06, "loss": 1.3197, "step": 121 }, { "epoch": 0.05826170009551098, "grad_norm": 6.066585063934326, "learning_rate": 4.989945636551457e-06, "loss": 2.241, "step": 122 }, { "epoch": 0.05873925501432665, "grad_norm": 5.337082862854004, "learning_rate": 4.98959618829161e-06, "loss": 1.1494, "step": 123 }, { "epoch": 0.05921680993314231, "grad_norm": 4.894677639007568, "learning_rate": 4.989240783290694e-06, "loss": 2.1218, "step": 124 }, { "epoch": 0.059694364851957976, "grad_norm": 3.927826404571533, "learning_rate": 4.98887942239907e-06, "loss": 1.3494, "step": 125 }, { "epoch": 0.06017191977077364, "grad_norm": 4.503044605255127, "learning_rate": 4.988512106481349e-06, "loss": 1.3726, "step": 126 }, { "epoch": 0.060649474689589304, "grad_norm": 5.601580619812012, "learning_rate": 4.988138836416391e-06, "loss": 1.9082, "step": 127 }, { "epoch": 0.061127029608404965, "grad_norm": 4.234355449676514, "learning_rate": 4.987759613097301e-06, "loss": 1.0469, "step": 128 }, { "epoch": 0.06160458452722063, "grad_norm": 5.523721218109131, "learning_rate": 4.9873744374314305e-06, "loss": 1.1237, "step": 129 }, { "epoch": 0.06208213944603629, "grad_norm": 4.663069248199463, "learning_rate": 4.9869833103403705e-06, "loss": 1.6524, "step": 130 }, { "epoch": 0.06255969436485195, "grad_norm": 5.6707892417907715, "learning_rate": 4.986586232759954e-06, "loss": 1.5455, "step": 131 }, { "epoch": 0.06303724928366762, "grad_norm": 3.6976113319396973, 
"learning_rate": 4.986183205640247e-06, "loss": 1.147, "step": 132 }, { "epoch": 0.06351480420248329, "grad_norm": 4.708729267120361, "learning_rate": 4.985774229945557e-06, "loss": 1.9132, "step": 133 }, { "epoch": 0.06399235912129896, "grad_norm": 4.940396308898926, "learning_rate": 4.98535930665442e-06, "loss": 1.5065, "step": 134 }, { "epoch": 0.06446991404011461, "grad_norm": 5.518232345581055, "learning_rate": 4.9849384367596035e-06, "loss": 1.1002, "step": 135 }, { "epoch": 0.06494746895893028, "grad_norm": 5.127005100250244, "learning_rate": 4.984511621268103e-06, "loss": 1.3115, "step": 136 }, { "epoch": 0.06542502387774594, "grad_norm": 4.6984148025512695, "learning_rate": 4.98407886120114e-06, "loss": 0.6178, "step": 137 }, { "epoch": 0.0659025787965616, "grad_norm": 5.926209926605225, "learning_rate": 4.9836401575941605e-06, "loss": 1.801, "step": 138 }, { "epoch": 0.06638013371537727, "grad_norm": 5.292475700378418, "learning_rate": 4.983195511496829e-06, "loss": 1.3552, "step": 139 }, { "epoch": 0.06685768863419293, "grad_norm": 5.853498458862305, "learning_rate": 4.982744923973031e-06, "loss": 1.1528, "step": 140 }, { "epoch": 0.0673352435530086, "grad_norm": 5.3796067237854, "learning_rate": 4.982288396100864e-06, "loss": 1.5831, "step": 141 }, { "epoch": 0.06781279847182425, "grad_norm": 5.46683406829834, "learning_rate": 4.981825928972643e-06, "loss": 1.3005, "step": 142 }, { "epoch": 0.06829035339063992, "grad_norm": 5.002374649047852, "learning_rate": 4.981357523694892e-06, "loss": 1.5159, "step": 143 }, { "epoch": 0.06876790830945559, "grad_norm": 4.638329029083252, "learning_rate": 4.980883181388341e-06, "loss": 1.7726, "step": 144 }, { "epoch": 0.06924546322827126, "grad_norm": 5.831549167633057, "learning_rate": 4.980402903187927e-06, "loss": 1.236, "step": 145 }, { "epoch": 0.06972301814708691, "grad_norm": 5.163804531097412, "learning_rate": 4.97991669024279e-06, "loss": 2.5005, "step": 146 }, { "epoch": 0.07020057306590258, "grad_norm": 
5.958655834197998, "learning_rate": 4.979424543716268e-06, "loss": 0.9701, "step": 147 }, { "epoch": 0.07067812798471824, "grad_norm": 4.583427429199219, "learning_rate": 4.978926464785899e-06, "loss": 1.5727, "step": 148 }, { "epoch": 0.07115568290353391, "grad_norm": 4.3044023513793945, "learning_rate": 4.978422454643412e-06, "loss": 1.517, "step": 149 }, { "epoch": 0.07163323782234957, "grad_norm": 4.533758163452148, "learning_rate": 4.977912514494728e-06, "loss": 1.9422, "step": 150 }, { "epoch": 0.07211079274116523, "grad_norm": 4.9447197914123535, "learning_rate": 4.977396645559959e-06, "loss": 1.6424, "step": 151 }, { "epoch": 0.0725883476599809, "grad_norm": 5.7221455574035645, "learning_rate": 4.976874849073399e-06, "loss": 1.9361, "step": 152 }, { "epoch": 0.07306590257879657, "grad_norm": 4.960414409637451, "learning_rate": 4.9763471262835275e-06, "loss": 1.4002, "step": 153 }, { "epoch": 0.07354345749761222, "grad_norm": 5.1127610206604, "learning_rate": 4.975813478453001e-06, "loss": 1.4899, "step": 154 }, { "epoch": 0.07402101241642789, "grad_norm": 5.4191718101501465, "learning_rate": 4.975273906858654e-06, "loss": 1.0944, "step": 155 }, { "epoch": 0.07449856733524356, "grad_norm": 5.274662494659424, "learning_rate": 4.974728412791495e-06, "loss": 1.5013, "step": 156 }, { "epoch": 0.07497612225405921, "grad_norm": 5.863588809967041, "learning_rate": 4.974176997556702e-06, "loss": 1.2969, "step": 157 }, { "epoch": 0.07545367717287488, "grad_norm": 4.529082298278809, "learning_rate": 4.973619662473621e-06, "loss": 1.2704, "step": 158 }, { "epoch": 0.07593123209169055, "grad_norm": 4.601380348205566, "learning_rate": 4.973056408875762e-06, "loss": 1.2769, "step": 159 }, { "epoch": 0.07640878701050621, "grad_norm": 5.114466667175293, "learning_rate": 4.9724872381107935e-06, "loss": 1.8818, "step": 160 }, { "epoch": 0.07688634192932187, "grad_norm": 4.554062843322754, "learning_rate": 4.971912151540546e-06, "loss": 1.1194, "step": 161 }, { "epoch": 
0.07736389684813753, "grad_norm": 5.810101509094238, "learning_rate": 4.971331150541003e-06, "loss": 1.7968, "step": 162 }, { "epoch": 0.0778414517669532, "grad_norm": 6.099402904510498, "learning_rate": 4.970744236502297e-06, "loss": 1.2298, "step": 163 }, { "epoch": 0.07831900668576887, "grad_norm": 4.690209865570068, "learning_rate": 4.970151410828711e-06, "loss": 1.175, "step": 164 }, { "epoch": 0.07879656160458452, "grad_norm": 4.6680145263671875, "learning_rate": 4.96955267493867e-06, "loss": 1.4182, "step": 165 }, { "epoch": 0.07927411652340019, "grad_norm": 4.70673131942749, "learning_rate": 4.968948030264743e-06, "loss": 1.0488, "step": 166 }, { "epoch": 0.07975167144221586, "grad_norm": 4.7446064949035645, "learning_rate": 4.968337478253634e-06, "loss": 1.5643, "step": 167 }, { "epoch": 0.08022922636103152, "grad_norm": 4.622127056121826, "learning_rate": 4.967721020366183e-06, "loss": 0.7721, "step": 168 }, { "epoch": 0.08070678127984718, "grad_norm": 6.232114315032959, "learning_rate": 4.967098658077361e-06, "loss": 2.0223, "step": 169 }, { "epoch": 0.08118433619866285, "grad_norm": 5.461650848388672, "learning_rate": 4.966470392876264e-06, "loss": 2.379, "step": 170 }, { "epoch": 0.08166189111747851, "grad_norm": 5.0958685874938965, "learning_rate": 4.965836226266114e-06, "loss": 1.6191, "step": 171 }, { "epoch": 0.08213944603629417, "grad_norm": 4.17983865737915, "learning_rate": 4.96519615976425e-06, "loss": 1.7952, "step": 172 }, { "epoch": 0.08261700095510983, "grad_norm": 5.576516628265381, "learning_rate": 4.964550194902133e-06, "loss": 1.7162, "step": 173 }, { "epoch": 0.0830945558739255, "grad_norm": 5.472452163696289, "learning_rate": 4.963898333225328e-06, "loss": 1.4238, "step": 174 }, { "epoch": 0.08357211079274117, "grad_norm": 5.126534938812256, "learning_rate": 4.96324057629352e-06, "loss": 1.1392, "step": 175 }, { "epoch": 0.08404966571155682, "grad_norm": 5.099165439605713, "learning_rate": 4.962576925680489e-06, "loss": 1.5051, 
"step": 176 }, { "epoch": 0.08452722063037249, "grad_norm": 7.106988430023193, "learning_rate": 4.961907382974122e-06, "loss": 1.4086, "step": 177 }, { "epoch": 0.08500477554918816, "grad_norm": 5.618509769439697, "learning_rate": 4.961231949776404e-06, "loss": 2.0452, "step": 178 }, { "epoch": 0.08548233046800383, "grad_norm": 4.534405708312988, "learning_rate": 4.960550627703411e-06, "loss": 1.6481, "step": 179 }, { "epoch": 0.08595988538681948, "grad_norm": 5.307653427124023, "learning_rate": 4.959863418385312e-06, "loss": 1.4031, "step": 180 }, { "epoch": 0.08643744030563515, "grad_norm": 5.164826393127441, "learning_rate": 4.959170323466359e-06, "loss": 1.2122, "step": 181 }, { "epoch": 0.08691499522445081, "grad_norm": 4.44277286529541, "learning_rate": 4.95847134460489e-06, "loss": 1.4058, "step": 182 }, { "epoch": 0.08739255014326648, "grad_norm": 5.056310653686523, "learning_rate": 4.957766483473319e-06, "loss": 1.7548, "step": 183 }, { "epoch": 0.08787010506208214, "grad_norm": 5.1423563957214355, "learning_rate": 4.957055741758133e-06, "loss": 1.1351, "step": 184 }, { "epoch": 0.0883476599808978, "grad_norm": 4.953139781951904, "learning_rate": 4.956339121159892e-06, "loss": 1.1075, "step": 185 }, { "epoch": 0.08882521489971347, "grad_norm": 5.1795759201049805, "learning_rate": 4.95561662339322e-06, "loss": 2.0896, "step": 186 }, { "epoch": 0.08930276981852914, "grad_norm": 4.626502513885498, "learning_rate": 4.954888250186805e-06, "loss": 1.3992, "step": 187 }, { "epoch": 0.08978032473734479, "grad_norm": 4.689163684844971, "learning_rate": 4.95415400328339e-06, "loss": 1.1589, "step": 188 }, { "epoch": 0.09025787965616046, "grad_norm": 5.338560104370117, "learning_rate": 4.953413884439774e-06, "loss": 1.515, "step": 189 }, { "epoch": 0.09073543457497613, "grad_norm": 5.383810043334961, "learning_rate": 4.952667895426806e-06, "loss": 1.9171, "step": 190 }, { "epoch": 0.09121298949379178, "grad_norm": 5.899359226226807, "learning_rate": 
4.951916038029378e-06, "loss": 1.3597, "step": 191 }, { "epoch": 0.09169054441260745, "grad_norm": 5.346579551696777, "learning_rate": 4.9511583140464245e-06, "loss": 1.6146, "step": 192 }, { "epoch": 0.09216809933142311, "grad_norm": 5.365161895751953, "learning_rate": 4.950394725290918e-06, "loss": 1.7025, "step": 193 }, { "epoch": 0.09264565425023878, "grad_norm": 6.30120849609375, "learning_rate": 4.949625273589859e-06, "loss": 1.329, "step": 194 }, { "epoch": 0.09312320916905444, "grad_norm": 4.803522109985352, "learning_rate": 4.948849960784281e-06, "loss": 1.3173, "step": 195 }, { "epoch": 0.0936007640878701, "grad_norm": 4.635858535766602, "learning_rate": 4.948068788729238e-06, "loss": 1.6602, "step": 196 }, { "epoch": 0.09407831900668577, "grad_norm": 3.8177881240844727, "learning_rate": 4.947281759293804e-06, "loss": 0.823, "step": 197 }, { "epoch": 0.09455587392550144, "grad_norm": 4.487942695617676, "learning_rate": 4.946488874361068e-06, "loss": 1.3905, "step": 198 }, { "epoch": 0.09503342884431709, "grad_norm": 4.421664714813232, "learning_rate": 4.945690135828129e-06, "loss": 1.4032, "step": 199 }, { "epoch": 0.09551098376313276, "grad_norm": 4.888106346130371, "learning_rate": 4.944885545606093e-06, "loss": 1.4917, "step": 200 }, { "epoch": 0.09598853868194843, "grad_norm": 4.206897735595703, "learning_rate": 4.9440751056200625e-06, "loss": 1.2527, "step": 201 }, { "epoch": 0.0964660936007641, "grad_norm": 5.189200401306152, "learning_rate": 4.9432588178091415e-06, "loss": 0.9863, "step": 202 }, { "epoch": 0.09694364851957975, "grad_norm": 4.425311088562012, "learning_rate": 4.942436684126424e-06, "loss": 1.6279, "step": 203 }, { "epoch": 0.09742120343839542, "grad_norm": 4.731985569000244, "learning_rate": 4.94160870653899e-06, "loss": 1.7609, "step": 204 }, { "epoch": 0.09789875835721108, "grad_norm": 4.9980387687683105, "learning_rate": 4.940774887027904e-06, "loss": 1.54, "step": 205 }, { "epoch": 0.09837631327602674, "grad_norm": 
4.66156005859375, "learning_rate": 4.9399352275882075e-06, "loss": 1.0773, "step": 206 }, { "epoch": 0.0988538681948424, "grad_norm": 5.001552581787109, "learning_rate": 4.939089730228913e-06, "loss": 1.5568, "step": 207 }, { "epoch": 0.09933142311365807, "grad_norm": 4.227767467498779, "learning_rate": 4.938238396973003e-06, "loss": 1.2779, "step": 208 }, { "epoch": 0.09980897803247374, "grad_norm": 4.471835613250732, "learning_rate": 4.937381229857425e-06, "loss": 1.8588, "step": 209 }, { "epoch": 0.10028653295128939, "grad_norm": 5.748694896697998, "learning_rate": 4.936518230933081e-06, "loss": 1.1119, "step": 210 }, { "epoch": 0.10076408787010506, "grad_norm": 4.334205627441406, "learning_rate": 4.935649402264829e-06, "loss": 1.2238, "step": 211 }, { "epoch": 0.10124164278892073, "grad_norm": 4.055333614349365, "learning_rate": 4.934774745931475e-06, "loss": 0.8357, "step": 212 }, { "epoch": 0.1017191977077364, "grad_norm": 4.9426398277282715, "learning_rate": 4.933894264025768e-06, "loss": 1.4251, "step": 213 }, { "epoch": 0.10219675262655205, "grad_norm": 5.136143207550049, "learning_rate": 4.933007958654398e-06, "loss": 1.4529, "step": 214 }, { "epoch": 0.10267430754536772, "grad_norm": 5.425299167633057, "learning_rate": 4.932115831937984e-06, "loss": 2.3587, "step": 215 }, { "epoch": 0.10315186246418338, "grad_norm": 5.519898414611816, "learning_rate": 4.9312178860110785e-06, "loss": 1.6743, "step": 216 }, { "epoch": 0.10362941738299905, "grad_norm": 4.8241047859191895, "learning_rate": 4.930314123022153e-06, "loss": 0.9608, "step": 217 }, { "epoch": 0.1041069723018147, "grad_norm": 4.091524124145508, "learning_rate": 4.9294045451336e-06, "loss": 1.3433, "step": 218 }, { "epoch": 0.10458452722063037, "grad_norm": 4.3517537117004395, "learning_rate": 4.928489154521724e-06, "loss": 1.593, "step": 219 }, { "epoch": 0.10506208213944604, "grad_norm": 4.826793670654297, "learning_rate": 4.927567953376738e-06, "loss": 1.2569, "step": 220 }, { "epoch": 
0.1055396370582617, "grad_norm": 4.634960174560547, "learning_rate": 4.926640943902759e-06, "loss": 1.457, "step": 221 }, { "epoch": 0.10601719197707736, "grad_norm": 4.15624475479126, "learning_rate": 4.925708128317796e-06, "loss": 0.7716, "step": 222 }, { "epoch": 0.10649474689589303, "grad_norm": 4.039787292480469, "learning_rate": 4.924769508853754e-06, "loss": 1.4869, "step": 223 }, { "epoch": 0.1069723018147087, "grad_norm": 4.451485633850098, "learning_rate": 4.923825087756426e-06, "loss": 1.4185, "step": 224 }, { "epoch": 0.10744985673352435, "grad_norm": 5.053978443145752, "learning_rate": 4.922874867285484e-06, "loss": 0.6975, "step": 225 }, { "epoch": 0.10792741165234002, "grad_norm": 4.334681034088135, "learning_rate": 4.921918849714475e-06, "loss": 1.7638, "step": 226 }, { "epoch": 0.10840496657115568, "grad_norm": 5.203692436218262, "learning_rate": 4.9209570373308205e-06, "loss": 1.1983, "step": 227 }, { "epoch": 0.10888252148997135, "grad_norm": 4.874248027801514, "learning_rate": 4.919989432435801e-06, "loss": 1.3461, "step": 228 }, { "epoch": 0.109360076408787, "grad_norm": 4.832522869110107, "learning_rate": 4.9190160373445614e-06, "loss": 1.5702, "step": 229 }, { "epoch": 0.10983763132760267, "grad_norm": 5.865232944488525, "learning_rate": 4.9180368543861e-06, "loss": 0.9399, "step": 230 }, { "epoch": 0.11031518624641834, "grad_norm": 4.2580766677856445, "learning_rate": 4.9170518859032604e-06, "loss": 1.1813, "step": 231 }, { "epoch": 0.11079274116523401, "grad_norm": 4.549014568328857, "learning_rate": 4.916061134252731e-06, "loss": 1.7703, "step": 232 }, { "epoch": 0.11127029608404966, "grad_norm": 5.065488815307617, "learning_rate": 4.915064601805038e-06, "loss": 1.1812, "step": 233 }, { "epoch": 0.11174785100286533, "grad_norm": 4.995255947113037, "learning_rate": 4.914062290944539e-06, "loss": 1.5798, "step": 234 }, { "epoch": 0.112225405921681, "grad_norm": 5.23259162902832, "learning_rate": 4.913054204069414e-06, "loss": 1.6947, "step": 
235 }, { "epoch": 0.11270296084049666, "grad_norm": 4.804322719573975, "learning_rate": 4.9120403435916695e-06, "loss": 1.4953, "step": 236 }, { "epoch": 0.11318051575931232, "grad_norm": 4.693699359893799, "learning_rate": 4.911020711937119e-06, "loss": 1.0236, "step": 237 }, { "epoch": 0.11365807067812798, "grad_norm": 4.6219706535339355, "learning_rate": 4.909995311545389e-06, "loss": 1.5154, "step": 238 }, { "epoch": 0.11413562559694365, "grad_norm": 4.397285461425781, "learning_rate": 4.908964144869907e-06, "loss": 1.2654, "step": 239 }, { "epoch": 0.11461318051575932, "grad_norm": 4.666012763977051, "learning_rate": 4.907927214377898e-06, "loss": 1.0518, "step": 240 }, { "epoch": 0.11509073543457497, "grad_norm": 5.979899883270264, "learning_rate": 4.906884522550377e-06, "loss": 1.1872, "step": 241 }, { "epoch": 0.11556829035339064, "grad_norm": 5.338172912597656, "learning_rate": 4.905836071882145e-06, "loss": 1.0138, "step": 242 }, { "epoch": 0.11604584527220631, "grad_norm": 5.004177093505859, "learning_rate": 4.904781864881781e-06, "loss": 1.8652, "step": 243 }, { "epoch": 0.11652340019102196, "grad_norm": 4.753640651702881, "learning_rate": 4.903721904071636e-06, "loss": 1.9782, "step": 244 }, { "epoch": 0.11700095510983763, "grad_norm": 5.038556098937988, "learning_rate": 4.90265619198783e-06, "loss": 1.0358, "step": 245 }, { "epoch": 0.1174785100286533, "grad_norm": 6.034836292266846, "learning_rate": 4.901584731180242e-06, "loss": 0.8058, "step": 246 }, { "epoch": 0.11795606494746896, "grad_norm": 4.519852638244629, "learning_rate": 4.9005075242125075e-06, "loss": 1.4624, "step": 247 }, { "epoch": 0.11843361986628462, "grad_norm": 5.028530597686768, "learning_rate": 4.899424573662009e-06, "loss": 1.6115, "step": 248 }, { "epoch": 0.11891117478510028, "grad_norm": 5.614325523376465, "learning_rate": 4.898335882119871e-06, "loss": 1.3893, "step": 249 }, { "epoch": 0.11938872970391595, "grad_norm": 5.318653583526611, "learning_rate": 
4.897241452190957e-06, "loss": 0.8608, "step": 250 }, { "epoch": 0.11986628462273162, "grad_norm": 5.114054203033447, "learning_rate": 4.896141286493857e-06, "loss": 1.2027, "step": 251 }, { "epoch": 0.12034383954154727, "grad_norm": 4.785380840301514, "learning_rate": 4.8950353876608884e-06, "loss": 1.4431, "step": 252 }, { "epoch": 0.12082139446036294, "grad_norm": 4.833309650421143, "learning_rate": 4.893923758338081e-06, "loss": 1.3915, "step": 253 }, { "epoch": 0.12129894937917861, "grad_norm": 4.637689590454102, "learning_rate": 4.892806401185181e-06, "loss": 1.7677, "step": 254 }, { "epoch": 0.12177650429799428, "grad_norm": 5.712728977203369, "learning_rate": 4.8916833188756355e-06, "loss": 1.0804, "step": 255 }, { "epoch": 0.12225405921680993, "grad_norm": 4.799901962280273, "learning_rate": 4.890554514096592e-06, "loss": 2.0054, "step": 256 }, { "epoch": 0.1227316141356256, "grad_norm": 5.298776149749756, "learning_rate": 4.889419989548889e-06, "loss": 1.684, "step": 257 }, { "epoch": 0.12320916905444126, "grad_norm": 4.7607221603393555, "learning_rate": 4.888279747947051e-06, "loss": 1.1664, "step": 258 }, { "epoch": 0.12368672397325692, "grad_norm": 4.289896011352539, "learning_rate": 4.887133792019281e-06, "loss": 1.5862, "step": 259 }, { "epoch": 0.12416427889207259, "grad_norm": 5.622129917144775, "learning_rate": 4.8859821245074535e-06, "loss": 1.6711, "step": 260 }, { "epoch": 0.12464183381088825, "grad_norm": 5.613969326019287, "learning_rate": 4.884824748167111e-06, "loss": 1.5972, "step": 261 }, { "epoch": 0.1251193887297039, "grad_norm": 4.333314418792725, "learning_rate": 4.883661665767454e-06, "loss": 1.5716, "step": 262 }, { "epoch": 0.1255969436485196, "grad_norm": 4.791656017303467, "learning_rate": 4.882492880091335e-06, "loss": 2.1387, "step": 263 }, { "epoch": 0.12607449856733524, "grad_norm": 4.77053689956665, "learning_rate": 4.881318393935253e-06, "loss": 1.0895, "step": 264 }, { "epoch": 0.1265520534861509, "grad_norm": 
4.1351518630981445, "learning_rate": 4.880138210109349e-06, "loss": 1.454, "step": 265 }, { "epoch": 0.12702960840496658, "grad_norm": 5.050997257232666, "learning_rate": 4.87895233143739e-06, "loss": 0.9877, "step": 266 }, { "epoch": 0.12750716332378223, "grad_norm": 5.589963436126709, "learning_rate": 4.877760760756776e-06, "loss": 1.5021, "step": 267 }, { "epoch": 0.1279847182425979, "grad_norm": 4.603752136230469, "learning_rate": 4.876563500918521e-06, "loss": 1.5854, "step": 268 }, { "epoch": 0.12846227316141356, "grad_norm": 5.326108932495117, "learning_rate": 4.8753605547872525e-06, "loss": 1.5056, "step": 269 }, { "epoch": 0.12893982808022922, "grad_norm": 4.36484432220459, "learning_rate": 4.874151925241206e-06, "loss": 1.8193, "step": 270 }, { "epoch": 0.1294173829990449, "grad_norm": 5.204397678375244, "learning_rate": 4.872937615172211e-06, "loss": 1.2815, "step": 271 }, { "epoch": 0.12989493791786055, "grad_norm": 3.917996883392334, "learning_rate": 4.871717627485693e-06, "loss": 0.821, "step": 272 }, { "epoch": 0.1303724928366762, "grad_norm": 4.995541572570801, "learning_rate": 4.870491965100658e-06, "loss": 1.2085, "step": 273 }, { "epoch": 0.1308500477554919, "grad_norm": 5.350761890411377, "learning_rate": 4.869260630949691e-06, "loss": 1.4301, "step": 274 }, { "epoch": 0.13132760267430754, "grad_norm": 5.659578800201416, "learning_rate": 4.868023627978949e-06, "loss": 1.537, "step": 275 }, { "epoch": 0.1318051575931232, "grad_norm": 6.485449314117432, "learning_rate": 4.866780959148153e-06, "loss": 1.0793, "step": 276 }, { "epoch": 0.13228271251193888, "grad_norm": 5.391608715057373, "learning_rate": 4.865532627430577e-06, "loss": 1.4805, "step": 277 }, { "epoch": 0.13276026743075453, "grad_norm": 5.541451454162598, "learning_rate": 4.864278635813046e-06, "loss": 1.775, "step": 278 }, { "epoch": 0.1332378223495702, "grad_norm": 4.753811836242676, "learning_rate": 4.863018987295927e-06, "loss": 1.0044, "step": 279 }, { "epoch": 
0.13371537726838587, "grad_norm": 5.348643779754639, "learning_rate": 4.861753684893126e-06, "loss": 0.9305, "step": 280 }, { "epoch": 0.13419293218720152, "grad_norm": 4.054960250854492, "learning_rate": 4.860482731632069e-06, "loss": 1.2165, "step": 281 }, { "epoch": 0.1346704871060172, "grad_norm": 4.728293418884277, "learning_rate": 4.85920613055371e-06, "loss": 2.132, "step": 282 }, { "epoch": 0.13514804202483285, "grad_norm": 6.922840118408203, "learning_rate": 4.85792388471251e-06, "loss": 1.2579, "step": 283 }, { "epoch": 0.1356255969436485, "grad_norm": 5.142256736755371, "learning_rate": 4.856635997176442e-06, "loss": 1.9745, "step": 284 }, { "epoch": 0.1361031518624642, "grad_norm": 5.549637794494629, "learning_rate": 4.855342471026973e-06, "loss": 1.4383, "step": 285 }, { "epoch": 0.13658070678127984, "grad_norm": 4.780221462249756, "learning_rate": 4.854043309359063e-06, "loss": 1.1805, "step": 286 }, { "epoch": 0.13705826170009552, "grad_norm": 3.9922935962677, "learning_rate": 4.852738515281156e-06, "loss": 1.7544, "step": 287 }, { "epoch": 0.13753581661891118, "grad_norm": 5.544539928436279, "learning_rate": 4.851428091915173e-06, "loss": 1.0609, "step": 288 }, { "epoch": 0.13801337153772683, "grad_norm": 4.3900370597839355, "learning_rate": 4.850112042396501e-06, "loss": 1.2773, "step": 289 }, { "epoch": 0.1384909264565425, "grad_norm": 5.071688652038574, "learning_rate": 4.848790369873992e-06, "loss": 1.0565, "step": 290 }, { "epoch": 0.13896848137535817, "grad_norm": 4.534339427947998, "learning_rate": 4.84746307750995e-06, "loss": 1.3628, "step": 291 }, { "epoch": 0.13944603629417382, "grad_norm": 4.603456497192383, "learning_rate": 4.846130168480126e-06, "loss": 1.9501, "step": 292 }, { "epoch": 0.1399235912129895, "grad_norm": 5.039694786071777, "learning_rate": 4.844791645973707e-06, "loss": 1.3614, "step": 293 }, { "epoch": 0.14040114613180515, "grad_norm": 5.547624111175537, "learning_rate": 4.843447513193319e-06, "loss": 1.634, "step": 294 
}, { "epoch": 0.1408787010506208, "grad_norm": 5.199423313140869, "learning_rate": 4.842097773355001e-06, "loss": 1.4479, "step": 295 }, { "epoch": 0.1413562559694365, "grad_norm": 4.512908935546875, "learning_rate": 4.840742429688216e-06, "loss": 1.4555, "step": 296 }, { "epoch": 0.14183381088825214, "grad_norm": 5.380749702453613, "learning_rate": 4.839381485435829e-06, "loss": 1.7777, "step": 297 }, { "epoch": 0.14231136580706782, "grad_norm": 4.400062561035156, "learning_rate": 4.8380149438541105e-06, "loss": 1.0243, "step": 298 }, { "epoch": 0.14278892072588348, "grad_norm": 4.837282657623291, "learning_rate": 4.8366428082127195e-06, "loss": 1.7497, "step": 299 }, { "epoch": 0.14326647564469913, "grad_norm": 5.005699157714844, "learning_rate": 4.835265081794702e-06, "loss": 1.9338, "step": 300 }, { "epoch": 0.1437440305635148, "grad_norm": 3.5950236320495605, "learning_rate": 4.833881767896479e-06, "loss": 0.8907, "step": 301 }, { "epoch": 0.14422158548233047, "grad_norm": 5.288702487945557, "learning_rate": 4.832492869827843e-06, "loss": 1.0847, "step": 302 }, { "epoch": 0.14469914040114612, "grad_norm": 6.165022850036621, "learning_rate": 4.831098390911944e-06, "loss": 1.7478, "step": 303 }, { "epoch": 0.1451766953199618, "grad_norm": 5.677140712738037, "learning_rate": 4.829698334485288e-06, "loss": 1.0826, "step": 304 }, { "epoch": 0.14565425023877746, "grad_norm": 3.914092779159546, "learning_rate": 4.828292703897725e-06, "loss": 1.2749, "step": 305 }, { "epoch": 0.14613180515759314, "grad_norm": 5.351346015930176, "learning_rate": 4.826881502512441e-06, "loss": 1.5014, "step": 306 }, { "epoch": 0.1466093600764088, "grad_norm": 4.246301651000977, "learning_rate": 4.8254647337059525e-06, "loss": 1.5642, "step": 307 }, { "epoch": 0.14708691499522444, "grad_norm": 5.288384437561035, "learning_rate": 4.8240424008680965e-06, "loss": 1.262, "step": 308 }, { "epoch": 0.14756446991404013, "grad_norm": 5.253522872924805, "learning_rate": 4.822614507402023e-06, 
"loss": 1.1455, "step": 309 }, { "epoch": 0.14804202483285578, "grad_norm": 5.358833312988281, "learning_rate": 4.821181056724185e-06, "loss": 1.7043, "step": 310 }, { "epoch": 0.14851957975167143, "grad_norm": 5.538655757904053, "learning_rate": 4.819742052264335e-06, "loss": 1.6029, "step": 311 }, { "epoch": 0.1489971346704871, "grad_norm": 5.242074489593506, "learning_rate": 4.81829749746551e-06, "loss": 1.6672, "step": 312 }, { "epoch": 0.14947468958930277, "grad_norm": 5.227020740509033, "learning_rate": 4.816847395784031e-06, "loss": 1.2486, "step": 313 }, { "epoch": 0.14995224450811842, "grad_norm": 4.6335225105285645, "learning_rate": 4.815391750689489e-06, "loss": 1.3467, "step": 314 }, { "epoch": 0.1504297994269341, "grad_norm": 5.450636863708496, "learning_rate": 4.8139305656647375e-06, "loss": 1.1066, "step": 315 }, { "epoch": 0.15090735434574976, "grad_norm": 5.111711502075195, "learning_rate": 4.8124638442058856e-06, "loss": 1.6462, "step": 316 }, { "epoch": 0.15138490926456544, "grad_norm": 5.007558822631836, "learning_rate": 4.8109915898222895e-06, "loss": 2.0875, "step": 317 }, { "epoch": 0.1518624641833811, "grad_norm": 4.911679267883301, "learning_rate": 4.809513806036545e-06, "loss": 0.6788, "step": 318 }, { "epoch": 0.15234001910219674, "grad_norm": 4.876430988311768, "learning_rate": 4.808030496384477e-06, "loss": 1.4855, "step": 319 }, { "epoch": 0.15281757402101243, "grad_norm": 4.583290100097656, "learning_rate": 4.806541664415129e-06, "loss": 1.318, "step": 320 }, { "epoch": 0.15329512893982808, "grad_norm": 5.547001361846924, "learning_rate": 4.805047313690763e-06, "loss": 1.6089, "step": 321 }, { "epoch": 0.15377268385864373, "grad_norm": 4.420833110809326, "learning_rate": 4.803547447786841e-06, "loss": 1.7071, "step": 322 }, { "epoch": 0.15425023877745941, "grad_norm": 4.221208572387695, "learning_rate": 4.802042070292021e-06, "loss": 1.3435, "step": 323 }, { "epoch": 0.15472779369627507, "grad_norm": 4.689193248748779, 
"learning_rate": 4.800531184808153e-06, "loss": 1.8016, "step": 324 }, { "epoch": 0.15520534861509075, "grad_norm": 4.350506782531738, "learning_rate": 4.799014794950259e-06, "loss": 1.4736, "step": 325 }, { "epoch": 0.1556829035339064, "grad_norm": 4.509654998779297, "learning_rate": 4.797492904346539e-06, "loss": 1.9716, "step": 326 }, { "epoch": 0.15616045845272206, "grad_norm": 4.7990193367004395, "learning_rate": 4.795965516638345e-06, "loss": 1.9866, "step": 327 }, { "epoch": 0.15663801337153774, "grad_norm": 5.9454522132873535, "learning_rate": 4.79443263548019e-06, "loss": 1.5818, "step": 328 }, { "epoch": 0.1571155682903534, "grad_norm": 5.137877941131592, "learning_rate": 4.792894264539725e-06, "loss": 1.624, "step": 329 }, { "epoch": 0.15759312320916904, "grad_norm": 5.304074287414551, "learning_rate": 4.791350407497741e-06, "loss": 1.8022, "step": 330 }, { "epoch": 0.15807067812798473, "grad_norm": 4.727628231048584, "learning_rate": 4.7898010680481525e-06, "loss": 1.8407, "step": 331 }, { "epoch": 0.15854823304680038, "grad_norm": 4.630735397338867, "learning_rate": 4.78824624989799e-06, "loss": 1.7509, "step": 332 }, { "epoch": 0.15902578796561603, "grad_norm": 4.0985426902771, "learning_rate": 4.786685956767396e-06, "loss": 1.4845, "step": 333 }, { "epoch": 0.15950334288443171, "grad_norm": 3.663670539855957, "learning_rate": 4.785120192389612e-06, "loss": 0.7413, "step": 334 }, { "epoch": 0.15998089780324737, "grad_norm": 5.0337090492248535, "learning_rate": 4.783548960510969e-06, "loss": 1.1276, "step": 335 }, { "epoch": 0.16045845272206305, "grad_norm": 4.598258972167969, "learning_rate": 4.78197226489088e-06, "loss": 0.8642, "step": 336 }, { "epoch": 0.1609360076408787, "grad_norm": 4.276305675506592, "learning_rate": 4.780390109301832e-06, "loss": 1.7199, "step": 337 }, { "epoch": 0.16141356255969436, "grad_norm": 5.015296936035156, "learning_rate": 4.778802497529374e-06, "loss": 1.3087, "step": 338 }, { "epoch": 0.16189111747851004, 
"grad_norm": 5.043750286102295, "learning_rate": 4.777209433372113e-06, "loss": 1.2718, "step": 339 }, { "epoch": 0.1623686723973257, "grad_norm": 5.105163097381592, "learning_rate": 4.775610920641698e-06, "loss": 1.0274, "step": 340 }, { "epoch": 0.16284622731614135, "grad_norm": 5.624614715576172, "learning_rate": 4.774006963162817e-06, "loss": 1.2202, "step": 341 }, { "epoch": 0.16332378223495703, "grad_norm": 5.624722957611084, "learning_rate": 4.772397564773183e-06, "loss": 1.2355, "step": 342 }, { "epoch": 0.16380133715377268, "grad_norm": 4.657635688781738, "learning_rate": 4.77078272932353e-06, "loss": 1.7222, "step": 343 }, { "epoch": 0.16427889207258833, "grad_norm": 6.011509895324707, "learning_rate": 4.769162460677599e-06, "loss": 1.3314, "step": 344 }, { "epoch": 0.16475644699140402, "grad_norm": 6.080216884613037, "learning_rate": 4.767536762712133e-06, "loss": 1.0984, "step": 345 }, { "epoch": 0.16523400191021967, "grad_norm": 4.323025226593018, "learning_rate": 4.765905639316861e-06, "loss": 1.1952, "step": 346 }, { "epoch": 0.16571155682903535, "grad_norm": 5.289118766784668, "learning_rate": 4.764269094394499e-06, "loss": 1.7561, "step": 347 }, { "epoch": 0.166189111747851, "grad_norm": 5.633922100067139, "learning_rate": 4.7626271318607305e-06, "loss": 1.2226, "step": 348 }, { "epoch": 0.16666666666666666, "grad_norm": 5.287115097045898, "learning_rate": 4.760979755644204e-06, "loss": 2.1358, "step": 349 }, { "epoch": 0.16714422158548234, "grad_norm": 5.55139684677124, "learning_rate": 4.759326969686519e-06, "loss": 1.5378, "step": 350 }, { "epoch": 0.167621776504298, "grad_norm": 5.4806599617004395, "learning_rate": 4.757668777942219e-06, "loss": 1.0795, "step": 351 }, { "epoch": 0.16809933142311365, "grad_norm": 4.47760009765625, "learning_rate": 4.756005184378784e-06, "loss": 1.4699, "step": 352 }, { "epoch": 0.16857688634192933, "grad_norm": 4.296785354614258, "learning_rate": 4.754336192976617e-06, "loss": 1.0125, "step": 353 }, { "epoch": 
0.16905444126074498, "grad_norm": 4.933202743530273, "learning_rate": 4.752661807729033e-06, "loss": 0.681, "step": 354 }, { "epoch": 0.16953199617956066, "grad_norm": 4.773186683654785, "learning_rate": 4.75098203264226e-06, "loss": 0.7631, "step": 355 }, { "epoch": 0.17000955109837632, "grad_norm": 5.127095699310303, "learning_rate": 4.749296871735416e-06, "loss": 1.149, "step": 356 }, { "epoch": 0.17048710601719197, "grad_norm": 4.723343372344971, "learning_rate": 4.7476063290405064e-06, "loss": 2.2513, "step": 357 }, { "epoch": 0.17096466093600765, "grad_norm": 5.298307418823242, "learning_rate": 4.745910408602414e-06, "loss": 1.5124, "step": 358 }, { "epoch": 0.1714422158548233, "grad_norm": 4.7738494873046875, "learning_rate": 4.7442091144788905e-06, "loss": 1.8133, "step": 359 }, { "epoch": 0.17191977077363896, "grad_norm": 4.859167575836182, "learning_rate": 4.742502450740542e-06, "loss": 1.6231, "step": 360 }, { "epoch": 0.17239732569245464, "grad_norm": 5.3526611328125, "learning_rate": 4.7407904214708236e-06, "loss": 1.8892, "step": 361 }, { "epoch": 0.1728748806112703, "grad_norm": 5.2197957038879395, "learning_rate": 4.7390730307660285e-06, "loss": 1.2386, "step": 362 }, { "epoch": 0.17335243553008595, "grad_norm": 5.169437408447266, "learning_rate": 4.7373502827352776e-06, "loss": 1.6236, "step": 363 }, { "epoch": 0.17382999044890163, "grad_norm": 7.183287620544434, "learning_rate": 4.73562218150051e-06, "loss": 1.1296, "step": 364 }, { "epoch": 0.17430754536771728, "grad_norm": 7.340289115905762, "learning_rate": 4.733888731196473e-06, "loss": 1.422, "step": 365 }, { "epoch": 0.17478510028653296, "grad_norm": 3.734912633895874, "learning_rate": 4.732149935970712e-06, "loss": 0.7965, "step": 366 }, { "epoch": 0.17526265520534862, "grad_norm": 4.806953430175781, "learning_rate": 4.730405799983564e-06, "loss": 1.9703, "step": 367 }, { "epoch": 0.17574021012416427, "grad_norm": 5.322519302368164, "learning_rate": 4.728656327408138e-06, "loss": 1.4472, 
"step": 368 }, { "epoch": 0.17621776504297995, "grad_norm": 4.3621296882629395, "learning_rate": 4.72690152243032e-06, "loss": 1.2409, "step": 369 }, { "epoch": 0.1766953199617956, "grad_norm": 5.016175270080566, "learning_rate": 4.7251413892487465e-06, "loss": 0.8901, "step": 370 }, { "epoch": 0.17717287488061126, "grad_norm": 4.521709442138672, "learning_rate": 4.723375932074808e-06, "loss": 1.7086, "step": 371 }, { "epoch": 0.17765042979942694, "grad_norm": 4.213695049285889, "learning_rate": 4.721605155132633e-06, "loss": 0.8491, "step": 372 }, { "epoch": 0.1781279847182426, "grad_norm": 4.741628170013428, "learning_rate": 4.719829062659076e-06, "loss": 0.9496, "step": 373 }, { "epoch": 0.17860553963705827, "grad_norm": 4.060512065887451, "learning_rate": 4.718047658903711e-06, "loss": 1.2774, "step": 374 }, { "epoch": 0.17908309455587393, "grad_norm": 5.11362886428833, "learning_rate": 4.716260948128819e-06, "loss": 2.0515, "step": 375 }, { "epoch": 0.17956064947468958, "grad_norm": 4.952921390533447, "learning_rate": 4.7144689346093814e-06, "loss": 1.2925, "step": 376 }, { "epoch": 0.18003820439350526, "grad_norm": 4.549046993255615, "learning_rate": 4.712671622633065e-06, "loss": 1.7701, "step": 377 }, { "epoch": 0.18051575931232092, "grad_norm": 5.009278774261475, "learning_rate": 4.710869016500215e-06, "loss": 1.8473, "step": 378 }, { "epoch": 0.18099331423113657, "grad_norm": 4.4082818031311035, "learning_rate": 4.7090611205238415e-06, "loss": 1.6738, "step": 379 }, { "epoch": 0.18147086914995225, "grad_norm": 4.200804233551025, "learning_rate": 4.707247939029616e-06, "loss": 1.5206, "step": 380 }, { "epoch": 0.1819484240687679, "grad_norm": 4.698638916015625, "learning_rate": 4.705429476355852e-06, "loss": 1.0018, "step": 381 }, { "epoch": 0.18242597898758356, "grad_norm": 5.005064010620117, "learning_rate": 4.7036057368535e-06, "loss": 2.0094, "step": 382 }, { "epoch": 0.18290353390639924, "grad_norm": 4.6862101554870605, "learning_rate": 
4.701776724886137e-06, "loss": 1.4615, "step": 383 }, { "epoch": 0.1833810888252149, "grad_norm": 5.007786750793457, "learning_rate": 4.699942444829957e-06, "loss": 1.3385, "step": 384 }, { "epoch": 0.18385864374403058, "grad_norm": 5.928104877471924, "learning_rate": 4.698102901073754e-06, "loss": 1.7282, "step": 385 }, { "epoch": 0.18433619866284623, "grad_norm": 4.620011329650879, "learning_rate": 4.69625809801892e-06, "loss": 2.0731, "step": 386 }, { "epoch": 0.18481375358166188, "grad_norm": 4.970291614532471, "learning_rate": 4.6944080400794295e-06, "loss": 1.0551, "step": 387 }, { "epoch": 0.18529130850047756, "grad_norm": 4.269640922546387, "learning_rate": 4.69255273168183e-06, "loss": 1.0265, "step": 388 }, { "epoch": 0.18576886341929322, "grad_norm": 5.445036888122559, "learning_rate": 4.690692177265231e-06, "loss": 1.718, "step": 389 }, { "epoch": 0.18624641833810887, "grad_norm": 4.073181629180908, "learning_rate": 4.688826381281296e-06, "loss": 2.3184, "step": 390 }, { "epoch": 0.18672397325692455, "grad_norm": 4.650896072387695, "learning_rate": 4.686955348194228e-06, "loss": 1.5211, "step": 391 }, { "epoch": 0.1872015281757402, "grad_norm": 5.641841411590576, "learning_rate": 4.685079082480759e-06, "loss": 1.159, "step": 392 }, { "epoch": 0.1876790830945559, "grad_norm": 4.450399398803711, "learning_rate": 4.683197588630145e-06, "loss": 1.4988, "step": 393 }, { "epoch": 0.18815663801337154, "grad_norm": 5.5170745849609375, "learning_rate": 4.681310871144148e-06, "loss": 1.5496, "step": 394 }, { "epoch": 0.1886341929321872, "grad_norm": 4.631316184997559, "learning_rate": 4.679418934537029e-06, "loss": 1.3353, "step": 395 }, { "epoch": 0.18911174785100288, "grad_norm": 4.32656717300415, "learning_rate": 4.6775217833355365e-06, "loss": 0.7785, "step": 396 }, { "epoch": 0.18958930276981853, "grad_norm": 4.661660194396973, "learning_rate": 4.675619422078898e-06, "loss": 1.8322, "step": 397 }, { "epoch": 0.19006685768863418, "grad_norm": 
5.94793176651001, "learning_rate": 4.673711855318802e-06, "loss": 1.6942, "step": 398 }, { "epoch": 0.19054441260744986, "grad_norm": 5.030279159545898, "learning_rate": 4.6717990876193976e-06, "loss": 0.7493, "step": 399 }, { "epoch": 0.19102196752626552, "grad_norm": 4.9419732093811035, "learning_rate": 4.6698811235572725e-06, "loss": 1.7589, "step": 400 }, { "epoch": 0.19149952244508117, "grad_norm": 4.309925556182861, "learning_rate": 4.6679579677214515e-06, "loss": 1.4931, "step": 401 }, { "epoch": 0.19197707736389685, "grad_norm": 4.897881507873535, "learning_rate": 4.666029624713381e-06, "loss": 1.2034, "step": 402 }, { "epoch": 0.1924546322827125, "grad_norm": 4.902161121368408, "learning_rate": 4.664096099146916e-06, "loss": 1.2468, "step": 403 }, { "epoch": 0.1929321872015282, "grad_norm": 4.410922050476074, "learning_rate": 4.662157395648314e-06, "loss": 1.3237, "step": 404 }, { "epoch": 0.19340974212034384, "grad_norm": 4.3372578620910645, "learning_rate": 4.660213518856221e-06, "loss": 1.5948, "step": 405 }, { "epoch": 0.1938872970391595, "grad_norm": 4.7953619956970215, "learning_rate": 4.65826447342166e-06, "loss": 1.4866, "step": 406 }, { "epoch": 0.19436485195797518, "grad_norm": 4.33857536315918, "learning_rate": 4.656310264008021e-06, "loss": 1.4119, "step": 407 }, { "epoch": 0.19484240687679083, "grad_norm": 5.194427490234375, "learning_rate": 4.654350895291053e-06, "loss": 1.1768, "step": 408 }, { "epoch": 0.19531996179560648, "grad_norm": 5.650949001312256, "learning_rate": 4.6523863719588425e-06, "loss": 1.4667, "step": 409 }, { "epoch": 0.19579751671442217, "grad_norm": 5.04359769821167, "learning_rate": 4.650416698711816e-06, "loss": 1.5404, "step": 410 }, { "epoch": 0.19627507163323782, "grad_norm": 5.359978199005127, "learning_rate": 4.648441880262717e-06, "loss": 1.4588, "step": 411 }, { "epoch": 0.19675262655205347, "grad_norm": 4.7736897468566895, "learning_rate": 4.646461921336604e-06, "loss": 2.317, "step": 412 }, { "epoch": 
0.19723018147086915, "grad_norm": 4.505736827850342, "learning_rate": 4.64447682667083e-06, "loss": 1.4218, "step": 413 }, { "epoch": 0.1977077363896848, "grad_norm": 5.84902811050415, "learning_rate": 4.642486601015041e-06, "loss": 1.1599, "step": 414 }, { "epoch": 0.1981852913085005, "grad_norm": 5.970823764801025, "learning_rate": 4.640491249131157e-06, "loss": 1.5572, "step": 415 }, { "epoch": 0.19866284622731614, "grad_norm": 4.635551452636719, "learning_rate": 4.638490775793362e-06, "loss": 1.2559, "step": 416 }, { "epoch": 0.1991404011461318, "grad_norm": 4.661638259887695, "learning_rate": 4.636485185788098e-06, "loss": 0.5406, "step": 417 }, { "epoch": 0.19961795606494748, "grad_norm": 4.560032844543457, "learning_rate": 4.634474483914045e-06, "loss": 1.7163, "step": 418 }, { "epoch": 0.20009551098376313, "grad_norm": 3.775115489959717, "learning_rate": 4.6324586749821185e-06, "loss": 0.8761, "step": 419 }, { "epoch": 0.20057306590257878, "grad_norm": 5.198001384735107, "learning_rate": 4.6304377638154476e-06, "loss": 1.5965, "step": 420 }, { "epoch": 0.20105062082139447, "grad_norm": 4.855744361877441, "learning_rate": 4.628411755249375e-06, "loss": 1.9078, "step": 421 }, { "epoch": 0.20152817574021012, "grad_norm": 4.622410774230957, "learning_rate": 4.626380654131435e-06, "loss": 1.2734, "step": 422 }, { "epoch": 0.2020057306590258, "grad_norm": 4.6625895500183105, "learning_rate": 4.624344465321351e-06, "loss": 1.2624, "step": 423 }, { "epoch": 0.20248328557784145, "grad_norm": 4.361771106719971, "learning_rate": 4.622303193691016e-06, "loss": 1.6475, "step": 424 }, { "epoch": 0.2029608404966571, "grad_norm": 7.085798740386963, "learning_rate": 4.620256844124485e-06, "loss": 1.4236, "step": 425 }, { "epoch": 0.2034383954154728, "grad_norm": 4.33780574798584, "learning_rate": 4.618205421517966e-06, "loss": 1.3427, "step": 426 }, { "epoch": 0.20391595033428844, "grad_norm": 4.990318298339844, "learning_rate": 4.616148930779801e-06, "loss": 1.4888, 
"step": 427 }, { "epoch": 0.2043935052531041, "grad_norm": 4.282358169555664, "learning_rate": 4.6140873768304594e-06, "loss": 1.0747, "step": 428 }, { "epoch": 0.20487106017191978, "grad_norm": 5.1923418045043945, "learning_rate": 4.612020764602526e-06, "loss": 1.6158, "step": 429 }, { "epoch": 0.20534861509073543, "grad_norm": 6.003612995147705, "learning_rate": 4.609949099040689e-06, "loss": 0.8757, "step": 430 }, { "epoch": 0.20582617000955108, "grad_norm": 4.16102933883667, "learning_rate": 4.607872385101724e-06, "loss": 1.2433, "step": 431 }, { "epoch": 0.20630372492836677, "grad_norm": 4.663821697235107, "learning_rate": 4.605790627754489e-06, "loss": 2.0617, "step": 432 }, { "epoch": 0.20678127984718242, "grad_norm": 4.523113250732422, "learning_rate": 4.603703831979908e-06, "loss": 1.2782, "step": 433 }, { "epoch": 0.2072588347659981, "grad_norm": 4.578159809112549, "learning_rate": 4.60161200277096e-06, "loss": 1.4782, "step": 434 }, { "epoch": 0.20773638968481375, "grad_norm": 5.282318592071533, "learning_rate": 4.599515145132669e-06, "loss": 1.7276, "step": 435 }, { "epoch": 0.2082139446036294, "grad_norm": 4.107289791107178, "learning_rate": 4.597413264082086e-06, "loss": 1.4684, "step": 436 }, { "epoch": 0.2086914995224451, "grad_norm": 4.687378406524658, "learning_rate": 4.5953063646482865e-06, "loss": 1.0602, "step": 437 }, { "epoch": 0.20916905444126074, "grad_norm": 5.044548511505127, "learning_rate": 4.59319445187235e-06, "loss": 1.6161, "step": 438 }, { "epoch": 0.2096466093600764, "grad_norm": 5.004764080047607, "learning_rate": 4.5910775308073516e-06, "loss": 1.7818, "step": 439 }, { "epoch": 0.21012416427889208, "grad_norm": 4.996269702911377, "learning_rate": 4.588955606518351e-06, "loss": 1.4953, "step": 440 }, { "epoch": 0.21060171919770773, "grad_norm": 4.899430274963379, "learning_rate": 4.586828684082376e-06, "loss": 0.7372, "step": 441 }, { "epoch": 0.2110792741165234, "grad_norm": 4.160622596740723, "learning_rate": 
4.584696768588417e-06, "loss": 1.7606, "step": 442 }, { "epoch": 0.21155682903533907, "grad_norm": 6.715820789337158, "learning_rate": 4.582559865137408e-06, "loss": 1.0241, "step": 443 }, { "epoch": 0.21203438395415472, "grad_norm": 4.947917938232422, "learning_rate": 4.58041797884222e-06, "loss": 1.7557, "step": 444 }, { "epoch": 0.2125119388729704, "grad_norm": 4.9781012535095215, "learning_rate": 4.578271114827642e-06, "loss": 1.2713, "step": 445 }, { "epoch": 0.21298949379178606, "grad_norm": 4.813570976257324, "learning_rate": 4.5761192782303785e-06, "loss": 2.0862, "step": 446 }, { "epoch": 0.2134670487106017, "grad_norm": 3.8827295303344727, "learning_rate": 4.573962474199027e-06, "loss": 1.6747, "step": 447 }, { "epoch": 0.2139446036294174, "grad_norm": 6.001773357391357, "learning_rate": 4.571800707894073e-06, "loss": 1.7237, "step": 448 }, { "epoch": 0.21442215854823304, "grad_norm": 4.159977436065674, "learning_rate": 4.569633984487875e-06, "loss": 1.6951, "step": 449 }, { "epoch": 0.2148997134670487, "grad_norm": 5.463620185852051, "learning_rate": 4.56746230916465e-06, "loss": 1.4547, "step": 450 }, { "epoch": 0.21537726838586438, "grad_norm": 5.150775909423828, "learning_rate": 4.565285687120465e-06, "loss": 1.1363, "step": 451 }, { "epoch": 0.21585482330468003, "grad_norm": 4.080126762390137, "learning_rate": 4.563104123563222e-06, "loss": 1.7833, "step": 452 }, { "epoch": 0.2163323782234957, "grad_norm": 5.102267265319824, "learning_rate": 4.5609176237126485e-06, "loss": 1.6243, "step": 453 }, { "epoch": 0.21680993314231137, "grad_norm": 6.128914833068848, "learning_rate": 4.558726192800279e-06, "loss": 1.8794, "step": 454 }, { "epoch": 0.21728748806112702, "grad_norm": 3.792633056640625, "learning_rate": 4.55652983606945e-06, "loss": 0.9253, "step": 455 }, { "epoch": 0.2177650429799427, "grad_norm": 3.9588589668273926, "learning_rate": 4.55432855877528e-06, "loss": 1.7611, "step": 456 }, { "epoch": 0.21824259789875836, "grad_norm": 
4.997145175933838, "learning_rate": 4.552122366184666e-06, "loss": 2.0229, "step": 457 }, { "epoch": 0.218720152817574, "grad_norm": 4.174449443817139, "learning_rate": 4.5499112635762595e-06, "loss": 1.7734, "step": 458 }, { "epoch": 0.2191977077363897, "grad_norm": 5.021090030670166, "learning_rate": 4.5476952562404644e-06, "loss": 1.1542, "step": 459 }, { "epoch": 0.21967526265520534, "grad_norm": 5.457566738128662, "learning_rate": 4.545474349479418e-06, "loss": 1.3331, "step": 460 }, { "epoch": 0.22015281757402103, "grad_norm": 4.947608470916748, "learning_rate": 4.543248548606981e-06, "loss": 1.1202, "step": 461 }, { "epoch": 0.22063037249283668, "grad_norm": 4.682175159454346, "learning_rate": 4.541017858948724e-06, "loss": 1.6182, "step": 462 }, { "epoch": 0.22110792741165233, "grad_norm": 5.113767147064209, "learning_rate": 4.538782285841913e-06, "loss": 1.4336, "step": 463 }, { "epoch": 0.22158548233046801, "grad_norm": 4.465206623077393, "learning_rate": 4.536541834635501e-06, "loss": 0.9435, "step": 464 }, { "epoch": 0.22206303724928367, "grad_norm": 6.104964733123779, "learning_rate": 4.534296510690112e-06, "loss": 2.0209, "step": 465 }, { "epoch": 0.22254059216809932, "grad_norm": 4.474550724029541, "learning_rate": 4.5320463193780265e-06, "loss": 1.332, "step": 466 }, { "epoch": 0.223018147086915, "grad_norm": 5.112949848175049, "learning_rate": 4.529791266083174e-06, "loss": 1.4676, "step": 467 }, { "epoch": 0.22349570200573066, "grad_norm": 6.080946922302246, "learning_rate": 4.527531356201115e-06, "loss": 1.9896, "step": 468 }, { "epoch": 0.2239732569245463, "grad_norm": 5.364470958709717, "learning_rate": 4.525266595139032e-06, "loss": 1.6175, "step": 469 }, { "epoch": 0.224450811843362, "grad_norm": 6.41459321975708, "learning_rate": 4.5229969883157114e-06, "loss": 1.4147, "step": 470 }, { "epoch": 0.22492836676217765, "grad_norm": 5.004256248474121, "learning_rate": 4.520722541161538e-06, "loss": 1.0558, "step": 471 }, { "epoch": 
0.22540592168099333, "grad_norm": 4.485739707946777, "learning_rate": 4.518443259118475e-06, "loss": 1.6195, "step": 472 }, { "epoch": 0.22588347659980898, "grad_norm": 4.292774677276611, "learning_rate": 4.516159147640054e-06, "loss": 1.5761, "step": 473 }, { "epoch": 0.22636103151862463, "grad_norm": 4.884871006011963, "learning_rate": 4.513870212191363e-06, "loss": 1.2361, "step": 474 }, { "epoch": 0.22683858643744031, "grad_norm": 5.770694255828857, "learning_rate": 4.511576458249032e-06, "loss": 1.3871, "step": 475 }, { "epoch": 0.22731614135625597, "grad_norm": 5.170015811920166, "learning_rate": 4.509277891301218e-06, "loss": 1.4757, "step": 476 }, { "epoch": 0.22779369627507162, "grad_norm": 5.186672210693359, "learning_rate": 4.506974516847597e-06, "loss": 1.1801, "step": 477 }, { "epoch": 0.2282712511938873, "grad_norm": 4.439860820770264, "learning_rate": 4.504666340399344e-06, "loss": 2.0473, "step": 478 }, { "epoch": 0.22874880611270296, "grad_norm": 4.5657734870910645, "learning_rate": 4.502353367479128e-06, "loss": 1.3988, "step": 479 }, { "epoch": 0.22922636103151864, "grad_norm": 4.2553510665893555, "learning_rate": 4.500035603621089e-06, "loss": 1.7084, "step": 480 }, { "epoch": 0.2297039159503343, "grad_norm": 4.95721960067749, "learning_rate": 4.497713054370835e-06, "loss": 1.5628, "step": 481 }, { "epoch": 0.23018147086914995, "grad_norm": 4.211477279663086, "learning_rate": 4.49538572528542e-06, "loss": 1.8298, "step": 482 }, { "epoch": 0.23065902578796563, "grad_norm": 4.0244669914245605, "learning_rate": 4.493053621933337e-06, "loss": 1.912, "step": 483 }, { "epoch": 0.23113658070678128, "grad_norm": 5.841868877410889, "learning_rate": 4.490716749894501e-06, "loss": 1.7796, "step": 484 }, { "epoch": 0.23161413562559693, "grad_norm": 5.960061073303223, "learning_rate": 4.4883751147602386e-06, "loss": 1.2103, "step": 485 }, { "epoch": 0.23209169054441262, "grad_norm": 6.238626003265381, "learning_rate": 4.4860287221332685e-06, "loss": 2.1714, 
"step": 486 }, { "epoch": 0.23256924546322827, "grad_norm": 3.9681949615478516, "learning_rate": 4.483677577627697e-06, "loss": 1.0857, "step": 487 }, { "epoch": 0.23304680038204392, "grad_norm": 3.6139049530029297, "learning_rate": 4.481321686868999e-06, "loss": 0.6593, "step": 488 }, { "epoch": 0.2335243553008596, "grad_norm": 4.41166353225708, "learning_rate": 4.478961055494004e-06, "loss": 1.0911, "step": 489 }, { "epoch": 0.23400191021967526, "grad_norm": 4.839325904846191, "learning_rate": 4.476595689150884e-06, "loss": 0.8517, "step": 490 }, { "epoch": 0.23447946513849094, "grad_norm": 4.8198137283325195, "learning_rate": 4.4742255934991415e-06, "loss": 1.5951, "step": 491 }, { "epoch": 0.2349570200573066, "grad_norm": 4.557744979858398, "learning_rate": 4.471850774209595e-06, "loss": 1.5944, "step": 492 }, { "epoch": 0.23543457497612225, "grad_norm": 4.46013879776001, "learning_rate": 4.469471236964364e-06, "loss": 1.3491, "step": 493 }, { "epoch": 0.23591212989493793, "grad_norm": 4.639164924621582, "learning_rate": 4.467086987456857e-06, "loss": 0.9374, "step": 494 }, { "epoch": 0.23638968481375358, "grad_norm": 4.1202006340026855, "learning_rate": 4.464698031391755e-06, "loss": 1.5728, "step": 495 }, { "epoch": 0.23686723973256923, "grad_norm": 5.527141094207764, "learning_rate": 4.462304374485005e-06, "loss": 1.6628, "step": 496 }, { "epoch": 0.23734479465138492, "grad_norm": 4.528934478759766, "learning_rate": 4.459906022463797e-06, "loss": 1.4275, "step": 497 }, { "epoch": 0.23782234957020057, "grad_norm": 5.29315185546875, "learning_rate": 4.457502981066556e-06, "loss": 1.755, "step": 498 }, { "epoch": 0.23829990448901622, "grad_norm": 4.723245620727539, "learning_rate": 4.455095256042928e-06, "loss": 1.6335, "step": 499 }, { "epoch": 0.2387774594078319, "grad_norm": 4.290173530578613, "learning_rate": 4.4526828531537645e-06, "loss": 1.5775, "step": 500 }, { "epoch": 0.23925501432664756, "grad_norm": 4.403853893280029, "learning_rate": 
4.4502657781711105e-06, "loss": 1.5101, "step": 501 }, { "epoch": 0.23973256924546324, "grad_norm": 4.483979225158691, "learning_rate": 4.447844036878189e-06, "loss": 1.2525, "step": 502 }, { "epoch": 0.2402101241642789, "grad_norm": 4.204350471496582, "learning_rate": 4.445417635069386e-06, "loss": 0.7755, "step": 503 }, { "epoch": 0.24068767908309455, "grad_norm": 4.5659050941467285, "learning_rate": 4.442986578550242e-06, "loss": 1.2653, "step": 504 }, { "epoch": 0.24116523400191023, "grad_norm": 4.001880645751953, "learning_rate": 4.440550873137432e-06, "loss": 1.1363, "step": 505 }, { "epoch": 0.24164278892072588, "grad_norm": 4.495390892028809, "learning_rate": 4.4381105246587566e-06, "loss": 1.8865, "step": 506 }, { "epoch": 0.24212034383954154, "grad_norm": 5.8478312492370605, "learning_rate": 4.435665538953124e-06, "loss": 1.2569, "step": 507 }, { "epoch": 0.24259789875835722, "grad_norm": 5.197742938995361, "learning_rate": 4.433215921870535e-06, "loss": 1.1044, "step": 508 }, { "epoch": 0.24307545367717287, "grad_norm": 5.625690460205078, "learning_rate": 4.430761679272078e-06, "loss": 1.3434, "step": 509 }, { "epoch": 0.24355300859598855, "grad_norm": 4.15615177154541, "learning_rate": 4.428302817029903e-06, "loss": 0.9501, "step": 510 }, { "epoch": 0.2440305635148042, "grad_norm": 4.598491668701172, "learning_rate": 4.4258393410272155e-06, "loss": 2.0542, "step": 511 }, { "epoch": 0.24450811843361986, "grad_norm": 5.3769121170043945, "learning_rate": 4.423371257158261e-06, "loss": 1.4093, "step": 512 }, { "epoch": 0.24498567335243554, "grad_norm": 4.566112995147705, "learning_rate": 4.4208985713283094e-06, "loss": 1.7327, "step": 513 }, { "epoch": 0.2454632282712512, "grad_norm": 5.779961585998535, "learning_rate": 4.418421289453639e-06, "loss": 1.797, "step": 514 }, { "epoch": 0.24594078319006685, "grad_norm": 5.457843780517578, "learning_rate": 4.41593941746153e-06, "loss": 1.8012, "step": 515 }, { "epoch": 0.24641833810888253, "grad_norm": 
5.049428939819336, "learning_rate": 4.41345296129024e-06, "loss": 1.8024, "step": 516 }, { "epoch": 0.24689589302769818, "grad_norm": 5.107661724090576, "learning_rate": 4.4109619268889974e-06, "loss": 1.4099, "step": 517 }, { "epoch": 0.24737344794651384, "grad_norm": 4.613182067871094, "learning_rate": 4.408466320217985e-06, "loss": 1.7839, "step": 518 }, { "epoch": 0.24785100286532952, "grad_norm": 3.994835615158081, "learning_rate": 4.405966147248324e-06, "loss": 1.1649, "step": 519 }, { "epoch": 0.24832855778414517, "grad_norm": 4.256359100341797, "learning_rate": 4.4034614139620625e-06, "loss": 1.5013, "step": 520 }, { "epoch": 0.24880611270296085, "grad_norm": 4.785167217254639, "learning_rate": 4.400952126352161e-06, "loss": 1.7228, "step": 521 }, { "epoch": 0.2492836676217765, "grad_norm": 4.746972560882568, "learning_rate": 4.398438290422471e-06, "loss": 1.5386, "step": 522 }, { "epoch": 0.24976122254059216, "grad_norm": 5.167286396026611, "learning_rate": 4.395919912187736e-06, "loss": 1.051, "step": 523 }, { "epoch": 0.2502387774594078, "grad_norm": 4.453719615936279, "learning_rate": 4.393396997673559e-06, "loss": 1.733, "step": 524 }, { "epoch": 0.2507163323782235, "grad_norm": 4.2139692306518555, "learning_rate": 4.390869552916401e-06, "loss": 1.3489, "step": 525 }, { "epoch": 0.2511938872970392, "grad_norm": 4.080151081085205, "learning_rate": 4.388337583963563e-06, "loss": 1.5904, "step": 526 }, { "epoch": 0.2516714422158548, "grad_norm": 4.93978214263916, "learning_rate": 4.385801096873169e-06, "loss": 0.9623, "step": 527 }, { "epoch": 0.2521489971346705, "grad_norm": 4.547937870025635, "learning_rate": 4.3832600977141526e-06, "loss": 1.1863, "step": 528 }, { "epoch": 0.25262655205348616, "grad_norm": 4.092640399932861, "learning_rate": 4.380714592566244e-06, "loss": 1.7355, "step": 529 }, { "epoch": 0.2531041069723018, "grad_norm": 5.235072612762451, "learning_rate": 4.378164587519957e-06, "loss": 0.9776, "step": 530 }, { "epoch": 
0.25358166189111747, "grad_norm": 4.06362771987915, "learning_rate": 4.3756100886765695e-06, "loss": 1.9436, "step": 531 }, { "epoch": 0.25405921680993315, "grad_norm": 4.636857032775879, "learning_rate": 4.373051102148112e-06, "loss": 1.3381, "step": 532 }, { "epoch": 0.2545367717287488, "grad_norm": 4.209655284881592, "learning_rate": 4.370487634057351e-06, "loss": 1.3376, "step": 533 }, { "epoch": 0.25501432664756446, "grad_norm": 5.265044212341309, "learning_rate": 4.367919690537779e-06, "loss": 0.9465, "step": 534 }, { "epoch": 0.25549188156638014, "grad_norm": 3.9268596172332764, "learning_rate": 4.3653472777335956e-06, "loss": 0.7504, "step": 535 }, { "epoch": 0.2559694364851958, "grad_norm": 4.19022798538208, "learning_rate": 4.362770401799692e-06, "loss": 1.7973, "step": 536 }, { "epoch": 0.25644699140401145, "grad_norm": 5.051555156707764, "learning_rate": 4.360189068901641e-06, "loss": 1.1785, "step": 537 }, { "epoch": 0.25692454632282713, "grad_norm": 5.484776496887207, "learning_rate": 4.357603285215676e-06, "loss": 1.4054, "step": 538 }, { "epoch": 0.2574021012416428, "grad_norm": 4.657289505004883, "learning_rate": 4.355013056928683e-06, "loss": 0.8365, "step": 539 }, { "epoch": 0.25787965616045844, "grad_norm": 4.823512554168701, "learning_rate": 4.35241839023818e-06, "loss": 0.9621, "step": 540 }, { "epoch": 0.2583572110792741, "grad_norm": 3.3369834423065186, "learning_rate": 4.349819291352306e-06, "loss": 0.7619, "step": 541 }, { "epoch": 0.2588347659980898, "grad_norm": 4.736769199371338, "learning_rate": 4.347215766489803e-06, "loss": 1.6263, "step": 542 }, { "epoch": 0.2593123209169054, "grad_norm": 5.503939151763916, "learning_rate": 4.344607821880005e-06, "loss": 1.075, "step": 543 }, { "epoch": 0.2597898758357211, "grad_norm": 4.896664619445801, "learning_rate": 4.341995463762819e-06, "loss": 1.8696, "step": 544 }, { "epoch": 0.2602674307545368, "grad_norm": 4.481847286224365, "learning_rate": 4.3393786983887135e-06, "loss": 1.2624, "step": 
545 }, { "epoch": 0.2607449856733524, "grad_norm": 4.51008415222168, "learning_rate": 4.3367575320187e-06, "loss": 1.6278, "step": 546 }, { "epoch": 0.2612225405921681, "grad_norm": 4.519957542419434, "learning_rate": 4.334131970924323e-06, "loss": 1.0916, "step": 547 }, { "epoch": 0.2617000955109838, "grad_norm": 5.215567111968994, "learning_rate": 4.331502021387642e-06, "loss": 1.374, "step": 548 }, { "epoch": 0.2621776504297994, "grad_norm": 5.7838945388793945, "learning_rate": 4.3288676897012105e-06, "loss": 1.4061, "step": 549 }, { "epoch": 0.2626552053486151, "grad_norm": 4.664610862731934, "learning_rate": 4.326228982168075e-06, "loss": 0.8338, "step": 550 }, { "epoch": 0.26313276026743077, "grad_norm": 5.64583158493042, "learning_rate": 4.323585905101747e-06, "loss": 1.612, "step": 551 }, { "epoch": 0.2636103151862464, "grad_norm": 4.001967906951904, "learning_rate": 4.3209384648261944e-06, "loss": 1.0746, "step": 552 }, { "epoch": 0.2640878701050621, "grad_norm": 5.485511302947998, "learning_rate": 4.3182866676758245e-06, "loss": 1.4762, "step": 553 }, { "epoch": 0.26456542502387775, "grad_norm": 4.606144428253174, "learning_rate": 4.31563051999547e-06, "loss": 1.2032, "step": 554 }, { "epoch": 0.26504297994269344, "grad_norm": 4.962388515472412, "learning_rate": 4.31297002814037e-06, "loss": 1.6631, "step": 555 }, { "epoch": 0.26552053486150906, "grad_norm": 5.0145087242126465, "learning_rate": 4.310305198476161e-06, "loss": 1.1835, "step": 556 }, { "epoch": 0.26599808978032474, "grad_norm": 4.142506122589111, "learning_rate": 4.307636037378856e-06, "loss": 1.1531, "step": 557 }, { "epoch": 0.2664756446991404, "grad_norm": 4.502025127410889, "learning_rate": 4.3049625512348345e-06, "loss": 2.213, "step": 558 }, { "epoch": 0.26695319961795605, "grad_norm": 4.471866607666016, "learning_rate": 4.302284746440822e-06, "loss": 1.9382, "step": 559 }, { "epoch": 0.26743075453677173, "grad_norm": 4.7635369300842285, "learning_rate": 4.299602629403876e-06, "loss": 
1.8954, "step": 560 }, { "epoch": 0.2679083094555874, "grad_norm": 5.479198932647705, "learning_rate": 4.296916206541375e-06, "loss": 1.9314, "step": 561 }, { "epoch": 0.26838586437440304, "grad_norm": 4.713190078735352, "learning_rate": 4.294225484280997e-06, "loss": 1.1554, "step": 562 }, { "epoch": 0.2688634192932187, "grad_norm": 4.164124011993408, "learning_rate": 4.291530469060709e-06, "loss": 1.706, "step": 563 }, { "epoch": 0.2693409742120344, "grad_norm": 4.41004753112793, "learning_rate": 4.288831167328748e-06, "loss": 1.3184, "step": 564 }, { "epoch": 0.26981852913085, "grad_norm": 4.714593887329102, "learning_rate": 4.286127585543608e-06, "loss": 1.4226, "step": 565 }, { "epoch": 0.2702960840496657, "grad_norm": 5.1301655769348145, "learning_rate": 4.2834197301740235e-06, "loss": 1.6751, "step": 566 }, { "epoch": 0.2707736389684814, "grad_norm": 4.90928840637207, "learning_rate": 4.280707607698953e-06, "loss": 1.4105, "step": 567 }, { "epoch": 0.271251193887297, "grad_norm": 5.121311664581299, "learning_rate": 4.277991224607567e-06, "loss": 1.4721, "step": 568 }, { "epoch": 0.2717287488061127, "grad_norm": 5.332688331604004, "learning_rate": 4.275270587399231e-06, "loss": 1.0135, "step": 569 }, { "epoch": 0.2722063037249284, "grad_norm": 3.487208127975464, "learning_rate": 4.272545702583483e-06, "loss": 0.5538, "step": 570 }, { "epoch": 0.272683858643744, "grad_norm": 4.778096675872803, "learning_rate": 4.269816576680031e-06, "loss": 1.5993, "step": 571 }, { "epoch": 0.2731614135625597, "grad_norm": 5.153265953063965, "learning_rate": 4.267083216218727e-06, "loss": 1.2924, "step": 572 }, { "epoch": 0.27363896848137537, "grad_norm": 5.50324821472168, "learning_rate": 4.264345627739556e-06, "loss": 0.9932, "step": 573 }, { "epoch": 0.27411652340019105, "grad_norm": 4.613612174987793, "learning_rate": 4.261603817792618e-06, "loss": 1.473, "step": 574 }, { "epoch": 0.2745940783190067, "grad_norm": 4.653387069702148, "learning_rate": 4.258857792938114e-06, 
"loss": 2.0519, "step": 575 }, { "epoch": 0.27507163323782235, "grad_norm": 4.143726348876953, "learning_rate": 4.256107559746331e-06, "loss": 1.184, "step": 576 }, { "epoch": 0.27554918815663804, "grad_norm": 5.859102249145508, "learning_rate": 4.253353124797623e-06, "loss": 1.2216, "step": 577 }, { "epoch": 0.27602674307545366, "grad_norm": 5.038761138916016, "learning_rate": 4.2505944946823995e-06, "loss": 1.6422, "step": 578 }, { "epoch": 0.27650429799426934, "grad_norm": 4.771734714508057, "learning_rate": 4.247831676001107e-06, "loss": 2.1868, "step": 579 }, { "epoch": 0.276981852913085, "grad_norm": 4.5896992683410645, "learning_rate": 4.245064675364211e-06, "loss": 1.2472, "step": 580 }, { "epoch": 0.27745940783190065, "grad_norm": 4.744195461273193, "learning_rate": 4.2422934993921875e-06, "loss": 1.112, "step": 581 }, { "epoch": 0.27793696275071633, "grad_norm": 5.154059410095215, "learning_rate": 4.239518154715499e-06, "loss": 1.438, "step": 582 }, { "epoch": 0.278414517669532, "grad_norm": 5.074432849884033, "learning_rate": 4.236738647974585e-06, "loss": 0.8193, "step": 583 }, { "epoch": 0.27889207258834764, "grad_norm": 4.032737731933594, "learning_rate": 4.233954985819842e-06, "loss": 1.6441, "step": 584 }, { "epoch": 0.2793696275071633, "grad_norm": 4.43956995010376, "learning_rate": 4.2311671749116065e-06, "loss": 1.4659, "step": 585 }, { "epoch": 0.279847182425979, "grad_norm": 3.909801483154297, "learning_rate": 4.228375221920147e-06, "loss": 1.1283, "step": 586 }, { "epoch": 0.28032473734479463, "grad_norm": 4.736994743347168, "learning_rate": 4.225579133525639e-06, "loss": 1.0179, "step": 587 }, { "epoch": 0.2808022922636103, "grad_norm": 4.356681823730469, "learning_rate": 4.222778916418153e-06, "loss": 0.8533, "step": 588 }, { "epoch": 0.281279847182426, "grad_norm": 4.157984733581543, "learning_rate": 4.219974577297638e-06, "loss": 1.3748, "step": 589 }, { "epoch": 0.2817574021012416, "grad_norm": 4.692636966705322, "learning_rate": 
4.217166122873906e-06, "loss": 1.0299, "step": 590 }, { "epoch": 0.2822349570200573, "grad_norm": 5.855917453765869, "learning_rate": 4.214353559866615e-06, "loss": 1.748, "step": 591 }, { "epoch": 0.282712511938873, "grad_norm": 5.090888977050781, "learning_rate": 4.211536895005254e-06, "loss": 1.8577, "step": 592 }, { "epoch": 0.28319006685768866, "grad_norm": 4.826634883880615, "learning_rate": 4.208716135029127e-06, "loss": 1.4539, "step": 593 }, { "epoch": 0.2836676217765043, "grad_norm": 4.872241020202637, "learning_rate": 4.205891286687334e-06, "loss": 1.3659, "step": 594 }, { "epoch": 0.28414517669531997, "grad_norm": 3.5692296028137207, "learning_rate": 4.203062356738758e-06, "loss": 1.2517, "step": 595 }, { "epoch": 0.28462273161413565, "grad_norm": 5.363955497741699, "learning_rate": 4.20022935195205e-06, "loss": 1.5275, "step": 596 }, { "epoch": 0.2851002865329513, "grad_norm": 5.285757064819336, "learning_rate": 4.197392279105608e-06, "loss": 1.1208, "step": 597 }, { "epoch": 0.28557784145176696, "grad_norm": 6.417721271514893, "learning_rate": 4.1945511449875645e-06, "loss": 0.9011, "step": 598 }, { "epoch": 0.28605539637058264, "grad_norm": 4.27297306060791, "learning_rate": 4.191705956395768e-06, "loss": 1.9863, "step": 599 }, { "epoch": 0.28653295128939826, "grad_norm": 4.627712726593018, "learning_rate": 4.1888567201377694e-06, "loss": 1.5375, "step": 600 }, { "epoch": 0.28701050620821394, "grad_norm": 3.8662869930267334, "learning_rate": 4.186003443030805e-06, "loss": 1.4924, "step": 601 }, { "epoch": 0.2874880611270296, "grad_norm": 4.373458385467529, "learning_rate": 4.183146131901777e-06, "loss": 1.2447, "step": 602 }, { "epoch": 0.28796561604584525, "grad_norm": 4.358523368835449, "learning_rate": 4.180284793587242e-06, "loss": 0.7605, "step": 603 }, { "epoch": 0.28844317096466093, "grad_norm": 4.019674301147461, "learning_rate": 4.177419434933389e-06, "loss": 0.9846, "step": 604 }, { "epoch": 0.2889207258834766, "grad_norm": 
4.882978439331055, "learning_rate": 4.1745500627960315e-06, "loss": 0.9718, "step": 605 }, { "epoch": 0.28939828080229224, "grad_norm": 4.630969524383545, "learning_rate": 4.17167668404058e-06, "loss": 1.8742, "step": 606 }, { "epoch": 0.2898758357211079, "grad_norm": 4.557861804962158, "learning_rate": 4.168799305542036e-06, "loss": 1.499, "step": 607 }, { "epoch": 0.2903533906399236, "grad_norm": 4.7020792961120605, "learning_rate": 4.1659179341849685e-06, "loss": 1.7369, "step": 608 }, { "epoch": 0.29083094555873923, "grad_norm": 5.034451007843018, "learning_rate": 4.163032576863503e-06, "loss": 1.7551, "step": 609 }, { "epoch": 0.2913085004775549, "grad_norm": 4.9759392738342285, "learning_rate": 4.1601432404812955e-06, "loss": 1.2575, "step": 610 }, { "epoch": 0.2917860553963706, "grad_norm": 5.029648780822754, "learning_rate": 4.157249931951531e-06, "loss": 0.8313, "step": 611 }, { "epoch": 0.2922636103151863, "grad_norm": 4.342098236083984, "learning_rate": 4.1543526581968915e-06, "loss": 0.9838, "step": 612 }, { "epoch": 0.2927411652340019, "grad_norm": 4.268614768981934, "learning_rate": 4.151451426149552e-06, "loss": 0.9139, "step": 613 }, { "epoch": 0.2932187201528176, "grad_norm": 4.73753547668457, "learning_rate": 4.148546242751152e-06, "loss": 1.3795, "step": 614 }, { "epoch": 0.29369627507163326, "grad_norm": 4.544521331787109, "learning_rate": 4.145637114952792e-06, "loss": 1.6922, "step": 615 }, { "epoch": 0.2941738299904489, "grad_norm": 3.6493802070617676, "learning_rate": 4.142724049715005e-06, "loss": 1.3142, "step": 616 }, { "epoch": 0.29465138490926457, "grad_norm": 4.946296691894531, "learning_rate": 4.139807054007748e-06, "loss": 1.5298, "step": 617 }, { "epoch": 0.29512893982808025, "grad_norm": 4.1573381423950195, "learning_rate": 4.136886134810379e-06, "loss": 1.3545, "step": 618 }, { "epoch": 0.2956064947468959, "grad_norm": 5.878752708435059, "learning_rate": 4.133961299111648e-06, "loss": 1.2685, "step": 619 }, { "epoch": 
0.29608404966571156, "grad_norm": 4.699491024017334, "learning_rate": 4.131032553909673e-06, "loss": 1.5852, "step": 620 }, { "epoch": 0.29656160458452724, "grad_norm": 4.7188496589660645, "learning_rate": 4.128099906211926e-06, "loss": 0.8965, "step": 621 }, { "epoch": 0.29703915950334286, "grad_norm": 5.116876602172852, "learning_rate": 4.125163363035215e-06, "loss": 0.8153, "step": 622 }, { "epoch": 0.29751671442215855, "grad_norm": 4.7289276123046875, "learning_rate": 4.122222931405672e-06, "loss": 1.8729, "step": 623 }, { "epoch": 0.2979942693409742, "grad_norm": 4.712724685668945, "learning_rate": 4.119278618358733e-06, "loss": 1.1019, "step": 624 }, { "epoch": 0.29847182425978985, "grad_norm": 6.648689270019531, "learning_rate": 4.116330430939116e-06, "loss": 1.9675, "step": 625 }, { "epoch": 0.29894937917860553, "grad_norm": 4.456351280212402, "learning_rate": 4.113378376200813e-06, "loss": 1.7418, "step": 626 }, { "epoch": 0.2994269340974212, "grad_norm": 5.174410343170166, "learning_rate": 4.110422461207069e-06, "loss": 0.9055, "step": 627 }, { "epoch": 0.29990448901623684, "grad_norm": 4.009849548339844, "learning_rate": 4.107462693030362e-06, "loss": 1.429, "step": 628 }, { "epoch": 0.3003820439350525, "grad_norm": 4.785384178161621, "learning_rate": 4.1044990787523945e-06, "loss": 1.5216, "step": 629 }, { "epoch": 0.3008595988538682, "grad_norm": 4.5311408042907715, "learning_rate": 4.101531625464067e-06, "loss": 1.8935, "step": 630 }, { "epoch": 0.3013371537726839, "grad_norm": 5.222896575927734, "learning_rate": 4.098560340265468e-06, "loss": 2.0827, "step": 631 }, { "epoch": 0.3018147086914995, "grad_norm": 4.932816505432129, "learning_rate": 4.0955852302658525e-06, "loss": 1.3749, "step": 632 }, { "epoch": 0.3022922636103152, "grad_norm": 5.088895320892334, "learning_rate": 4.092606302583629e-06, "loss": 1.3096, "step": 633 }, { "epoch": 0.3027698185291309, "grad_norm": 3.5982251167297363, "learning_rate": 4.0896235643463375e-06, "loss": 1.1598, 
"step": 634 }, { "epoch": 0.3032473734479465, "grad_norm": 4.796620845794678, "learning_rate": 4.086637022690639e-06, "loss": 0.9108, "step": 635 }, { "epoch": 0.3037249283667622, "grad_norm": 4.934738636016846, "learning_rate": 4.083646684762292e-06, "loss": 1.1227, "step": 636 }, { "epoch": 0.30420248328557786, "grad_norm": 5.639143943786621, "learning_rate": 4.08065255771614e-06, "loss": 1.5454, "step": 637 }, { "epoch": 0.3046800382043935, "grad_norm": 4.362215042114258, "learning_rate": 4.07765464871609e-06, "loss": 2.1343, "step": 638 }, { "epoch": 0.30515759312320917, "grad_norm": 5.12257719039917, "learning_rate": 4.0746529649351e-06, "loss": 1.6036, "step": 639 }, { "epoch": 0.30563514804202485, "grad_norm": 6.975855827331543, "learning_rate": 4.071647513555161e-06, "loss": 1.2016, "step": 640 }, { "epoch": 0.3061127029608405, "grad_norm": 4.4195556640625, "learning_rate": 4.068638301767274e-06, "loss": 1.2461, "step": 641 }, { "epoch": 0.30659025787965616, "grad_norm": 5.087890148162842, "learning_rate": 4.065625336771441e-06, "loss": 2.1123, "step": 642 }, { "epoch": 0.30706781279847184, "grad_norm": 3.984546184539795, "learning_rate": 4.062608625776645e-06, "loss": 1.5736, "step": 643 }, { "epoch": 0.30754536771728747, "grad_norm": 4.1928935050964355, "learning_rate": 4.059588176000829e-06, "loss": 1.1823, "step": 644 }, { "epoch": 0.30802292263610315, "grad_norm": 5.040088176727295, "learning_rate": 4.056563994670882e-06, "loss": 1.1504, "step": 645 }, { "epoch": 0.30850047755491883, "grad_norm": 4.519896984100342, "learning_rate": 4.053536089022624e-06, "loss": 1.6966, "step": 646 }, { "epoch": 0.30897803247373445, "grad_norm": 4.228121280670166, "learning_rate": 4.050504466300782e-06, "loss": 2.1324, "step": 647 }, { "epoch": 0.30945558739255014, "grad_norm": 4.232161045074463, "learning_rate": 4.04746913375898e-06, "loss": 1.2915, "step": 648 }, { "epoch": 0.3099331423113658, "grad_norm": 5.123937606811523, "learning_rate": 4.044430098659716e-06, 
"loss": 1.0365, "step": 649 }, { "epoch": 0.3104106972301815, "grad_norm": 4.26357889175415, "learning_rate": 4.04138736827435e-06, "loss": 1.4498, "step": 650 }, { "epoch": 0.3108882521489971, "grad_norm": 4.727969169616699, "learning_rate": 4.0383409498830794e-06, "loss": 1.5501, "step": 651 }, { "epoch": 0.3113658070678128, "grad_norm": 3.726792097091675, "learning_rate": 4.035290850774929e-06, "loss": 1.0352, "step": 652 }, { "epoch": 0.3118433619866285, "grad_norm": 4.404326438903809, "learning_rate": 4.032237078247729e-06, "loss": 1.345, "step": 653 }, { "epoch": 0.3123209169054441, "grad_norm": 5.389330863952637, "learning_rate": 4.029179639608099e-06, "loss": 1.2705, "step": 654 }, { "epoch": 0.3127984718242598, "grad_norm": 4.135141372680664, "learning_rate": 4.026118542171431e-06, "loss": 1.101, "step": 655 }, { "epoch": 0.3132760267430755, "grad_norm": 4.916487216949463, "learning_rate": 4.023053793261869e-06, "loss": 1.8853, "step": 656 }, { "epoch": 0.3137535816618911, "grad_norm": 4.3705573081970215, "learning_rate": 4.019985400212295e-06, "loss": 1.4882, "step": 657 }, { "epoch": 0.3142311365807068, "grad_norm": 6.1238579750061035, "learning_rate": 4.0169133703643136e-06, "loss": 0.8893, "step": 658 }, { "epoch": 0.31470869149952246, "grad_norm": 4.877721786499023, "learning_rate": 4.013837711068224e-06, "loss": 1.931, "step": 659 }, { "epoch": 0.3151862464183381, "grad_norm": 5.948941707611084, "learning_rate": 4.010758429683015e-06, "loss": 1.2175, "step": 660 }, { "epoch": 0.31566380133715377, "grad_norm": 4.653228759765625, "learning_rate": 4.00767553357634e-06, "loss": 1.1461, "step": 661 }, { "epoch": 0.31614135625596945, "grad_norm": 4.553239822387695, "learning_rate": 4.004589030124502e-06, "loss": 1.7088, "step": 662 }, { "epoch": 0.3166189111747851, "grad_norm": 4.689153671264648, "learning_rate": 4.001498926712432e-06, "loss": 1.8511, "step": 663 }, { "epoch": 0.31709646609360076, "grad_norm": 4.260040283203125, "learning_rate": 
3.998405230733679e-06, "loss": 1.3464, "step": 664 }, { "epoch": 0.31757402101241644, "grad_norm": 4.018008708953857, "learning_rate": 3.995307949590385e-06, "loss": 0.8352, "step": 665 }, { "epoch": 0.31805157593123207, "grad_norm": 4.840106010437012, "learning_rate": 3.99220709069327e-06, "loss": 0.861, "step": 666 }, { "epoch": 0.31852913085004775, "grad_norm": 4.8461384773254395, "learning_rate": 3.989102661461615e-06, "loss": 1.0834, "step": 667 }, { "epoch": 0.31900668576886343, "grad_norm": 5.434104919433594, "learning_rate": 3.985994669323244e-06, "loss": 1.6911, "step": 668 }, { "epoch": 0.3194842406876791, "grad_norm": 5.421936988830566, "learning_rate": 3.982883121714506e-06, "loss": 2.1134, "step": 669 }, { "epoch": 0.31996179560649474, "grad_norm": 4.048985481262207, "learning_rate": 3.979768026080255e-06, "loss": 1.3006, "step": 670 }, { "epoch": 0.3204393505253104, "grad_norm": 5.564789772033691, "learning_rate": 3.976649389873835e-06, "loss": 0.8491, "step": 671 }, { "epoch": 0.3209169054441261, "grad_norm": 4.539333343505859, "learning_rate": 3.973527220557064e-06, "loss": 0.9854, "step": 672 }, { "epoch": 0.3213944603629417, "grad_norm": 6.064375400543213, "learning_rate": 3.97040152560021e-06, "loss": 1.8732, "step": 673 }, { "epoch": 0.3218720152817574, "grad_norm": 4.763433456420898, "learning_rate": 3.9672723124819775e-06, "loss": 1.6722, "step": 674 }, { "epoch": 0.3223495702005731, "grad_norm": 5.5994110107421875, "learning_rate": 3.964139588689491e-06, "loss": 1.6984, "step": 675 }, { "epoch": 0.3228271251193887, "grad_norm": 3.460251808166504, "learning_rate": 3.961003361718272e-06, "loss": 1.2185, "step": 676 }, { "epoch": 0.3233046800382044, "grad_norm": 4.309027671813965, "learning_rate": 3.9578636390722246e-06, "loss": 1.3725, "step": 677 }, { "epoch": 0.3237822349570201, "grad_norm": 4.71614408493042, "learning_rate": 3.954720428263617e-06, "loss": 1.4777, "step": 678 }, { "epoch": 0.3242597898758357, "grad_norm": 5.219738006591797, 
"learning_rate": 3.951573736813066e-06, "loss": 0.8638, "step": 679 }, { "epoch": 0.3247373447946514, "grad_norm": 5.138872146606445, "learning_rate": 3.948423572249511e-06, "loss": 0.6795, "step": 680 }, { "epoch": 0.32521489971346706, "grad_norm": 4.259987831115723, "learning_rate": 3.945269942110207e-06, "loss": 0.7645, "step": 681 }, { "epoch": 0.3256924546322827, "grad_norm": 5.232902526855469, "learning_rate": 3.942112853940696e-06, "loss": 1.4876, "step": 682 }, { "epoch": 0.32617000955109837, "grad_norm": 5.722768783569336, "learning_rate": 3.938952315294797e-06, "loss": 1.5428, "step": 683 }, { "epoch": 0.32664756446991405, "grad_norm": 4.588662147521973, "learning_rate": 3.935788333734584e-06, "loss": 1.0623, "step": 684 }, { "epoch": 0.3271251193887297, "grad_norm": 5.261674404144287, "learning_rate": 3.932620916830368e-06, "loss": 1.0337, "step": 685 }, { "epoch": 0.32760267430754536, "grad_norm": 4.315485954284668, "learning_rate": 3.92945007216068e-06, "loss": 1.3244, "step": 686 }, { "epoch": 0.32808022922636104, "grad_norm": 4.549139022827148, "learning_rate": 3.9262758073122545e-06, "loss": 1.5165, "step": 687 }, { "epoch": 0.32855778414517667, "grad_norm": 5.933612823486328, "learning_rate": 3.9230981298800055e-06, "loss": 1.9108, "step": 688 }, { "epoch": 0.32903533906399235, "grad_norm": 5.067174434661865, "learning_rate": 3.919917047467016e-06, "loss": 1.1432, "step": 689 }, { "epoch": 0.32951289398280803, "grad_norm": 4.739798545837402, "learning_rate": 3.916732567684511e-06, "loss": 1.4415, "step": 690 }, { "epoch": 0.3299904489016237, "grad_norm": 4.410865783691406, "learning_rate": 3.91354469815185e-06, "loss": 1.1805, "step": 691 }, { "epoch": 0.33046800382043934, "grad_norm": 5.247057914733887, "learning_rate": 3.910353446496499e-06, "loss": 1.1287, "step": 692 }, { "epoch": 0.330945558739255, "grad_norm": 5.094728946685791, "learning_rate": 3.907158820354018e-06, "loss": 0.8312, "step": 693 }, { "epoch": 0.3314231136580707, "grad_norm": 
4.8783745765686035, "learning_rate": 3.903960827368041e-06, "loss": 1.2853, "step": 694 }, { "epoch": 0.3319006685768863, "grad_norm": 4.086891174316406, "learning_rate": 3.900759475190254e-06, "loss": 1.4697, "step": 695 }, { "epoch": 0.332378223495702, "grad_norm": 4.478697776794434, "learning_rate": 3.897554771480388e-06, "loss": 1.909, "step": 696 }, { "epoch": 0.3328557784145177, "grad_norm": 6.32254695892334, "learning_rate": 3.894346723906186e-06, "loss": 1.9007, "step": 697 }, { "epoch": 0.3333333333333333, "grad_norm": 4.328856945037842, "learning_rate": 3.891135340143395e-06, "loss": 1.1091, "step": 698 }, { "epoch": 0.333810888252149, "grad_norm": 4.449676513671875, "learning_rate": 3.887920627875743e-06, "loss": 1.0219, "step": 699 }, { "epoch": 0.3342884431709647, "grad_norm": 7.468997001647949, "learning_rate": 3.884702594794924e-06, "loss": 1.2569, "step": 700 }, { "epoch": 0.3347659980897803, "grad_norm": 5.675698280334473, "learning_rate": 3.881481248600574e-06, "loss": 1.7019, "step": 701 }, { "epoch": 0.335243553008596, "grad_norm": 5.159004211425781, "learning_rate": 3.87825659700026e-06, "loss": 2.0062, "step": 702 }, { "epoch": 0.33572110792741167, "grad_norm": 4.614938259124756, "learning_rate": 3.875028647709456e-06, "loss": 1.6691, "step": 703 }, { "epoch": 0.3361986628462273, "grad_norm": 5.394142150878906, "learning_rate": 3.871797408451525e-06, "loss": 1.6568, "step": 704 }, { "epoch": 0.336676217765043, "grad_norm": 4.103569507598877, "learning_rate": 3.868562886957704e-06, "loss": 1.5787, "step": 705 }, { "epoch": 0.33715377268385865, "grad_norm": 4.580273628234863, "learning_rate": 3.8653250909670815e-06, "loss": 0.8699, "step": 706 }, { "epoch": 0.3376313276026743, "grad_norm": 4.134293556213379, "learning_rate": 3.862084028226583e-06, "loss": 1.3563, "step": 707 }, { "epoch": 0.33810888252148996, "grad_norm": 4.829638957977295, "learning_rate": 3.858839706490946e-06, "loss": 1.1685, "step": 708 }, { "epoch": 0.33858643744030564, 
"grad_norm": 4.73740816116333, "learning_rate": 3.855592133522711e-06, "loss": 1.3752, "step": 709 }, { "epoch": 0.3390639923591213, "grad_norm": 4.271519184112549, "learning_rate": 3.852341317092193e-06, "loss": 1.3084, "step": 710 }, { "epoch": 0.33954154727793695, "grad_norm": 5.289199352264404, "learning_rate": 3.849087264977471e-06, "loss": 1.9092, "step": 711 }, { "epoch": 0.34001910219675263, "grad_norm": 4.938006401062012, "learning_rate": 3.845829984964362e-06, "loss": 0.8482, "step": 712 }, { "epoch": 0.3404966571155683, "grad_norm": 4.5070576667785645, "learning_rate": 3.842569484846411e-06, "loss": 1.5858, "step": 713 }, { "epoch": 0.34097421203438394, "grad_norm": 4.818379878997803, "learning_rate": 3.839305772424862e-06, "loss": 1.6605, "step": 714 }, { "epoch": 0.3414517669531996, "grad_norm": 4.776932716369629, "learning_rate": 3.836038855508652e-06, "loss": 1.0068, "step": 715 }, { "epoch": 0.3419293218720153, "grad_norm": 5.210662364959717, "learning_rate": 3.832768741914378e-06, "loss": 1.3681, "step": 716 }, { "epoch": 0.3424068767908309, "grad_norm": 5.149065971374512, "learning_rate": 3.8294954394662895e-06, "loss": 1.5289, "step": 717 }, { "epoch": 0.3428844317096466, "grad_norm": 4.124396800994873, "learning_rate": 3.826218955996267e-06, "loss": 1.6143, "step": 718 }, { "epoch": 0.3433619866284623, "grad_norm": 5.491997241973877, "learning_rate": 3.822939299343798e-06, "loss": 1.5779, "step": 719 }, { "epoch": 0.3438395415472779, "grad_norm": 5.334607124328613, "learning_rate": 3.819656477355964e-06, "loss": 1.5599, "step": 720 }, { "epoch": 0.3443170964660936, "grad_norm": 4.630972385406494, "learning_rate": 3.8163704978874225e-06, "loss": 1.5003, "step": 721 }, { "epoch": 0.3447946513849093, "grad_norm": 5.669166088104248, "learning_rate": 3.813081368800383e-06, "loss": 1.1851, "step": 722 }, { "epoch": 0.3452722063037249, "grad_norm": 4.499358177185059, "learning_rate": 3.8097890979645915e-06, "loss": 0.958, "step": 723 }, { "epoch": 
0.3457497612225406, "grad_norm": 4.8701863288879395, "learning_rate": 3.8064936932573114e-06, "loss": 1.3947, "step": 724 }, { "epoch": 0.34622731614135627, "grad_norm": 4.242757797241211, "learning_rate": 3.8031951625633056e-06, "loss": 1.96, "step": 725 }, { "epoch": 0.3467048710601719, "grad_norm": 5.225197792053223, "learning_rate": 3.799893513774814e-06, "loss": 1.7306, "step": 726 }, { "epoch": 0.3471824259789876, "grad_norm": 3.947305917739868, "learning_rate": 3.796588754791538e-06, "loss": 1.3048, "step": 727 }, { "epoch": 0.34765998089780326, "grad_norm": 4.260103702545166, "learning_rate": 3.793280893520621e-06, "loss": 1.4348, "step": 728 }, { "epoch": 0.34813753581661894, "grad_norm": 4.2620649337768555, "learning_rate": 3.78996993787663e-06, "loss": 0.6933, "step": 729 }, { "epoch": 0.34861509073543456, "grad_norm": 4.6975579261779785, "learning_rate": 3.7866558957815335e-06, "loss": 1.8948, "step": 730 }, { "epoch": 0.34909264565425024, "grad_norm": 4.036712646484375, "learning_rate": 3.7833387751646856e-06, "loss": 1.4971, "step": 731 }, { "epoch": 0.3495702005730659, "grad_norm": 4.819871425628662, "learning_rate": 3.780018583962807e-06, "loss": 0.9638, "step": 732 }, { "epoch": 0.35004775549188155, "grad_norm": 4.505753517150879, "learning_rate": 3.776695330119966e-06, "loss": 1.527, "step": 733 }, { "epoch": 0.35052531041069723, "grad_norm": 4.341352462768555, "learning_rate": 3.773369021587556e-06, "loss": 1.3213, "step": 734 }, { "epoch": 0.3510028653295129, "grad_norm": 4.35957670211792, "learning_rate": 3.7700396663242823e-06, "loss": 1.6424, "step": 735 }, { "epoch": 0.35148042024832854, "grad_norm": 5.156408786773682, "learning_rate": 3.7667072722961363e-06, "loss": 1.2449, "step": 736 }, { "epoch": 0.3519579751671442, "grad_norm": 4.833114147186279, "learning_rate": 3.7633718474763843e-06, "loss": 1.8941, "step": 737 }, { "epoch": 0.3524355300859599, "grad_norm": 4.8638482093811035, "learning_rate": 3.7600333998455415e-06, "loss": 1.0776, 
"step": 738 }, { "epoch": 0.35291308500477553, "grad_norm": 3.8685362339019775, "learning_rate": 3.7566919373913557e-06, "loss": 1.5481, "step": 739 }, { "epoch": 0.3533906399235912, "grad_norm": 4.667938709259033, "learning_rate": 3.7533474681087907e-06, "loss": 1.3883, "step": 740 }, { "epoch": 0.3538681948424069, "grad_norm": 5.376291275024414, "learning_rate": 3.7500000000000005e-06, "loss": 1.5528, "step": 741 }, { "epoch": 0.3543457497612225, "grad_norm": 4.806412696838379, "learning_rate": 3.7466495410743177e-06, "loss": 2.065, "step": 742 }, { "epoch": 0.3548233046800382, "grad_norm": 4.1322431564331055, "learning_rate": 3.7432960993482294e-06, "loss": 1.2539, "step": 743 }, { "epoch": 0.3553008595988539, "grad_norm": 4.96150016784668, "learning_rate": 3.7399396828453593e-06, "loss": 1.6599, "step": 744 }, { "epoch": 0.3557784145176695, "grad_norm": 4.211765289306641, "learning_rate": 3.736580299596449e-06, "loss": 1.4249, "step": 745 }, { "epoch": 0.3562559694364852, "grad_norm": 4.3064351081848145, "learning_rate": 3.7332179576393395e-06, "loss": 1.7154, "step": 746 }, { "epoch": 0.35673352435530087, "grad_norm": 4.858706474304199, "learning_rate": 3.72985266501895e-06, "loss": 1.416, "step": 747 }, { "epoch": 0.35721107927411655, "grad_norm": 4.710183620452881, "learning_rate": 3.7264844297872595e-06, "loss": 1.6768, "step": 748 }, { "epoch": 0.3576886341929322, "grad_norm": 4.441434860229492, "learning_rate": 3.723113260003287e-06, "loss": 1.1553, "step": 749 }, { "epoch": 0.35816618911174786, "grad_norm": 4.74830961227417, "learning_rate": 3.7197391637330753e-06, "loss": 1.7318, "step": 750 }, { "epoch": 0.35864374403056354, "grad_norm": 4.390340328216553, "learning_rate": 3.7163621490496665e-06, "loss": 1.3398, "step": 751 }, { "epoch": 0.35912129894937916, "grad_norm": 4.695949554443359, "learning_rate": 3.712982224033088e-06, "loss": 1.1525, "step": 752 }, { "epoch": 0.35959885386819485, "grad_norm": 4.047895431518555, "learning_rate": 
3.709599396770327e-06, "loss": 0.8157, "step": 753 }, { "epoch": 0.3600764087870105, "grad_norm": 4.43308687210083, "learning_rate": 3.7062136753553192e-06, "loss": 1.0274, "step": 754 }, { "epoch": 0.36055396370582615, "grad_norm": 5.866445064544678, "learning_rate": 3.702825067888921e-06, "loss": 1.4409, "step": 755 }, { "epoch": 0.36103151862464183, "grad_norm": 4.935059070587158, "learning_rate": 3.6994335824788973e-06, "loss": 1.6409, "step": 756 }, { "epoch": 0.3615090735434575, "grad_norm": 5.251873016357422, "learning_rate": 3.6960392272398983e-06, "loss": 1.4397, "step": 757 }, { "epoch": 0.36198662846227314, "grad_norm": 4.926359176635742, "learning_rate": 3.6926420102934378e-06, "loss": 1.5812, "step": 758 }, { "epoch": 0.3624641833810888, "grad_norm": 4.3319807052612305, "learning_rate": 3.6892419397678806e-06, "loss": 1.6245, "step": 759 }, { "epoch": 0.3629417382999045, "grad_norm": 4.833469390869141, "learning_rate": 3.6858390237984164e-06, "loss": 1.2629, "step": 760 }, { "epoch": 0.36341929321872013, "grad_norm": 4.069488048553467, "learning_rate": 3.682433270527045e-06, "loss": 2.0717, "step": 761 }, { "epoch": 0.3638968481375358, "grad_norm": 5.567584037780762, "learning_rate": 3.6790246881025533e-06, "loss": 1.6445, "step": 762 }, { "epoch": 0.3643744030563515, "grad_norm": 6.153409957885742, "learning_rate": 3.675613284680498e-06, "loss": 1.0222, "step": 763 }, { "epoch": 0.3648519579751671, "grad_norm": 4.511119842529297, "learning_rate": 3.672199068423185e-06, "loss": 1.4709, "step": 764 }, { "epoch": 0.3653295128939828, "grad_norm": 6.564058780670166, "learning_rate": 3.668782047499652e-06, "loss": 1.2396, "step": 765 }, { "epoch": 0.3658070678127985, "grad_norm": 4.125149250030518, "learning_rate": 3.665362230085646e-06, "loss": 1.7066, "step": 766 }, { "epoch": 0.36628462273161416, "grad_norm": 4.489935398101807, "learning_rate": 3.6619396243636035e-06, "loss": 1.6375, "step": 767 }, { "epoch": 0.3667621776504298, "grad_norm": 
5.2286882400512695, "learning_rate": 3.658514238522636e-06, "loss": 1.8172, "step": 768 }, { "epoch": 0.36723973256924547, "grad_norm": 4.341921806335449, "learning_rate": 3.655086080758504e-06, "loss": 1.2966, "step": 769 }, { "epoch": 0.36771728748806115, "grad_norm": 4.891939163208008, "learning_rate": 3.651655159273602e-06, "loss": 1.476, "step": 770 }, { "epoch": 0.3681948424068768, "grad_norm": 4.665860176086426, "learning_rate": 3.6482214822769356e-06, "loss": 0.8972, "step": 771 }, { "epoch": 0.36867239732569246, "grad_norm": 4.064877510070801, "learning_rate": 3.6447850579841038e-06, "loss": 2.2865, "step": 772 }, { "epoch": 0.36914995224450814, "grad_norm": 5.185751914978027, "learning_rate": 3.6413458946172804e-06, "loss": 1.6053, "step": 773 }, { "epoch": 0.36962750716332377, "grad_norm": 4.846790790557861, "learning_rate": 3.6379040004051914e-06, "loss": 1.0578, "step": 774 }, { "epoch": 0.37010506208213945, "grad_norm": 4.712649822235107, "learning_rate": 3.6344593835830965e-06, "loss": 1.6367, "step": 775 }, { "epoch": 0.37058261700095513, "grad_norm": 4.83684778213501, "learning_rate": 3.6310120523927706e-06, "loss": 2.0013, "step": 776 }, { "epoch": 0.37106017191977075, "grad_norm": 3.9546871185302734, "learning_rate": 3.627562015082483e-06, "loss": 1.1291, "step": 777 }, { "epoch": 0.37153772683858644, "grad_norm": 4.495453834533691, "learning_rate": 3.6241092799069767e-06, "loss": 1.6433, "step": 778 }, { "epoch": 0.3720152817574021, "grad_norm": 4.511261940002441, "learning_rate": 3.620653855127452e-06, "loss": 0.7325, "step": 779 }, { "epoch": 0.37249283667621774, "grad_norm": 4.048483848571777, "learning_rate": 3.617195749011543e-06, "loss": 2.2217, "step": 780 }, { "epoch": 0.3729703915950334, "grad_norm": 6.855978488922119, "learning_rate": 3.6137349698332974e-06, "loss": 1.0449, "step": 781 }, { "epoch": 0.3734479465138491, "grad_norm": 5.942322254180908, "learning_rate": 3.610271525873163e-06, "loss": 1.3303, "step": 782 }, { "epoch": 
0.37392550143266473, "grad_norm": 4.652106761932373, "learning_rate": 3.6068054254179604e-06, "loss": 1.0881, "step": 783 }, { "epoch": 0.3744030563514804, "grad_norm": 4.822751522064209, "learning_rate": 3.6033366767608668e-06, "loss": 1.0463, "step": 784 }, { "epoch": 0.3748806112702961, "grad_norm": 4.665088653564453, "learning_rate": 3.5998652882013964e-06, "loss": 1.5464, "step": 785 }, { "epoch": 0.3753581661891118, "grad_norm": 4.67440128326416, "learning_rate": 3.596391268045378e-06, "loss": 1.3615, "step": 786 }, { "epoch": 0.3758357211079274, "grad_norm": 5.123157024383545, "learning_rate": 3.5929146246049395e-06, "loss": 1.2911, "step": 787 }, { "epoch": 0.3763132760267431, "grad_norm": 5.263867378234863, "learning_rate": 3.5894353661984836e-06, "loss": 1.2675, "step": 788 }, { "epoch": 0.37679083094555876, "grad_norm": 3.739556312561035, "learning_rate": 3.5859535011506696e-06, "loss": 1.522, "step": 789 }, { "epoch": 0.3772683858643744, "grad_norm": 5.049498081207275, "learning_rate": 3.5824690377923953e-06, "loss": 1.4562, "step": 790 }, { "epoch": 0.37774594078319007, "grad_norm": 4.402045249938965, "learning_rate": 3.578981984460773e-06, "loss": 1.2728, "step": 791 }, { "epoch": 0.37822349570200575, "grad_norm": 4.970470905303955, "learning_rate": 3.575492349499115e-06, "loss": 1.5334, "step": 792 }, { "epoch": 0.3787010506208214, "grad_norm": 4.500193119049072, "learning_rate": 3.572000141256906e-06, "loss": 1.7622, "step": 793 }, { "epoch": 0.37917860553963706, "grad_norm": 4.652907371520996, "learning_rate": 3.568505368089792e-06, "loss": 1.4561, "step": 794 }, { "epoch": 0.37965616045845274, "grad_norm": 4.136557579040527, "learning_rate": 3.5650080383595544e-06, "loss": 0.9769, "step": 795 }, { "epoch": 0.38013371537726837, "grad_norm": 5.494387626647949, "learning_rate": 3.5615081604340905e-06, "loss": 1.6312, "step": 796 }, { "epoch": 0.38061127029608405, "grad_norm": 4.734730243682861, "learning_rate": 3.558005742687396e-06, "loss": 1.3403, 
"step": 797 }, { "epoch": 0.38108882521489973, "grad_norm": 3.2915220260620117, "learning_rate": 3.554500793499543e-06, "loss": 0.7546, "step": 798 }, { "epoch": 0.38156638013371535, "grad_norm": 5.203744411468506, "learning_rate": 3.5509933212566605e-06, "loss": 1.859, "step": 799 }, { "epoch": 0.38204393505253104, "grad_norm": 6.072694778442383, "learning_rate": 3.5474833343509146e-06, "loss": 1.3285, "step": 800 }, { "epoch": 0.3825214899713467, "grad_norm": 5.544677734375, "learning_rate": 3.5439708411804878e-06, "loss": 1.0271, "step": 801 }, { "epoch": 0.38299904489016234, "grad_norm": 4.890792369842529, "learning_rate": 3.5404558501495585e-06, "loss": 2.0042, "step": 802 }, { "epoch": 0.383476599808978, "grad_norm": 4.292891979217529, "learning_rate": 3.536938369668283e-06, "loss": 1.844, "step": 803 }, { "epoch": 0.3839541547277937, "grad_norm": 4.942714691162109, "learning_rate": 3.5334184081527728e-06, "loss": 1.9635, "step": 804 }, { "epoch": 0.3844317096466094, "grad_norm": 4.5008392333984375, "learning_rate": 3.529895974025077e-06, "loss": 1.3059, "step": 805 }, { "epoch": 0.384909264565425, "grad_norm": 5.06405782699585, "learning_rate": 3.52637107571316e-06, "loss": 1.1041, "step": 806 }, { "epoch": 0.3853868194842407, "grad_norm": 4.701263427734375, "learning_rate": 3.5228437216508805e-06, "loss": 0.6704, "step": 807 }, { "epoch": 0.3858643744030564, "grad_norm": 4.609885215759277, "learning_rate": 3.5193139202779763e-06, "loss": 0.965, "step": 808 }, { "epoch": 0.386341929321872, "grad_norm": 4.646522521972656, "learning_rate": 3.5157816800400383e-06, "loss": 1.9979, "step": 809 }, { "epoch": 0.3868194842406877, "grad_norm": 4.042009353637695, "learning_rate": 3.512247009388494e-06, "loss": 1.2349, "step": 810 }, { "epoch": 0.38729703915950336, "grad_norm": 4.078355312347412, "learning_rate": 3.508709916780585e-06, "loss": 0.9642, "step": 811 }, { "epoch": 0.387774594078319, "grad_norm": 6.360867977142334, "learning_rate": 3.5051704106793487e-06, 
"loss": 1.8818, "step": 812 }, { "epoch": 0.38825214899713467, "grad_norm": 5.9203782081604, "learning_rate": 3.5016284995535972e-06, "loss": 1.2544, "step": 813 }, { "epoch": 0.38872970391595035, "grad_norm": 4.275111198425293, "learning_rate": 3.4980841918778962e-06, "loss": 1.656, "step": 814 }, { "epoch": 0.389207258834766, "grad_norm": 4.479493141174316, "learning_rate": 3.4945374961325452e-06, "loss": 1.1082, "step": 815 }, { "epoch": 0.38968481375358166, "grad_norm": 4.1804118156433105, "learning_rate": 3.4909884208035594e-06, "loss": 1.7928, "step": 816 }, { "epoch": 0.39016236867239734, "grad_norm": 3.7209877967834473, "learning_rate": 3.487436974382646e-06, "loss": 1.0712, "step": 817 }, { "epoch": 0.39063992359121297, "grad_norm": 3.963365316390991, "learning_rate": 3.4838831653671854e-06, "loss": 0.6324, "step": 818 }, { "epoch": 0.39111747851002865, "grad_norm": 5.007797718048096, "learning_rate": 3.480327002260212e-06, "loss": 1.2627, "step": 819 }, { "epoch": 0.39159503342884433, "grad_norm": 4.087820053100586, "learning_rate": 3.4767684935703906e-06, "loss": 1.1878, "step": 820 }, { "epoch": 0.39207258834765996, "grad_norm": 4.321444988250732, "learning_rate": 3.473207647812001e-06, "loss": 1.5427, "step": 821 }, { "epoch": 0.39255014326647564, "grad_norm": 4.508161544799805, "learning_rate": 3.4696444735049117e-06, "loss": 1.2456, "step": 822 }, { "epoch": 0.3930276981852913, "grad_norm": 4.410786151885986, "learning_rate": 3.4660789791745665e-06, "loss": 1.5957, "step": 823 }, { "epoch": 0.39350525310410694, "grad_norm": 4.301436901092529, "learning_rate": 3.4625111733519562e-06, "loss": 2.3067, "step": 824 }, { "epoch": 0.3939828080229226, "grad_norm": 4.6565470695495605, "learning_rate": 3.458941064573604e-06, "loss": 0.9012, "step": 825 }, { "epoch": 0.3944603629417383, "grad_norm": 4.461031913757324, "learning_rate": 3.4553686613815436e-06, "loss": 1.5797, "step": 826 }, { "epoch": 0.394937917860554, "grad_norm": 4.102884769439697, 
"learning_rate": 3.4517939723232984e-06, "loss": 1.236, "step": 827 }, { "epoch": 0.3954154727793696, "grad_norm": 4.471217632293701, "learning_rate": 3.4482170059518605e-06, "loss": 1.536, "step": 828 }, { "epoch": 0.3958930276981853, "grad_norm": 4.185146808624268, "learning_rate": 3.444637770825671e-06, "loss": 0.9179, "step": 829 }, { "epoch": 0.396370582617001, "grad_norm": 5.280450820922852, "learning_rate": 3.4410562755085987e-06, "loss": 1.2902, "step": 830 }, { "epoch": 0.3968481375358166, "grad_norm": 4.707200050354004, "learning_rate": 3.437472528569922e-06, "loss": 2.0304, "step": 831 }, { "epoch": 0.3973256924546323, "grad_norm": 4.738354206085205, "learning_rate": 3.4338865385843044e-06, "loss": 0.7666, "step": 832 }, { "epoch": 0.39780324737344797, "grad_norm": 4.4541497230529785, "learning_rate": 3.4302983141317793e-06, "loss": 1.4424, "step": 833 }, { "epoch": 0.3982808022922636, "grad_norm": 4.620995998382568, "learning_rate": 3.4267078637977225e-06, "loss": 1.4093, "step": 834 }, { "epoch": 0.3987583572110793, "grad_norm": 5.041505813598633, "learning_rate": 3.423115196172839e-06, "loss": 1.0534, "step": 835 }, { "epoch": 0.39923591212989495, "grad_norm": 3.9645068645477295, "learning_rate": 3.4195203198531385e-06, "loss": 1.5864, "step": 836 }, { "epoch": 0.3997134670487106, "grad_norm": 4.01663875579834, "learning_rate": 3.415923243439912e-06, "loss": 1.2712, "step": 837 }, { "epoch": 0.40019102196752626, "grad_norm": 3.9850618839263916, "learning_rate": 3.4123239755397186e-06, "loss": 1.2365, "step": 838 }, { "epoch": 0.40066857688634194, "grad_norm": 4.2752814292907715, "learning_rate": 3.4087225247643583e-06, "loss": 2.2262, "step": 839 }, { "epoch": 0.40114613180515757, "grad_norm": 5.6303887367248535, "learning_rate": 3.4051188997308567e-06, "loss": 1.0119, "step": 840 }, { "epoch": 0.40162368672397325, "grad_norm": 5.094964504241943, "learning_rate": 3.4015131090614374e-06, "loss": 0.9961, "step": 841 }, { "epoch": 0.40210124164278893, 
"grad_norm": 3.832918882369995, "learning_rate": 3.39790516138351e-06, "loss": 2.1956, "step": 842 }, { "epoch": 0.40257879656160456, "grad_norm": 4.754687786102295, "learning_rate": 3.3942950653296424e-06, "loss": 1.5059, "step": 843 }, { "epoch": 0.40305635148042024, "grad_norm": 5.092840671539307, "learning_rate": 3.3906828295375443e-06, "loss": 1.5295, "step": 844 }, { "epoch": 0.4035339063992359, "grad_norm": 5.292074680328369, "learning_rate": 3.3870684626500443e-06, "loss": 1.6262, "step": 845 }, { "epoch": 0.4040114613180516, "grad_norm": 5.229471683502197, "learning_rate": 3.3834519733150696e-06, "loss": 1.2776, "step": 846 }, { "epoch": 0.4044890162368672, "grad_norm": 5.383548259735107, "learning_rate": 3.3798333701856255e-06, "loss": 1.0026, "step": 847 }, { "epoch": 0.4049665711556829, "grad_norm": 5.424675941467285, "learning_rate": 3.3762126619197766e-06, "loss": 1.1042, "step": 848 }, { "epoch": 0.4054441260744986, "grad_norm": 5.100111484527588, "learning_rate": 3.3725898571806226e-06, "loss": 1.4847, "step": 849 }, { "epoch": 0.4059216809933142, "grad_norm": 5.061873912811279, "learning_rate": 3.3689649646362804e-06, "loss": 1.123, "step": 850 }, { "epoch": 0.4063992359121299, "grad_norm": 3.763338327407837, "learning_rate": 3.365337992959862e-06, "loss": 1.7331, "step": 851 }, { "epoch": 0.4068767908309456, "grad_norm": 5.047019004821777, "learning_rate": 3.361708950829453e-06, "loss": 1.336, "step": 852 }, { "epoch": 0.4073543457497612, "grad_norm": 4.530722618103027, "learning_rate": 3.3580778469280944e-06, "loss": 1.6827, "step": 853 }, { "epoch": 0.4078319006685769, "grad_norm": 5.024570941925049, "learning_rate": 3.35444468994376e-06, "loss": 1.5718, "step": 854 }, { "epoch": 0.40830945558739257, "grad_norm": 4.829212188720703, "learning_rate": 3.3508094885693356e-06, "loss": 1.5533, "step": 855 }, { "epoch": 0.4087870105062082, "grad_norm": 5.577857971191406, "learning_rate": 3.3471722515025986e-06, "loss": 1.6316, "step": 856 }, { "epoch": 
0.4092645654250239, "grad_norm": 4.800250053405762, "learning_rate": 3.343532987446196e-06, "loss": 1.2196, "step": 857 }, { "epoch": 0.40974212034383956, "grad_norm": 4.280922889709473, "learning_rate": 3.3398917051076273e-06, "loss": 1.9726, "step": 858 }, { "epoch": 0.4102196752626552, "grad_norm": 4.265641689300537, "learning_rate": 3.336248413199218e-06, "loss": 0.9964, "step": 859 }, { "epoch": 0.41069723018147086, "grad_norm": 4.568741798400879, "learning_rate": 3.3326031204381045e-06, "loss": 2.0363, "step": 860 }, { "epoch": 0.41117478510028654, "grad_norm": 5.851785659790039, "learning_rate": 3.328955835546208e-06, "loss": 1.5475, "step": 861 }, { "epoch": 0.41165234001910217, "grad_norm": 5.2452192306518555, "learning_rate": 3.32530656725022e-06, "loss": 1.4747, "step": 862 }, { "epoch": 0.41212989493791785, "grad_norm": 4.717533111572266, "learning_rate": 3.321655324281572e-06, "loss": 1.2593, "step": 863 }, { "epoch": 0.41260744985673353, "grad_norm": 4.616949558258057, "learning_rate": 3.3180021153764252e-06, "loss": 1.1335, "step": 864 }, { "epoch": 0.4130850047755492, "grad_norm": 4.721005916595459, "learning_rate": 3.3143469492756424e-06, "loss": 1.186, "step": 865 }, { "epoch": 0.41356255969436484, "grad_norm": 6.272445201873779, "learning_rate": 3.3106898347247698e-06, "loss": 1.7199, "step": 866 }, { "epoch": 0.4140401146131805, "grad_norm": 4.7680840492248535, "learning_rate": 3.307030780474014e-06, "loss": 1.3949, "step": 867 }, { "epoch": 0.4145176695319962, "grad_norm": 4.987722873687744, "learning_rate": 3.3033697952782264e-06, "loss": 1.542, "step": 868 }, { "epoch": 0.41499522445081183, "grad_norm": 5.012249946594238, "learning_rate": 3.299706887896874e-06, "loss": 1.8123, "step": 869 }, { "epoch": 0.4154727793696275, "grad_norm": 4.5919671058654785, "learning_rate": 3.2960420670940263e-06, "loss": 1.8674, "step": 870 }, { "epoch": 0.4159503342884432, "grad_norm": 5.366579532623291, "learning_rate": 3.2923753416383285e-06, "loss": 1.9182, 
"step": 871 }, { "epoch": 0.4164278892072588, "grad_norm": 4.606401443481445, "learning_rate": 3.288706720302986e-06, "loss": 1.7531, "step": 872 }, { "epoch": 0.4169054441260745, "grad_norm": 4.419997215270996, "learning_rate": 3.2850362118657363e-06, "loss": 1.2687, "step": 873 }, { "epoch": 0.4173829990448902, "grad_norm": 4.576455116271973, "learning_rate": 3.2813638251088357e-06, "loss": 1.2781, "step": 874 }, { "epoch": 0.4178605539637058, "grad_norm": 5.139517784118652, "learning_rate": 3.277689568819033e-06, "loss": 1.5643, "step": 875 }, { "epoch": 0.4183381088825215, "grad_norm": 5.737284183502197, "learning_rate": 3.2740134517875506e-06, "loss": 1.5704, "step": 876 }, { "epoch": 0.41881566380133717, "grad_norm": 4.504410743713379, "learning_rate": 3.270335482810062e-06, "loss": 1.736, "step": 877 }, { "epoch": 0.4192932187201528, "grad_norm": 5.068437576293945, "learning_rate": 3.266655670686674e-06, "loss": 1.4603, "step": 878 }, { "epoch": 0.4197707736389685, "grad_norm": 4.658647537231445, "learning_rate": 3.2629740242219003e-06, "loss": 1.8604, "step": 879 }, { "epoch": 0.42024832855778416, "grad_norm": 3.6902029514312744, "learning_rate": 3.2592905522246474e-06, "loss": 1.5716, "step": 880 }, { "epoch": 0.4207258834765998, "grad_norm": 5.357398509979248, "learning_rate": 3.255605263508186e-06, "loss": 1.8521, "step": 881 }, { "epoch": 0.42120343839541546, "grad_norm": 5.447018623352051, "learning_rate": 3.2519181668901344e-06, "loss": 1.6402, "step": 882 }, { "epoch": 0.42168099331423115, "grad_norm": 4.039346218109131, "learning_rate": 3.248229271192439e-06, "loss": 1.5974, "step": 883 }, { "epoch": 0.4221585482330468, "grad_norm": 5.012446403503418, "learning_rate": 3.244538585241349e-06, "loss": 1.1596, "step": 884 }, { "epoch": 0.42263610315186245, "grad_norm": 4.107654094696045, "learning_rate": 3.2408461178673955e-06, "loss": 1.501, "step": 885 }, { "epoch": 0.42311365807067813, "grad_norm": 4.1825127601623535, "learning_rate": 
3.2371518779053744e-06, "loss": 0.7591, "step": 886 }, { "epoch": 0.4235912129894938, "grad_norm": 4.9862823486328125, "learning_rate": 3.2334558741943228e-06, "loss": 0.9753, "step": 887 }, { "epoch": 0.42406876790830944, "grad_norm": 4.711572170257568, "learning_rate": 3.2297581155774954e-06, "loss": 1.0498, "step": 888 }, { "epoch": 0.4245463228271251, "grad_norm": 5.950453758239746, "learning_rate": 3.2260586109023488e-06, "loss": 1.1949, "step": 889 }, { "epoch": 0.4250238777459408, "grad_norm": 4.926224231719971, "learning_rate": 3.2223573690205152e-06, "loss": 1.3587, "step": 890 }, { "epoch": 0.42550143266475643, "grad_norm": 4.946518898010254, "learning_rate": 3.218654398787783e-06, "loss": 1.5758, "step": 891 }, { "epoch": 0.4259789875835721, "grad_norm": 5.525102138519287, "learning_rate": 3.214949709064079e-06, "loss": 1.7254, "step": 892 }, { "epoch": 0.4264565425023878, "grad_norm": 4.334508895874023, "learning_rate": 3.211243308713441e-06, "loss": 1.3011, "step": 893 }, { "epoch": 0.4269340974212034, "grad_norm": 5.208389759063721, "learning_rate": 3.2075352066040004e-06, "loss": 1.7063, "step": 894 }, { "epoch": 0.4274116523400191, "grad_norm": 4.758481979370117, "learning_rate": 3.2038254116079614e-06, "loss": 1.6682, "step": 895 }, { "epoch": 0.4278892072588348, "grad_norm": 4.089632511138916, "learning_rate": 3.2001139326015774e-06, "loss": 1.6547, "step": 896 }, { "epoch": 0.4283667621776504, "grad_norm": 4.826939582824707, "learning_rate": 3.1964007784651326e-06, "loss": 1.1195, "step": 897 }, { "epoch": 0.4288443170964661, "grad_norm": 5.0969557762146, "learning_rate": 3.1926859580829174e-06, "loss": 1.9893, "step": 898 }, { "epoch": 0.42932187201528177, "grad_norm": 4.105995178222656, "learning_rate": 3.1889694803432103e-06, "loss": 1.8642, "step": 899 }, { "epoch": 0.4297994269340974, "grad_norm": 3.9060518741607666, "learning_rate": 3.185251354138255e-06, "loss": 0.6646, "step": 900 }, { "epoch": 0.4302769818529131, "grad_norm": 
4.098819732666016, "learning_rate": 3.1815315883642388e-06, "loss": 1.6466, "step": 901 }, { "epoch": 0.43075453677172876, "grad_norm": 3.791785478591919, "learning_rate": 3.1778101919212734e-06, "loss": 0.8055, "step": 902 }, { "epoch": 0.43123209169054444, "grad_norm": 7.253170967102051, "learning_rate": 3.1740871737133692e-06, "loss": 0.9093, "step": 903 }, { "epoch": 0.43170964660936006, "grad_norm": 5.192101955413818, "learning_rate": 3.1703625426484207e-06, "loss": 2.0191, "step": 904 }, { "epoch": 0.43218720152817575, "grad_norm": 3.968888759613037, "learning_rate": 3.166636307638178e-06, "loss": 1.6076, "step": 905 }, { "epoch": 0.4326647564469914, "grad_norm": 4.7444071769714355, "learning_rate": 3.162908477598232e-06, "loss": 1.6173, "step": 906 }, { "epoch": 0.43314231136580705, "grad_norm": 4.130309581756592, "learning_rate": 3.1591790614479863e-06, "loss": 1.4767, "step": 907 }, { "epoch": 0.43361986628462273, "grad_norm": 4.259106159210205, "learning_rate": 3.1554480681106426e-06, "loss": 1.6746, "step": 908 }, { "epoch": 0.4340974212034384, "grad_norm": 4.322956085205078, "learning_rate": 3.1517155065131753e-06, "loss": 1.3819, "step": 909 }, { "epoch": 0.43457497612225404, "grad_norm": 4.635177135467529, "learning_rate": 3.147981385586311e-06, "loss": 1.8283, "step": 910 }, { "epoch": 0.4350525310410697, "grad_norm": 4.198387145996094, "learning_rate": 3.1442457142645084e-06, "loss": 1.1716, "step": 911 }, { "epoch": 0.4355300859598854, "grad_norm": 4.538154602050781, "learning_rate": 3.1405085014859327e-06, "loss": 1.3413, "step": 912 }, { "epoch": 0.43600764087870103, "grad_norm": 5.3409504890441895, "learning_rate": 3.1367697561924393e-06, "loss": 1.852, "step": 913 }, { "epoch": 0.4364851957975167, "grad_norm": 3.523350954055786, "learning_rate": 3.1330294873295515e-06, "loss": 0.8168, "step": 914 }, { "epoch": 0.4369627507163324, "grad_norm": 4.669760227203369, "learning_rate": 3.129287703846436e-06, "loss": 1.7506, "step": 915 }, { "epoch": 
0.437440305635148, "grad_norm": 4.401864528656006, "learning_rate": 3.1255444146958845e-06, "loss": 1.4909, "step": 916 }, { "epoch": 0.4379178605539637, "grad_norm": 5.521142959594727, "learning_rate": 3.1217996288342913e-06, "loss": 1.6165, "step": 917 }, { "epoch": 0.4383954154727794, "grad_norm": 4.72084379196167, "learning_rate": 3.1180533552216297e-06, "loss": 1.1335, "step": 918 }, { "epoch": 0.438872970391595, "grad_norm": 5.006590843200684, "learning_rate": 3.1143056028214362e-06, "loss": 0.9284, "step": 919 }, { "epoch": 0.4393505253104107, "grad_norm": 4.51569128036499, "learning_rate": 3.1105563806007836e-06, "loss": 1.1248, "step": 920 }, { "epoch": 0.43982808022922637, "grad_norm": 4.409367561340332, "learning_rate": 3.1068056975302607e-06, "loss": 0.9386, "step": 921 }, { "epoch": 0.44030563514804205, "grad_norm": 5.646852016448975, "learning_rate": 3.1030535625839537e-06, "loss": 1.2873, "step": 922 }, { "epoch": 0.4407831900668577, "grad_norm": 4.158214092254639, "learning_rate": 3.09929998473942e-06, "loss": 1.4987, "step": 923 }, { "epoch": 0.44126074498567336, "grad_norm": 6.406368732452393, "learning_rate": 3.0955449729776733e-06, "loss": 1.7251, "step": 924 }, { "epoch": 0.44173829990448904, "grad_norm": 4.508574962615967, "learning_rate": 3.0917885362831522e-06, "loss": 2.1424, "step": 925 }, { "epoch": 0.44221585482330467, "grad_norm": 5.70652961730957, "learning_rate": 3.088030683643711e-06, "loss": 1.8378, "step": 926 }, { "epoch": 0.44269340974212035, "grad_norm": 4.864703178405762, "learning_rate": 3.0842714240505877e-06, "loss": 1.3367, "step": 927 }, { "epoch": 0.44317096466093603, "grad_norm": 6.033452033996582, "learning_rate": 3.080510766498389e-06, "loss": 1.6366, "step": 928 }, { "epoch": 0.44364851957975165, "grad_norm": 4.751212120056152, "learning_rate": 3.0767487199850637e-06, "loss": 0.9304, "step": 929 }, { "epoch": 0.44412607449856734, "grad_norm": 4.546492576599121, "learning_rate": 3.072985293511887e-06, "loss": 0.8254, 
"step": 930 }, { "epoch": 0.444603629417383, "grad_norm": 4.456762313842773, "learning_rate": 3.0692204960834344e-06, "loss": 0.7925, "step": 931 }, { "epoch": 0.44508118433619864, "grad_norm": 5.315690040588379, "learning_rate": 3.065454336707561e-06, "loss": 1.0586, "step": 932 }, { "epoch": 0.4455587392550143, "grad_norm": 5.705597400665283, "learning_rate": 3.061686824395382e-06, "loss": 1.5507, "step": 933 }, { "epoch": 0.44603629417383, "grad_norm": 4.943583965301514, "learning_rate": 3.0579179681612488e-06, "loss": 1.2574, "step": 934 }, { "epoch": 0.44651384909264563, "grad_norm": 5.637134075164795, "learning_rate": 3.054147777022728e-06, "loss": 1.8403, "step": 935 }, { "epoch": 0.4469914040114613, "grad_norm": 4.12382173538208, "learning_rate": 3.050376260000581e-06, "loss": 1.6316, "step": 936 }, { "epoch": 0.447468958930277, "grad_norm": 6.621447563171387, "learning_rate": 3.046603426118741e-06, "loss": 1.3236, "step": 937 }, { "epoch": 0.4479465138490926, "grad_norm": 5.565084934234619, "learning_rate": 3.0428292844042933e-06, "loss": 1.3567, "step": 938 }, { "epoch": 0.4484240687679083, "grad_norm": 4.276286602020264, "learning_rate": 3.03905384388745e-06, "loss": 1.3226, "step": 939 }, { "epoch": 0.448901623686724, "grad_norm": 5.081269264221191, "learning_rate": 3.0352771136015326e-06, "loss": 1.6679, "step": 940 }, { "epoch": 0.44937917860553966, "grad_norm": 4.747688293457031, "learning_rate": 3.031499102582949e-06, "loss": 2.5894, "step": 941 }, { "epoch": 0.4498567335243553, "grad_norm": 5.120903968811035, "learning_rate": 3.027719819871169e-06, "loss": 1.5493, "step": 942 }, { "epoch": 0.45033428844317097, "grad_norm": 5.915216445922852, "learning_rate": 3.0239392745087076e-06, "loss": 1.3919, "step": 943 }, { "epoch": 0.45081184336198665, "grad_norm": 4.707735061645508, "learning_rate": 3.0201574755410994e-06, "loss": 1.2639, "step": 944 }, { "epoch": 0.4512893982808023, "grad_norm": 4.528112888336182, "learning_rate": 3.0163744320168797e-06, 
"loss": 1.3542, "step": 945 }, { "epoch": 0.45176695319961796, "grad_norm": 4.710062026977539, "learning_rate": 3.0125901529875612e-06, "loss": 1.2432, "step": 946 }, { "epoch": 0.45224450811843364, "grad_norm": 3.8418147563934326, "learning_rate": 3.0088046475076107e-06, "loss": 1.1821, "step": 947 }, { "epoch": 0.45272206303724927, "grad_norm": 5.175236701965332, "learning_rate": 3.0050179246344326e-06, "loss": 1.5989, "step": 948 }, { "epoch": 0.45319961795606495, "grad_norm": 4.053905010223389, "learning_rate": 3.0012299934283425e-06, "loss": 1.319, "step": 949 }, { "epoch": 0.45367717287488063, "grad_norm": 5.412647724151611, "learning_rate": 2.997440862952547e-06, "loss": 1.7205, "step": 950 }, { "epoch": 0.45415472779369626, "grad_norm": 4.9282450675964355, "learning_rate": 2.993650542273122e-06, "loss": 1.1365, "step": 951 }, { "epoch": 0.45463228271251194, "grad_norm": 5.109230041503906, "learning_rate": 2.9898590404589928e-06, "loss": 1.1473, "step": 952 }, { "epoch": 0.4551098376313276, "grad_norm": 6.320009708404541, "learning_rate": 2.986066366581909e-06, "loss": 1.5633, "step": 953 }, { "epoch": 0.45558739255014324, "grad_norm": 4.73321008682251, "learning_rate": 2.9822725297164247e-06, "loss": 1.2317, "step": 954 }, { "epoch": 0.4560649474689589, "grad_norm": 4.870784759521484, "learning_rate": 2.9784775389398775e-06, "loss": 0.9768, "step": 955 }, { "epoch": 0.4565425023877746, "grad_norm": 4.323260307312012, "learning_rate": 2.974681403332365e-06, "loss": 1.0156, "step": 956 }, { "epoch": 0.45702005730659023, "grad_norm": 4.36177396774292, "learning_rate": 2.9708841319767246e-06, "loss": 1.7319, "step": 957 }, { "epoch": 0.4574976122254059, "grad_norm": 4.102828502655029, "learning_rate": 2.9670857339585107e-06, "loss": 1.8335, "step": 958 }, { "epoch": 0.4579751671442216, "grad_norm": 4.595668315887451, "learning_rate": 2.9632862183659744e-06, "loss": 1.0344, "step": 959 }, { "epoch": 0.4584527220630373, "grad_norm": 4.940067291259766, 
"learning_rate": 2.9594855942900396e-06, "loss": 1.1255, "step": 960 }, { "epoch": 0.4589302769818529, "grad_norm": 4.949899196624756, "learning_rate": 2.9556838708242826e-06, "loss": 1.4373, "step": 961 }, { "epoch": 0.4594078319006686, "grad_norm": 5.07224178314209, "learning_rate": 2.9518810570649105e-06, "loss": 1.4672, "step": 962 }, { "epoch": 0.45988538681948427, "grad_norm": 4.689038276672363, "learning_rate": 2.9480771621107395e-06, "loss": 0.8665, "step": 963 }, { "epoch": 0.4603629417382999, "grad_norm": 3.6448285579681396, "learning_rate": 2.944272195063172e-06, "loss": 0.7276, "step": 964 }, { "epoch": 0.4608404966571156, "grad_norm": 5.058627128601074, "learning_rate": 2.940466165026175e-06, "loss": 1.1204, "step": 965 }, { "epoch": 0.46131805157593125, "grad_norm": 6.425746440887451, "learning_rate": 2.9366590811062606e-06, "loss": 1.6774, "step": 966 }, { "epoch": 0.4617956064947469, "grad_norm": 4.3713555335998535, "learning_rate": 2.9328509524124617e-06, "loss": 1.4857, "step": 967 }, { "epoch": 0.46227316141356256, "grad_norm": 6.084547996520996, "learning_rate": 2.9290417880563097e-06, "loss": 1.7039, "step": 968 }, { "epoch": 0.46275071633237824, "grad_norm": 5.24597692489624, "learning_rate": 2.9252315971518164e-06, "loss": 0.9104, "step": 969 }, { "epoch": 0.46322827125119387, "grad_norm": 4.375820636749268, "learning_rate": 2.921420388815447e-06, "loss": 1.8744, "step": 970 }, { "epoch": 0.46370582617000955, "grad_norm": 4.414572238922119, "learning_rate": 2.917608172166104e-06, "loss": 1.3884, "step": 971 }, { "epoch": 0.46418338108882523, "grad_norm": 4.380600929260254, "learning_rate": 2.9137949563251e-06, "loss": 1.5092, "step": 972 }, { "epoch": 0.46466093600764086, "grad_norm": 5.010895729064941, "learning_rate": 2.909980750416139e-06, "loss": 2.0463, "step": 973 }, { "epoch": 0.46513849092645654, "grad_norm": 4.654458045959473, "learning_rate": 2.9061655635652953e-06, "loss": 2.2859, "step": 974 }, { "epoch": 0.4656160458452722, 
"grad_norm": 6.0710554122924805, "learning_rate": 2.9023494049009883e-06, "loss": 1.7849, "step": 975 }, { "epoch": 0.46609360076408785, "grad_norm": 4.992781162261963, "learning_rate": 2.898532283553963e-06, "loss": 1.2476, "step": 976 }, { "epoch": 0.4665711556829035, "grad_norm": 5.010497570037842, "learning_rate": 2.89471420865727e-06, "loss": 1.0241, "step": 977 }, { "epoch": 0.4670487106017192, "grad_norm": 6.075325012207031, "learning_rate": 2.890895189346238e-06, "loss": 1.1073, "step": 978 }, { "epoch": 0.46752626552053483, "grad_norm": 6.099752902984619, "learning_rate": 2.8870752347584575e-06, "loss": 1.3662, "step": 979 }, { "epoch": 0.4680038204393505, "grad_norm": 3.921536445617676, "learning_rate": 2.883254354033756e-06, "loss": 1.3209, "step": 980 }, { "epoch": 0.4684813753581662, "grad_norm": 5.05742073059082, "learning_rate": 2.879432556314178e-06, "loss": 0.9288, "step": 981 }, { "epoch": 0.4689589302769819, "grad_norm": 4.801120758056641, "learning_rate": 2.8756098507439605e-06, "loss": 1.0922, "step": 982 }, { "epoch": 0.4694364851957975, "grad_norm": 4.358757495880127, "learning_rate": 2.871786246469513e-06, "loss": 1.6015, "step": 983 }, { "epoch": 0.4699140401146132, "grad_norm": 4.864490985870361, "learning_rate": 2.8679617526393964e-06, "loss": 1.4404, "step": 984 }, { "epoch": 0.47039159503342887, "grad_norm": 4.748241901397705, "learning_rate": 2.8641363784042997e-06, "loss": 1.7622, "step": 985 }, { "epoch": 0.4708691499522445, "grad_norm": 4.356468200683594, "learning_rate": 2.8603101329170172e-06, "loss": 1.5835, "step": 986 }, { "epoch": 0.4713467048710602, "grad_norm": 4.282278060913086, "learning_rate": 2.8564830253324293e-06, "loss": 0.8588, "step": 987 }, { "epoch": 0.47182425978987586, "grad_norm": 4.269670009613037, "learning_rate": 2.8526550648074776e-06, "loss": 1.4547, "step": 988 }, { "epoch": 0.4723018147086915, "grad_norm": 4.8129563331604, "learning_rate": 2.848826260501146e-06, "loss": 1.3983, "step": 989 }, { "epoch": 
0.47277936962750716, "grad_norm": 4.15216588973999, "learning_rate": 2.8449966215744364e-06, "loss": 0.868, "step": 990 }, { "epoch": 0.47325692454632284, "grad_norm": 4.738534450531006, "learning_rate": 2.8411661571903477e-06, "loss": 1.8281, "step": 991 }, { "epoch": 0.47373447946513847, "grad_norm": 3.8670248985290527, "learning_rate": 2.8373348765138536e-06, "loss": 2.0575, "step": 992 }, { "epoch": 0.47421203438395415, "grad_norm": 4.588948726654053, "learning_rate": 2.833502788711882e-06, "loss": 2.024, "step": 993 }, { "epoch": 0.47468958930276983, "grad_norm": 4.513068675994873, "learning_rate": 2.8296699029532916e-06, "loss": 1.564, "step": 994 }, { "epoch": 0.47516714422158546, "grad_norm": 4.971329212188721, "learning_rate": 2.8258362284088482e-06, "loss": 2.1918, "step": 995 }, { "epoch": 0.47564469914040114, "grad_norm": 4.775160789489746, "learning_rate": 2.8220017742512085e-06, "loss": 1.2999, "step": 996 }, { "epoch": 0.4761222540592168, "grad_norm": 4.087936878204346, "learning_rate": 2.8181665496548916e-06, "loss": 0.8169, "step": 997 }, { "epoch": 0.47659980897803245, "grad_norm": 4.190609455108643, "learning_rate": 2.8143305637962613e-06, "loss": 1.0541, "step": 998 }, { "epoch": 0.47707736389684813, "grad_norm": 4.592844009399414, "learning_rate": 2.8104938258535034e-06, "loss": 1.6884, "step": 999 }, { "epoch": 0.4775549188156638, "grad_norm": 4.567158222198486, "learning_rate": 2.806656345006602e-06, "loss": 1.4927, "step": 1000 }, { "epoch": 0.4780324737344795, "grad_norm": 4.0273284912109375, "learning_rate": 2.8028181304373184e-06, "loss": 1.2577, "step": 1001 }, { "epoch": 0.4785100286532951, "grad_norm": 4.077206611633301, "learning_rate": 2.798979191329171e-06, "loss": 1.0689, "step": 1002 }, { "epoch": 0.4789875835721108, "grad_norm": 4.8535637855529785, "learning_rate": 2.7951395368674113e-06, "loss": 0.909, "step": 1003 }, { "epoch": 0.4794651384909265, "grad_norm": 5.24513053894043, "learning_rate": 2.7912991762390008e-06, "loss": 
1.5262, "step": 1004 }, { "epoch": 0.4799426934097421, "grad_norm": 5.03757905960083, "learning_rate": 2.787458118632593e-06, "loss": 1.9559, "step": 1005 }, { "epoch": 0.4804202483285578, "grad_norm": 3.946176052093506, "learning_rate": 2.783616373238507e-06, "loss": 1.2584, "step": 1006 }, { "epoch": 0.48089780324737347, "grad_norm": 5.019138336181641, "learning_rate": 2.7797739492487095e-06, "loss": 1.4913, "step": 1007 }, { "epoch": 0.4813753581661891, "grad_norm": 4.4684343338012695, "learning_rate": 2.775930855856789e-06, "loss": 1.3, "step": 1008 }, { "epoch": 0.4818529130850048, "grad_norm": 4.79974889755249, "learning_rate": 2.772087102257936e-06, "loss": 1.4015, "step": 1009 }, { "epoch": 0.48233046800382046, "grad_norm": 5.147920608520508, "learning_rate": 2.768242697648922e-06, "loss": 1.1726, "step": 1010 }, { "epoch": 0.4828080229226361, "grad_norm": 6.019247531890869, "learning_rate": 2.764397651228074e-06, "loss": 1.2238, "step": 1011 }, { "epoch": 0.48328557784145176, "grad_norm": 4.452391147613525, "learning_rate": 2.7605519721952578e-06, "loss": 1.4037, "step": 1012 }, { "epoch": 0.48376313276026744, "grad_norm": 3.3110523223876953, "learning_rate": 2.7567056697518492e-06, "loss": 0.8822, "step": 1013 }, { "epoch": 0.48424068767908307, "grad_norm": 4.637045860290527, "learning_rate": 2.7528587531007174e-06, "loss": 1.7008, "step": 1014 }, { "epoch": 0.48471824259789875, "grad_norm": 4.88334321975708, "learning_rate": 2.7490112314462013e-06, "loss": 1.8316, "step": 1015 }, { "epoch": 0.48519579751671443, "grad_norm": 4.124886512756348, "learning_rate": 2.7451631139940875e-06, "loss": 0.8008, "step": 1016 }, { "epoch": 0.48567335243553006, "grad_norm": 5.581754684448242, "learning_rate": 2.7413144099515877e-06, "loss": 1.0915, "step": 1017 }, { "epoch": 0.48615090735434574, "grad_norm": 4.317714691162109, "learning_rate": 2.7374651285273163e-06, "loss": 1.3004, "step": 1018 }, { "epoch": 0.4866284622731614, "grad_norm": 3.7460813522338867, 
"learning_rate": 2.7336152789312705e-06, "loss": 0.9708, "step": 1019 }, { "epoch": 0.4871060171919771, "grad_norm": 4.116492748260498, "learning_rate": 2.7297648703748065e-06, "loss": 1.6836, "step": 1020 }, { "epoch": 0.48758357211079273, "grad_norm": 3.742171049118042, "learning_rate": 2.725913912070619e-06, "loss": 1.2743, "step": 1021 }, { "epoch": 0.4880611270296084, "grad_norm": 4.056023597717285, "learning_rate": 2.722062413232715e-06, "loss": 1.0621, "step": 1022 }, { "epoch": 0.4885386819484241, "grad_norm": 4.263254642486572, "learning_rate": 2.7182103830763974e-06, "loss": 1.8079, "step": 1023 }, { "epoch": 0.4890162368672397, "grad_norm": 5.910298824310303, "learning_rate": 2.71435783081824e-06, "loss": 1.6519, "step": 1024 }, { "epoch": 0.4894937917860554, "grad_norm": 4.179514408111572, "learning_rate": 2.7105047656760657e-06, "loss": 1.3482, "step": 1025 }, { "epoch": 0.4899713467048711, "grad_norm": 5.1134161949157715, "learning_rate": 2.7066511968689235e-06, "loss": 1.222, "step": 1026 }, { "epoch": 0.4904489016236867, "grad_norm": 5.077592849731445, "learning_rate": 2.702797133617069e-06, "loss": 1.6934, "step": 1027 }, { "epoch": 0.4909264565425024, "grad_norm": 4.779074668884277, "learning_rate": 2.6989425851419398e-06, "loss": 1.0082, "step": 1028 }, { "epoch": 0.49140401146131807, "grad_norm": 4.059503555297852, "learning_rate": 2.6950875606661364e-06, "loss": 1.1368, "step": 1029 }, { "epoch": 0.4918815663801337, "grad_norm": 4.298985481262207, "learning_rate": 2.6912320694133944e-06, "loss": 0.8493, "step": 1030 }, { "epoch": 0.4923591212989494, "grad_norm": 5.132499694824219, "learning_rate": 2.687376120608569e-06, "loss": 1.6171, "step": 1031 }, { "epoch": 0.49283667621776506, "grad_norm": 4.960550785064697, "learning_rate": 2.6835197234776113e-06, "loss": 1.5277, "step": 1032 }, { "epoch": 0.4933142311365807, "grad_norm": 5.0114006996154785, "learning_rate": 2.6796628872475412e-06, "loss": 1.5599, "step": 1033 }, { "epoch": 
0.49379178605539636, "grad_norm": 4.234452247619629, "learning_rate": 2.675805621146433e-06, "loss": 1.6473, "step": 1034 }, { "epoch": 0.49426934097421205, "grad_norm": 5.055663585662842, "learning_rate": 2.671947934403388e-06, "loss": 1.1382, "step": 1035 }, { "epoch": 0.49474689589302767, "grad_norm": 3.8857522010803223, "learning_rate": 2.6680898362485126e-06, "loss": 1.3692, "step": 1036 }, { "epoch": 0.49522445081184335, "grad_norm": 4.141610622406006, "learning_rate": 2.6642313359129e-06, "loss": 1.5766, "step": 1037 }, { "epoch": 0.49570200573065903, "grad_norm": 4.909717082977295, "learning_rate": 2.660372442628605e-06, "loss": 0.7883, "step": 1038 }, { "epoch": 0.4961795606494747, "grad_norm": 5.041119575500488, "learning_rate": 2.656513165628621e-06, "loss": 2.0262, "step": 1039 }, { "epoch": 0.49665711556829034, "grad_norm": 5.168432235717773, "learning_rate": 2.6526535141468613e-06, "loss": 1.3321, "step": 1040 }, { "epoch": 0.497134670487106, "grad_norm": 4.450003623962402, "learning_rate": 2.648793497418134e-06, "loss": 1.2382, "step": 1041 }, { "epoch": 0.4976122254059217, "grad_norm": 4.760186672210693, "learning_rate": 2.6449331246781225e-06, "loss": 0.935, "step": 1042 }, { "epoch": 0.49808978032473733, "grad_norm": 4.0534515380859375, "learning_rate": 2.641072405163362e-06, "loss": 0.7648, "step": 1043 }, { "epoch": 0.498567335243553, "grad_norm": 4.486025333404541, "learning_rate": 2.6372113481112137e-06, "loss": 1.4443, "step": 1044 }, { "epoch": 0.4990448901623687, "grad_norm": 5.917057991027832, "learning_rate": 2.6333499627598525e-06, "loss": 1.7969, "step": 1045 }, { "epoch": 0.4995224450811843, "grad_norm": 4.415359973907471, "learning_rate": 2.6294882583482334e-06, "loss": 1.0343, "step": 1046 }, { "epoch": 0.5, "grad_norm": 4.28107213973999, "learning_rate": 2.6256262441160785e-06, "loss": 1.2794, "step": 1047 }, { "epoch": 0.5004775549188156, "grad_norm": 4.585839748382568, "learning_rate": 2.6217639293038485e-06, "loss": 0.8883, 
"step": 1048 }, { "epoch": 0.5009551098376314, "grad_norm": 5.603163242340088, "learning_rate": 2.617901323152725e-06, "loss": 1.123, "step": 1049 }, { "epoch": 0.501432664756447, "grad_norm": 5.3175177574157715, "learning_rate": 2.6140384349045865e-06, "loss": 1.8189, "step": 1050 }, { "epoch": 0.5019102196752626, "grad_norm": 5.157625198364258, "learning_rate": 2.610175273801987e-06, "loss": 1.753, "step": 1051 }, { "epoch": 0.5023877745940784, "grad_norm": 4.299335479736328, "learning_rate": 2.6063118490881304e-06, "loss": 1.9252, "step": 1052 }, { "epoch": 0.502865329512894, "grad_norm": 4.148587226867676, "learning_rate": 2.602448170006855e-06, "loss": 1.1964, "step": 1053 }, { "epoch": 0.5033428844317096, "grad_norm": 4.7903265953063965, "learning_rate": 2.598584245802605e-06, "loss": 1.8395, "step": 1054 }, { "epoch": 0.5038204393505253, "grad_norm": 5.174732208251953, "learning_rate": 2.594720085720413e-06, "loss": 1.1904, "step": 1055 }, { "epoch": 0.504297994269341, "grad_norm": 4.260627746582031, "learning_rate": 2.590855699005876e-06, "loss": 1.4144, "step": 1056 }, { "epoch": 0.5047755491881566, "grad_norm": 4.403366565704346, "learning_rate": 2.58699109490513e-06, "loss": 1.4664, "step": 1057 }, { "epoch": 0.5052531041069723, "grad_norm": 5.070522308349609, "learning_rate": 2.5831262826648363e-06, "loss": 1.6618, "step": 1058 }, { "epoch": 0.505730659025788, "grad_norm": 4.575981140136719, "learning_rate": 2.579261271532149e-06, "loss": 1.6929, "step": 1059 }, { "epoch": 0.5062082139446036, "grad_norm": 4.91890287399292, "learning_rate": 2.5753960707547033e-06, "loss": 1.3768, "step": 1060 }, { "epoch": 0.5066857688634193, "grad_norm": 4.586157321929932, "learning_rate": 2.571530689580583e-06, "loss": 1.3948, "step": 1061 }, { "epoch": 0.5071633237822349, "grad_norm": 5.028755187988281, "learning_rate": 2.5676651372583066e-06, "loss": 1.5718, "step": 1062 }, { "epoch": 0.5076408787010506, "grad_norm": 5.1034440994262695, "learning_rate": 
2.5637994230368024e-06, "loss": 1.4729, "step": 1063 }, { "epoch": 0.5081184336198663, "grad_norm": 4.846216678619385, "learning_rate": 2.5599335561653845e-06, "loss": 1.6073, "step": 1064 }, { "epoch": 0.5085959885386819, "grad_norm": 4.283311367034912, "learning_rate": 2.5560675458937323e-06, "loss": 0.7225, "step": 1065 }, { "epoch": 0.5090735434574976, "grad_norm": 5.427924633026123, "learning_rate": 2.55220140147187e-06, "loss": 1.2206, "step": 1066 }, { "epoch": 0.5095510983763133, "grad_norm": 5.178221702575684, "learning_rate": 2.5483351321501406e-06, "loss": 1.3681, "step": 1067 }, { "epoch": 0.5100286532951289, "grad_norm": 4.720668315887451, "learning_rate": 2.5444687471791866e-06, "loss": 0.9615, "step": 1068 }, { "epoch": 0.5105062082139447, "grad_norm": 5.0654449462890625, "learning_rate": 2.5406022558099296e-06, "loss": 1.0877, "step": 1069 }, { "epoch": 0.5109837631327603, "grad_norm": 4.396979331970215, "learning_rate": 2.536735667293542e-06, "loss": 1.3921, "step": 1070 }, { "epoch": 0.5114613180515759, "grad_norm": 5.217238903045654, "learning_rate": 2.532868990881431e-06, "loss": 1.5044, "step": 1071 }, { "epoch": 0.5119388729703916, "grad_norm": 4.324419021606445, "learning_rate": 2.529002235825213e-06, "loss": 1.0591, "step": 1072 }, { "epoch": 0.5124164278892073, "grad_norm": 4.945492267608643, "learning_rate": 2.525135411376695e-06, "loss": 1.7044, "step": 1073 }, { "epoch": 0.5128939828080229, "grad_norm": 5.012969017028809, "learning_rate": 2.5212685267878455e-06, "loss": 1.4327, "step": 1074 }, { "epoch": 0.5133715377268386, "grad_norm": 4.533985614776611, "learning_rate": 2.517401591310781e-06, "loss": 1.8558, "step": 1075 }, { "epoch": 0.5138490926456543, "grad_norm": 5.153561592102051, "learning_rate": 2.5135346141977374e-06, "loss": 1.3852, "step": 1076 }, { "epoch": 0.5143266475644699, "grad_norm": 4.855926036834717, "learning_rate": 2.5096676047010516e-06, "loss": 1.1959, "step": 1077 }, { "epoch": 0.5148042024832856, "grad_norm": 
3.8854408264160156, "learning_rate": 2.505800572073138e-06, "loss": 1.4409, "step": 1078 }, { "epoch": 0.5152817574021012, "grad_norm": 3.5710055828094482, "learning_rate": 2.5019335255664652e-06, "loss": 1.6838, "step": 1079 }, { "epoch": 0.5157593123209169, "grad_norm": 4.385650157928467, "learning_rate": 2.4980664744335348e-06, "loss": 2.0207, "step": 1080 }, { "epoch": 0.5162368672397326, "grad_norm": 4.240507125854492, "learning_rate": 2.4941994279268623e-06, "loss": 0.8972, "step": 1081 }, { "epoch": 0.5167144221585482, "grad_norm": 4.353018760681152, "learning_rate": 2.4903323952989484e-06, "loss": 0.9939, "step": 1082 }, { "epoch": 0.5171919770773639, "grad_norm": 5.4251604080200195, "learning_rate": 2.486465385802263e-06, "loss": 1.307, "step": 1083 }, { "epoch": 0.5176695319961796, "grad_norm": 4.621923923492432, "learning_rate": 2.482598408689219e-06, "loss": 1.929, "step": 1084 }, { "epoch": 0.5181470869149952, "grad_norm": 4.586228847503662, "learning_rate": 2.478731473212155e-06, "loss": 1.5948, "step": 1085 }, { "epoch": 0.5186246418338109, "grad_norm": 4.603383541107178, "learning_rate": 2.474864588623306e-06, "loss": 1.4012, "step": 1086 }, { "epoch": 0.5191021967526266, "grad_norm": 4.191261291503906, "learning_rate": 2.4709977641747866e-06, "loss": 1.3062, "step": 1087 }, { "epoch": 0.5195797516714422, "grad_norm": 4.421393871307373, "learning_rate": 2.467131009118569e-06, "loss": 1.7982, "step": 1088 }, { "epoch": 0.5200573065902578, "grad_norm": 3.933893918991089, "learning_rate": 2.463264332706458e-06, "loss": 0.7916, "step": 1089 }, { "epoch": 0.5205348615090736, "grad_norm": 5.0850067138671875, "learning_rate": 2.4593977441900708e-06, "loss": 1.2468, "step": 1090 }, { "epoch": 0.5210124164278892, "grad_norm": 5.184426784515381, "learning_rate": 2.455531252820814e-06, "loss": 1.0435, "step": 1091 }, { "epoch": 0.5214899713467048, "grad_norm": 4.359632968902588, "learning_rate": 2.4516648678498607e-06, "loss": 1.0088, "step": 1092 }, { 
"epoch": 0.5219675262655206, "grad_norm": 4.951732158660889, "learning_rate": 2.4477985985281313e-06, "loss": 1.4142, "step": 1093 }, { "epoch": 0.5224450811843362, "grad_norm": 5.167581558227539, "learning_rate": 2.443932454106268e-06, "loss": 1.1492, "step": 1094 }, { "epoch": 0.5229226361031518, "grad_norm": 5.029867172241211, "learning_rate": 2.440066443834617e-06, "loss": 1.7526, "step": 1095 }, { "epoch": 0.5234001910219676, "grad_norm": 4.586050033569336, "learning_rate": 2.4362005769631985e-06, "loss": 1.9087, "step": 1096 }, { "epoch": 0.5238777459407832, "grad_norm": 4.030465126037598, "learning_rate": 2.432334862741694e-06, "loss": 0.8227, "step": 1097 }, { "epoch": 0.5243553008595988, "grad_norm": 4.2692484855651855, "learning_rate": 2.4284693104194177e-06, "loss": 1.3446, "step": 1098 }, { "epoch": 0.5248328557784145, "grad_norm": 4.785492897033691, "learning_rate": 2.4246039292452976e-06, "loss": 1.3247, "step": 1099 }, { "epoch": 0.5253104106972302, "grad_norm": 6.068233966827393, "learning_rate": 2.4207387284678513e-06, "loss": 1.928, "step": 1100 }, { "epoch": 0.5257879656160458, "grad_norm": 4.63082218170166, "learning_rate": 2.4168737173351645e-06, "loss": 1.08, "step": 1101 }, { "epoch": 0.5262655205348615, "grad_norm": 4.226769924163818, "learning_rate": 2.4130089050948703e-06, "loss": 1.2445, "step": 1102 }, { "epoch": 0.5267430754536772, "grad_norm": 4.093264102935791, "learning_rate": 2.409144300994125e-06, "loss": 1.8692, "step": 1103 }, { "epoch": 0.5272206303724928, "grad_norm": 4.954926013946533, "learning_rate": 2.4052799142795872e-06, "loss": 1.9627, "step": 1104 }, { "epoch": 0.5276981852913085, "grad_norm": 4.757976531982422, "learning_rate": 2.4014157541973957e-06, "loss": 1.4257, "step": 1105 }, { "epoch": 0.5281757402101241, "grad_norm": 5.250477313995361, "learning_rate": 2.397551829993146e-06, "loss": 1.5215, "step": 1106 }, { "epoch": 0.5286532951289399, "grad_norm": 4.514709949493408, "learning_rate": 2.39368815091187e-06, 
"loss": 1.5097, "step": 1107 }, { "epoch": 0.5291308500477555, "grad_norm": 4.599431037902832, "learning_rate": 2.3898247261980135e-06, "loss": 1.9925, "step": 1108 }, { "epoch": 0.5296084049665711, "grad_norm": 5.778361797332764, "learning_rate": 2.385961565095414e-06, "loss": 1.4159, "step": 1109 }, { "epoch": 0.5300859598853869, "grad_norm": 6.3022141456604, "learning_rate": 2.3820986768472755e-06, "loss": 1.101, "step": 1110 }, { "epoch": 0.5305635148042025, "grad_norm": 4.897792339324951, "learning_rate": 2.3782360706961524e-06, "loss": 1.5906, "step": 1111 }, { "epoch": 0.5310410697230181, "grad_norm": 5.163201808929443, "learning_rate": 2.3743737558839228e-06, "loss": 1.6387, "step": 1112 }, { "epoch": 0.5315186246418339, "grad_norm": 6.004908084869385, "learning_rate": 2.370511741651768e-06, "loss": 1.439, "step": 1113 }, { "epoch": 0.5319961795606495, "grad_norm": 5.200117588043213, "learning_rate": 2.366650037240149e-06, "loss": 1.1796, "step": 1114 }, { "epoch": 0.5324737344794651, "grad_norm": 4.24569034576416, "learning_rate": 2.362788651888787e-06, "loss": 1.393, "step": 1115 }, { "epoch": 0.5329512893982808, "grad_norm": 5.275874614715576, "learning_rate": 2.3589275948366398e-06, "loss": 1.0721, "step": 1116 }, { "epoch": 0.5334288443170965, "grad_norm": 5.757937431335449, "learning_rate": 2.355066875321878e-06, "loss": 0.9639, "step": 1117 }, { "epoch": 0.5339063992359121, "grad_norm": 4.6624579429626465, "learning_rate": 2.3512065025818673e-06, "loss": 2.0701, "step": 1118 }, { "epoch": 0.5343839541547278, "grad_norm": 5.042705535888672, "learning_rate": 2.3473464858531404e-06, "loss": 1.6118, "step": 1119 }, { "epoch": 0.5348615090735435, "grad_norm": 4.858127117156982, "learning_rate": 2.3434868343713803e-06, "loss": 1.6232, "step": 1120 }, { "epoch": 0.5353390639923591, "grad_norm": 4.0244951248168945, "learning_rate": 2.3396275573713963e-06, "loss": 0.9374, "step": 1121 }, { "epoch": 0.5358166189111748, "grad_norm": 4.338305950164795, 
"learning_rate": 2.335768664087101e-06, "loss": 1.2688, "step": 1122 }, { "epoch": 0.5362941738299905, "grad_norm": 4.010972023010254, "learning_rate": 2.331910163751488e-06, "loss": 1.7809, "step": 1123 }, { "epoch": 0.5367717287488061, "grad_norm": 4.21201753616333, "learning_rate": 2.3280520655966126e-06, "loss": 1.4509, "step": 1124 }, { "epoch": 0.5372492836676218, "grad_norm": 4.408762454986572, "learning_rate": 2.3241943788535672e-06, "loss": 1.7222, "step": 1125 }, { "epoch": 0.5377268385864374, "grad_norm": 5.495283603668213, "learning_rate": 2.320337112752459e-06, "loss": 1.9941, "step": 1126 }, { "epoch": 0.5382043935052531, "grad_norm": 4.543834686279297, "learning_rate": 2.316480276522389e-06, "loss": 0.8873, "step": 1127 }, { "epoch": 0.5386819484240688, "grad_norm": 5.45999002456665, "learning_rate": 2.312623879391431e-06, "loss": 1.9292, "step": 1128 }, { "epoch": 0.5391595033428844, "grad_norm": 5.472384452819824, "learning_rate": 2.308767930586606e-06, "loss": 1.0915, "step": 1129 }, { "epoch": 0.5396370582617, "grad_norm": 5.497647285461426, "learning_rate": 2.3049124393338645e-06, "loss": 1.0867, "step": 1130 }, { "epoch": 0.5401146131805158, "grad_norm": 4.853126525878906, "learning_rate": 2.30105741485806e-06, "loss": 1.8714, "step": 1131 }, { "epoch": 0.5405921680993314, "grad_norm": 4.491593360900879, "learning_rate": 2.297202866382931e-06, "loss": 1.356, "step": 1132 }, { "epoch": 0.541069723018147, "grad_norm": 5.419744491577148, "learning_rate": 2.293348803131077e-06, "loss": 1.0532, "step": 1133 }, { "epoch": 0.5415472779369628, "grad_norm": 4.356868267059326, "learning_rate": 2.2894952343239348e-06, "loss": 1.0018, "step": 1134 }, { "epoch": 0.5420248328557784, "grad_norm": 4.170576572418213, "learning_rate": 2.2856421691817607e-06, "loss": 1.3293, "step": 1135 }, { "epoch": 0.542502387774594, "grad_norm": 4.040979862213135, "learning_rate": 2.2817896169236034e-06, "loss": 1.5417, "step": 1136 }, { "epoch": 0.5429799426934098, 
"grad_norm": 3.7478909492492676, "learning_rate": 2.277937586767286e-06, "loss": 1.6721, "step": 1137 }, { "epoch": 0.5434574976122254, "grad_norm": 5.454432964324951, "learning_rate": 2.274086087929382e-06, "loss": 1.3994, "step": 1138 }, { "epoch": 0.543935052531041, "grad_norm": 3.8435275554656982, "learning_rate": 2.270235129625194e-06, "loss": 1.337, "step": 1139 }, { "epoch": 0.5444126074498568, "grad_norm": 4.363958358764648, "learning_rate": 2.2663847210687303e-06, "loss": 0.9663, "step": 1140 }, { "epoch": 0.5448901623686724, "grad_norm": 4.643497943878174, "learning_rate": 2.2625348714726845e-06, "loss": 0.8755, "step": 1141 }, { "epoch": 0.545367717287488, "grad_norm": 4.340799331665039, "learning_rate": 2.2586855900484135e-06, "loss": 1.8005, "step": 1142 }, { "epoch": 0.5458452722063037, "grad_norm": 4.446901321411133, "learning_rate": 2.254836886005913e-06, "loss": 1.6003, "step": 1143 }, { "epoch": 0.5463228271251194, "grad_norm": 4.112783908843994, "learning_rate": 2.2509887685537995e-06, "loss": 0.9066, "step": 1144 }, { "epoch": 0.5468003820439351, "grad_norm": 4.387242794036865, "learning_rate": 2.247141246899284e-06, "loss": 1.5552, "step": 1145 }, { "epoch": 0.5472779369627507, "grad_norm": 4.13114070892334, "learning_rate": 2.243294330248152e-06, "loss": 1.5193, "step": 1146 }, { "epoch": 0.5477554918815664, "grad_norm": 4.4933576583862305, "learning_rate": 2.2394480278047427e-06, "loss": 0.7264, "step": 1147 }, { "epoch": 0.5482330468003821, "grad_norm": 4.49673318862915, "learning_rate": 2.235602348771927e-06, "loss": 1.9931, "step": 1148 }, { "epoch": 0.5487106017191977, "grad_norm": 5.080113887786865, "learning_rate": 2.231757302351079e-06, "loss": 1.6042, "step": 1149 }, { "epoch": 0.5491881566380133, "grad_norm": 5.514265537261963, "learning_rate": 2.2279128977420648e-06, "loss": 1.5831, "step": 1150 }, { "epoch": 0.5496657115568291, "grad_norm": 5.287341117858887, "learning_rate": 2.224069144143212e-06, "loss": 1.5179, "step": 1151 }, { 
"epoch": 0.5501432664756447, "grad_norm": 5.454514503479004, "learning_rate": 2.220226050751291e-06, "loss": 0.9131, "step": 1152 }, { "epoch": 0.5506208213944603, "grad_norm": 4.761523723602295, "learning_rate": 2.2163836267614934e-06, "loss": 0.6939, "step": 1153 }, { "epoch": 0.5510983763132761, "grad_norm": 3.781573534011841, "learning_rate": 2.2125418813674075e-06, "loss": 1.3047, "step": 1154 }, { "epoch": 0.5515759312320917, "grad_norm": 4.1478118896484375, "learning_rate": 2.2087008237609996e-06, "loss": 1.4122, "step": 1155 }, { "epoch": 0.5520534861509073, "grad_norm": 4.899458408355713, "learning_rate": 2.2048604631325896e-06, "loss": 1.8022, "step": 1156 }, { "epoch": 0.5525310410697231, "grad_norm": 4.4767279624938965, "learning_rate": 2.20102080867083e-06, "loss": 1.714, "step": 1157 }, { "epoch": 0.5530085959885387, "grad_norm": 4.596755504608154, "learning_rate": 2.1971818695626824e-06, "loss": 2.0827, "step": 1158 }, { "epoch": 0.5534861509073543, "grad_norm": 4.246908664703369, "learning_rate": 2.1933436549933994e-06, "loss": 1.3543, "step": 1159 }, { "epoch": 0.55396370582617, "grad_norm": 5.160912036895752, "learning_rate": 2.1895061741464974e-06, "loss": 1.0879, "step": 1160 }, { "epoch": 0.5544412607449857, "grad_norm": 4.0639519691467285, "learning_rate": 2.185669436203739e-06, "loss": 1.4978, "step": 1161 }, { "epoch": 0.5549188156638013, "grad_norm": 4.522521495819092, "learning_rate": 2.18183345034511e-06, "loss": 1.3616, "step": 1162 }, { "epoch": 0.555396370582617, "grad_norm": 4.644901275634766, "learning_rate": 2.177998225748793e-06, "loss": 1.3568, "step": 1163 }, { "epoch": 0.5558739255014327, "grad_norm": 5.785778045654297, "learning_rate": 2.174163771591153e-06, "loss": 1.814, "step": 1164 }, { "epoch": 0.5563514804202483, "grad_norm": 4.166548252105713, "learning_rate": 2.17033009704671e-06, "loss": 2.0252, "step": 1165 }, { "epoch": 0.556829035339064, "grad_norm": 5.36815881729126, "learning_rate": 2.1664972112881183e-06, "loss": 
1.1448, "step": 1166 }, { "epoch": 0.5573065902578797, "grad_norm": 4.367137432098389, "learning_rate": 2.1626651234861464e-06, "loss": 1.6008, "step": 1167 }, { "epoch": 0.5577841451766953, "grad_norm": 3.892559051513672, "learning_rate": 2.1588338428096527e-06, "loss": 0.7683, "step": 1168 }, { "epoch": 0.558261700095511, "grad_norm": 4.342893600463867, "learning_rate": 2.155003378425564e-06, "loss": 0.847, "step": 1169 }, { "epoch": 0.5587392550143266, "grad_norm": 4.833085536956787, "learning_rate": 2.1511737394988545e-06, "loss": 2.2542, "step": 1170 }, { "epoch": 0.5592168099331423, "grad_norm": 5.1873040199279785, "learning_rate": 2.1473449351925228e-06, "loss": 1.0362, "step": 1171 }, { "epoch": 0.559694364851958, "grad_norm": 5.147922515869141, "learning_rate": 2.143516974667571e-06, "loss": 0.8237, "step": 1172 }, { "epoch": 0.5601719197707736, "grad_norm": 4.510603427886963, "learning_rate": 2.1396898670829828e-06, "loss": 1.2334, "step": 1173 }, { "epoch": 0.5606494746895893, "grad_norm": 3.98628306388855, "learning_rate": 2.1358636215957007e-06, "loss": 1.3163, "step": 1174 }, { "epoch": 0.561127029608405, "grad_norm": 5.946235179901123, "learning_rate": 2.132038247360604e-06, "loss": 1.7245, "step": 1175 }, { "epoch": 0.5616045845272206, "grad_norm": 4.046076774597168, "learning_rate": 2.1282137535304874e-06, "loss": 1.1566, "step": 1176 }, { "epoch": 0.5620821394460362, "grad_norm": 6.763603687286377, "learning_rate": 2.1243901492560404e-06, "loss": 0.9439, "step": 1177 }, { "epoch": 0.562559694364852, "grad_norm": 4.9007039070129395, "learning_rate": 2.120567443685823e-06, "loss": 1.7652, "step": 1178 }, { "epoch": 0.5630372492836676, "grad_norm": 4.535945415496826, "learning_rate": 2.1167456459662447e-06, "loss": 1.091, "step": 1179 }, { "epoch": 0.5635148042024832, "grad_norm": 4.3074798583984375, "learning_rate": 2.1129247652415434e-06, "loss": 1.9295, "step": 1180 }, { "epoch": 0.563992359121299, "grad_norm": 4.822092056274414, "learning_rate": 
2.1091048106537626e-06, "loss": 2.2009, "step": 1181 }, { "epoch": 0.5644699140401146, "grad_norm": 4.161897659301758, "learning_rate": 2.1052857913427305e-06, "loss": 1.0164, "step": 1182 }, { "epoch": 0.5649474689589303, "grad_norm": 4.319812774658203, "learning_rate": 2.1014677164460377e-06, "loss": 1.7447, "step": 1183 }, { "epoch": 0.565425023877746, "grad_norm": 4.489819526672363, "learning_rate": 2.097650595099013e-06, "loss": 1.1166, "step": 1184 }, { "epoch": 0.5659025787965616, "grad_norm": 4.653066635131836, "learning_rate": 2.093834436434706e-06, "loss": 1.305, "step": 1185 }, { "epoch": 0.5663801337153773, "grad_norm": 4.72034215927124, "learning_rate": 2.0900192495838617e-06, "loss": 1.285, "step": 1186 }, { "epoch": 0.566857688634193, "grad_norm": 3.8235199451446533, "learning_rate": 2.086205043674901e-06, "loss": 1.8785, "step": 1187 }, { "epoch": 0.5673352435530086, "grad_norm": 4.188753604888916, "learning_rate": 2.0823918278338974e-06, "loss": 0.8232, "step": 1188 }, { "epoch": 0.5678127984718243, "grad_norm": 4.2751946449279785, "learning_rate": 2.078579611184554e-06, "loss": 1.0179, "step": 1189 }, { "epoch": 0.5682903533906399, "grad_norm": 4.727127552032471, "learning_rate": 2.0747684028481845e-06, "loss": 0.9191, "step": 1190 }, { "epoch": 0.5687679083094556, "grad_norm": 4.512445449829102, "learning_rate": 2.0709582119436907e-06, "loss": 1.453, "step": 1191 }, { "epoch": 0.5692454632282713, "grad_norm": 4.297692775726318, "learning_rate": 2.0671490475875395e-06, "loss": 1.5874, "step": 1192 }, { "epoch": 0.5697230181470869, "grad_norm": 4.383309841156006, "learning_rate": 2.0633409188937402e-06, "loss": 0.9115, "step": 1193 }, { "epoch": 0.5702005730659025, "grad_norm": 4.620205402374268, "learning_rate": 2.059533834973826e-06, "loss": 1.536, "step": 1194 }, { "epoch": 0.5706781279847183, "grad_norm": 4.698455333709717, "learning_rate": 2.055727804936829e-06, "loss": 1.404, "step": 1195 }, { "epoch": 0.5711556829035339, "grad_norm": 
4.528055667877197, "learning_rate": 2.0519228378892613e-06, "loss": 1.2067, "step": 1196 }, { "epoch": 0.5716332378223495, "grad_norm": 4.664231300354004, "learning_rate": 2.0481189429350903e-06, "loss": 1.355, "step": 1197 }, { "epoch": 0.5721107927411653, "grad_norm": 5.249608516693115, "learning_rate": 2.0443161291757187e-06, "loss": 1.0084, "step": 1198 }, { "epoch": 0.5725883476599809, "grad_norm": 4.057953834533691, "learning_rate": 2.0405144057099612e-06, "loss": 0.8064, "step": 1199 }, { "epoch": 0.5730659025787965, "grad_norm": 4.197903156280518, "learning_rate": 2.0367137816340264e-06, "loss": 0.8295, "step": 1200 }, { "epoch": 0.5735434574976123, "grad_norm": 4.119871616363525, "learning_rate": 2.0329142660414906e-06, "loss": 0.8628, "step": 1201 }, { "epoch": 0.5740210124164279, "grad_norm": 4.613128185272217, "learning_rate": 2.0291158680232766e-06, "loss": 1.174, "step": 1202 }, { "epoch": 0.5744985673352435, "grad_norm": 3.304452896118164, "learning_rate": 2.0253185966676365e-06, "loss": 1.2426, "step": 1203 }, { "epoch": 0.5749761222540593, "grad_norm": 5.2847161293029785, "learning_rate": 2.0215224610601238e-06, "loss": 1.6236, "step": 1204 }, { "epoch": 0.5754536771728749, "grad_norm": 5.700151443481445, "learning_rate": 2.0177274702835766e-06, "loss": 1.4959, "step": 1205 }, { "epoch": 0.5759312320916905, "grad_norm": 4.319638252258301, "learning_rate": 2.0139336334180927e-06, "loss": 1.246, "step": 1206 }, { "epoch": 0.5764087870105062, "grad_norm": 4.555635929107666, "learning_rate": 2.0101409595410085e-06, "loss": 1.722, "step": 1207 }, { "epoch": 0.5768863419293219, "grad_norm": 3.8217411041259766, "learning_rate": 2.0063494577268787e-06, "loss": 1.5941, "step": 1208 }, { "epoch": 0.5773638968481375, "grad_norm": 4.560518264770508, "learning_rate": 2.002559137047454e-06, "loss": 1.3914, "step": 1209 }, { "epoch": 0.5778414517669532, "grad_norm": 5.778707504272461, "learning_rate": 1.998770006571658e-06, "loss": 1.2955, "step": 1210 }, { 
"epoch": 0.5783190066857689, "grad_norm": 4.986813545227051, "learning_rate": 1.994982075365568e-06, "loss": 1.8521, "step": 1211 }, { "epoch": 0.5787965616045845, "grad_norm": 5.502147197723389, "learning_rate": 1.9911953524923893e-06, "loss": 0.8243, "step": 1212 }, { "epoch": 0.5792741165234002, "grad_norm": 6.174061298370361, "learning_rate": 1.9874098470124396e-06, "loss": 1.9445, "step": 1213 }, { "epoch": 0.5797516714422158, "grad_norm": 5.044993877410889, "learning_rate": 1.9836255679831203e-06, "loss": 2.2914, "step": 1214 }, { "epoch": 0.5802292263610315, "grad_norm": 4.498920917510986, "learning_rate": 1.9798425244589006e-06, "loss": 1.5142, "step": 1215 }, { "epoch": 0.5807067812798472, "grad_norm": 6.010688304901123, "learning_rate": 1.976060725491293e-06, "loss": 1.0001, "step": 1216 }, { "epoch": 0.5811843361986628, "grad_norm": 4.815806865692139, "learning_rate": 1.972280180128831e-06, "loss": 1.1118, "step": 1217 }, { "epoch": 0.5816618911174785, "grad_norm": 5.053376197814941, "learning_rate": 1.968500897417052e-06, "loss": 1.7539, "step": 1218 }, { "epoch": 0.5821394460362942, "grad_norm": 4.125682353973389, "learning_rate": 1.964722886398468e-06, "loss": 1.9532, "step": 1219 }, { "epoch": 0.5826170009551098, "grad_norm": 4.7024245262146, "learning_rate": 1.9609461561125502e-06, "loss": 1.7238, "step": 1220 }, { "epoch": 0.5830945558739254, "grad_norm": 3.794792413711548, "learning_rate": 1.9571707155957075e-06, "loss": 0.8126, "step": 1221 }, { "epoch": 0.5835721107927412, "grad_norm": 4.8270721435546875, "learning_rate": 1.95339657388126e-06, "loss": 1.3467, "step": 1222 }, { "epoch": 0.5840496657115568, "grad_norm": 4.798287391662598, "learning_rate": 1.94962373999942e-06, "loss": 0.92, "step": 1223 }, { "epoch": 0.5845272206303725, "grad_norm": 3.972766876220703, "learning_rate": 1.945852222977273e-06, "loss": 0.9484, "step": 1224 }, { "epoch": 0.5850047755491882, "grad_norm": 4.627545356750488, "learning_rate": 1.942082031838752e-06, "loss": 
1.2671, "step": 1225 }, { "epoch": 0.5854823304680038, "grad_norm": 4.783227443695068, "learning_rate": 1.9383131756046182e-06, "loss": 2.1932, "step": 1226 }, { "epoch": 0.5859598853868195, "grad_norm": 5.343029499053955, "learning_rate": 1.93454566329244e-06, "loss": 1.0579, "step": 1227 }, { "epoch": 0.5864374403056352, "grad_norm": 7.094262599945068, "learning_rate": 1.9307795039165664e-06, "loss": 0.96, "step": 1228 }, { "epoch": 0.5869149952244508, "grad_norm": 4.6872124671936035, "learning_rate": 1.9270147064881134e-06, "loss": 1.8473, "step": 1229 }, { "epoch": 0.5873925501432665, "grad_norm": 4.566101551055908, "learning_rate": 1.9232512800149367e-06, "loss": 1.5348, "step": 1230 }, { "epoch": 0.5878701050620821, "grad_norm": 4.299230575561523, "learning_rate": 1.919489233501612e-06, "loss": 1.3402, "step": 1231 }, { "epoch": 0.5883476599808978, "grad_norm": 4.778106212615967, "learning_rate": 1.915728575949413e-06, "loss": 0.9801, "step": 1232 }, { "epoch": 0.5888252148997135, "grad_norm": 4.868286609649658, "learning_rate": 1.9119693163562897e-06, "loss": 2.1408, "step": 1233 }, { "epoch": 0.5893027698185291, "grad_norm": 4.4946370124816895, "learning_rate": 1.908211463716848e-06, "loss": 1.7155, "step": 1234 }, { "epoch": 0.5897803247373448, "grad_norm": 4.677528381347656, "learning_rate": 1.9044550270223277e-06, "loss": 1.0233, "step": 1235 }, { "epoch": 0.5902578796561605, "grad_norm": 5.962925434112549, "learning_rate": 1.9007000152605804e-06, "loss": 1.2607, "step": 1236 }, { "epoch": 0.5907354345749761, "grad_norm": 5.774280548095703, "learning_rate": 1.8969464374160471e-06, "loss": 1.3937, "step": 1237 }, { "epoch": 0.5912129894937918, "grad_norm": 5.0029120445251465, "learning_rate": 1.8931943024697397e-06, "loss": 1.7955, "step": 1238 }, { "epoch": 0.5916905444126075, "grad_norm": 4.833498477935791, "learning_rate": 1.8894436193992172e-06, "loss": 2.0079, "step": 1239 }, { "epoch": 0.5921680993314231, "grad_norm": 5.133493900299072, 
"learning_rate": 1.885694397178564e-06, "loss": 0.988, "step": 1240 }, { "epoch": 0.5926456542502387, "grad_norm": 5.489390850067139, "learning_rate": 1.8819466447783713e-06, "loss": 1.0373, "step": 1241 }, { "epoch": 0.5931232091690545, "grad_norm": 3.5521252155303955, "learning_rate": 1.8782003711657104e-06, "loss": 1.4252, "step": 1242 }, { "epoch": 0.5936007640878701, "grad_norm": 4.726348400115967, "learning_rate": 1.8744555853041163e-06, "loss": 1.3937, "step": 1243 }, { "epoch": 0.5940783190066857, "grad_norm": 5.209590435028076, "learning_rate": 1.8707122961535648e-06, "loss": 1.3754, "step": 1244 }, { "epoch": 0.5945558739255015, "grad_norm": 3.035463809967041, "learning_rate": 1.8669705126704495e-06, "loss": 0.6076, "step": 1245 }, { "epoch": 0.5950334288443171, "grad_norm": 4.979222297668457, "learning_rate": 1.8632302438075618e-06, "loss": 0.9762, "step": 1246 }, { "epoch": 0.5955109837631327, "grad_norm": 4.130837440490723, "learning_rate": 1.8594914985140688e-06, "loss": 1.6213, "step": 1247 }, { "epoch": 0.5959885386819485, "grad_norm": 5.357909679412842, "learning_rate": 1.8557542857354929e-06, "loss": 1.5388, "step": 1248 }, { "epoch": 0.5964660936007641, "grad_norm": 6.219747543334961, "learning_rate": 1.8520186144136892e-06, "loss": 1.7781, "step": 1249 }, { "epoch": 0.5969436485195797, "grad_norm": 4.795467376708984, "learning_rate": 1.8482844934868258e-06, "loss": 2.211, "step": 1250 }, { "epoch": 0.5974212034383954, "grad_norm": 5.082950115203857, "learning_rate": 1.8445519318893587e-06, "loss": 1.9224, "step": 1251 }, { "epoch": 0.5978987583572111, "grad_norm": 4.799416542053223, "learning_rate": 1.8408209385520153e-06, "loss": 0.9217, "step": 1252 }, { "epoch": 0.5983763132760267, "grad_norm": 4.11672306060791, "learning_rate": 1.8370915224017694e-06, "loss": 0.8793, "step": 1253 }, { "epoch": 0.5988538681948424, "grad_norm": 4.270621299743652, "learning_rate": 1.8333636923618223e-06, "loss": 0.8222, "step": 1254 }, { "epoch": 
0.5993314231136581, "grad_norm": 5.756953716278076, "learning_rate": 1.8296374573515795e-06, "loss": 1.7276, "step": 1255 }, { "epoch": 0.5998089780324737, "grad_norm": 4.5987396240234375, "learning_rate": 1.8259128262866306e-06, "loss": 1.4642, "step": 1256 }, { "epoch": 0.6002865329512894, "grad_norm": 4.254241943359375, "learning_rate": 1.822189808078727e-06, "loss": 0.7192, "step": 1257 }, { "epoch": 0.600764087870105, "grad_norm": 4.904393672943115, "learning_rate": 1.818468411635761e-06, "loss": 1.2869, "step": 1258 }, { "epoch": 0.6012416427889207, "grad_norm": 4.1930646896362305, "learning_rate": 1.814748645861745e-06, "loss": 1.3765, "step": 1259 }, { "epoch": 0.6017191977077364, "grad_norm": 5.3744282722473145, "learning_rate": 1.8110305196567895e-06, "loss": 1.6429, "step": 1260 }, { "epoch": 0.602196752626552, "grad_norm": 5.253814697265625, "learning_rate": 1.8073140419170826e-06, "loss": 1.6336, "step": 1261 }, { "epoch": 0.6026743075453678, "grad_norm": 5.350124359130859, "learning_rate": 1.803599221534868e-06, "loss": 1.707, "step": 1262 }, { "epoch": 0.6031518624641834, "grad_norm": 4.950552463531494, "learning_rate": 1.799886067398423e-06, "loss": 1.4077, "step": 1263 }, { "epoch": 0.603629417382999, "grad_norm": 4.966360569000244, "learning_rate": 1.7961745883920392e-06, "loss": 0.8991, "step": 1264 }, { "epoch": 0.6041069723018148, "grad_norm": 4.509434223175049, "learning_rate": 1.7924647933960003e-06, "loss": 0.8313, "step": 1265 }, { "epoch": 0.6045845272206304, "grad_norm": 4.5122389793396, "learning_rate": 1.78875669128656e-06, "loss": 1.7787, "step": 1266 }, { "epoch": 0.605062082139446, "grad_norm": 4.619853496551514, "learning_rate": 1.7850502909359216e-06, "loss": 1.359, "step": 1267 }, { "epoch": 0.6055396370582617, "grad_norm": 4.557889938354492, "learning_rate": 1.781345601212217e-06, "loss": 1.0351, "step": 1268 }, { "epoch": 0.6060171919770774, "grad_norm": 4.944577217102051, "learning_rate": 1.7776426309794856e-06, "loss": 1.4135, 
"step": 1269 }, { "epoch": 0.606494746895893, "grad_norm": 4.0351033210754395, "learning_rate": 1.7739413890976514e-06, "loss": 1.1683, "step": 1270 }, { "epoch": 0.6069723018147087, "grad_norm": 4.017764091491699, "learning_rate": 1.770241884422505e-06, "loss": 1.4904, "step": 1271 }, { "epoch": 0.6074498567335244, "grad_norm": 4.159587383270264, "learning_rate": 1.766544125805678e-06, "loss": 1.9261, "step": 1272 }, { "epoch": 0.60792741165234, "grad_norm": 4.243263244628906, "learning_rate": 1.7628481220946258e-06, "loss": 1.5359, "step": 1273 }, { "epoch": 0.6084049665711557, "grad_norm": 4.66589879989624, "learning_rate": 1.7591538821326052e-06, "loss": 1.907, "step": 1274 }, { "epoch": 0.6088825214899714, "grad_norm": 5.210451602935791, "learning_rate": 1.7554614147586518e-06, "loss": 2.0177, "step": 1275 }, { "epoch": 0.609360076408787, "grad_norm": 5.1403656005859375, "learning_rate": 1.7517707288075617e-06, "loss": 1.3189, "step": 1276 }, { "epoch": 0.6098376313276027, "grad_norm": 3.8888590335845947, "learning_rate": 1.748081833109866e-06, "loss": 1.4506, "step": 1277 }, { "epoch": 0.6103151862464183, "grad_norm": 4.7832350730896, "learning_rate": 1.7443947364918152e-06, "loss": 1.3169, "step": 1278 }, { "epoch": 0.610792741165234, "grad_norm": 3.6932289600372314, "learning_rate": 1.7407094477753534e-06, "loss": 0.9538, "step": 1279 }, { "epoch": 0.6112702960840497, "grad_norm": 5.245924472808838, "learning_rate": 1.7370259757781003e-06, "loss": 1.2964, "step": 1280 }, { "epoch": 0.6117478510028653, "grad_norm": 4.341866970062256, "learning_rate": 1.733344329313327e-06, "loss": 1.2135, "step": 1281 }, { "epoch": 0.612225405921681, "grad_norm": 4.3943986892700195, "learning_rate": 1.7296645171899385e-06, "loss": 1.0208, "step": 1282 }, { "epoch": 0.6127029608404967, "grad_norm": 3.4464380741119385, "learning_rate": 1.7259865482124505e-06, "loss": 0.9605, "step": 1283 }, { "epoch": 0.6131805157593123, "grad_norm": 5.047428607940674, "learning_rate": 
1.7223104311809678e-06, "loss": 1.1448, "step": 1284 }, { "epoch": 0.6136580706781279, "grad_norm": 3.4936885833740234, "learning_rate": 1.7186361748911656e-06, "loss": 0.7742, "step": 1285 }, { "epoch": 0.6141356255969437, "grad_norm": 4.4178147315979, "learning_rate": 1.714963788134265e-06, "loss": 1.4708, "step": 1286 }, { "epoch": 0.6146131805157593, "grad_norm": 4.184749603271484, "learning_rate": 1.7112932796970155e-06, "loss": 1.6668, "step": 1287 }, { "epoch": 0.6150907354345749, "grad_norm": 5.140230178833008, "learning_rate": 1.7076246583616721e-06, "loss": 0.7272, "step": 1288 }, { "epoch": 0.6155682903533907, "grad_norm": 5.823448657989502, "learning_rate": 1.7039579329059752e-06, "loss": 1.7015, "step": 1289 }, { "epoch": 0.6160458452722063, "grad_norm": 4.750666618347168, "learning_rate": 1.7002931121031272e-06, "loss": 0.8906, "step": 1290 }, { "epoch": 0.6165234001910219, "grad_norm": 4.336917877197266, "learning_rate": 1.6966302047217748e-06, "loss": 1.5243, "step": 1291 }, { "epoch": 0.6170009551098377, "grad_norm": 5.00648832321167, "learning_rate": 1.6929692195259867e-06, "loss": 2.0127, "step": 1292 }, { "epoch": 0.6174785100286533, "grad_norm": 4.4711408615112305, "learning_rate": 1.6893101652752315e-06, "loss": 1.1063, "step": 1293 }, { "epoch": 0.6179560649474689, "grad_norm": 4.771127700805664, "learning_rate": 1.685653050724359e-06, "loss": 1.0958, "step": 1294 }, { "epoch": 0.6184336198662846, "grad_norm": 4.893413066864014, "learning_rate": 1.6819978846235758e-06, "loss": 1.6755, "step": 1295 }, { "epoch": 0.6189111747851003, "grad_norm": 4.314020156860352, "learning_rate": 1.678344675718428e-06, "loss": 1.6112, "step": 1296 }, { "epoch": 0.6193887297039159, "grad_norm": 4.617092609405518, "learning_rate": 1.674693432749781e-06, "loss": 1.6527, "step": 1297 }, { "epoch": 0.6198662846227316, "grad_norm": 3.760847806930542, "learning_rate": 1.6710441644537916e-06, "loss": 1.4087, "step": 1298 }, { "epoch": 0.6203438395415473, "grad_norm": 
4.860892295837402, "learning_rate": 1.6673968795618957e-06, "loss": 1.0661, "step": 1299 }, { "epoch": 0.620821394460363, "grad_norm": 4.699859142303467, "learning_rate": 1.663751586800782e-06, "loss": 1.7034, "step": 1300 }, { "epoch": 0.6212989493791786, "grad_norm": 4.952959060668945, "learning_rate": 1.6601082948923736e-06, "loss": 2.0043, "step": 1301 }, { "epoch": 0.6217765042979942, "grad_norm": 5.00866174697876, "learning_rate": 1.6564670125538042e-06, "loss": 1.7785, "step": 1302 }, { "epoch": 0.62225405921681, "grad_norm": 3.7625598907470703, "learning_rate": 1.652827748497402e-06, "loss": 1.2957, "step": 1303 }, { "epoch": 0.6227316141356256, "grad_norm": 5.2958221435546875, "learning_rate": 1.6491905114306644e-06, "loss": 2.0432, "step": 1304 }, { "epoch": 0.6232091690544412, "grad_norm": 4.675695419311523, "learning_rate": 1.6455553100562399e-06, "loss": 1.9158, "step": 1305 }, { "epoch": 0.623686723973257, "grad_norm": 5.072800159454346, "learning_rate": 1.6419221530719062e-06, "loss": 1.3123, "step": 1306 }, { "epoch": 0.6241642788920726, "grad_norm": 4.605288982391357, "learning_rate": 1.6382910491705478e-06, "loss": 1.0114, "step": 1307 }, { "epoch": 0.6246418338108882, "grad_norm": 5.483273029327393, "learning_rate": 1.6346620070401391e-06, "loss": 1.9219, "step": 1308 }, { "epoch": 0.625119388729704, "grad_norm": 5.493833065032959, "learning_rate": 1.6310350353637203e-06, "loss": 0.924, "step": 1309 }, { "epoch": 0.6255969436485196, "grad_norm": 4.843364715576172, "learning_rate": 1.6274101428193784e-06, "loss": 1.3587, "step": 1310 }, { "epoch": 0.6260744985673352, "grad_norm": 4.588925361633301, "learning_rate": 1.6237873380802244e-06, "loss": 1.6236, "step": 1311 }, { "epoch": 0.626552053486151, "grad_norm": 5.921639919281006, "learning_rate": 1.6201666298143753e-06, "loss": 1.4792, "step": 1312 }, { "epoch": 0.6270296084049666, "grad_norm": 3.8781142234802246, "learning_rate": 1.6165480266849314e-06, "loss": 1.2098, "step": 1313 }, { "epoch": 
0.6275071633237822, "grad_norm": 4.941787242889404, "learning_rate": 1.6129315373499563e-06, "loss": 1.4419, "step": 1314 }, { "epoch": 0.6279847182425979, "grad_norm": 4.270072937011719, "learning_rate": 1.609317170462456e-06, "loss": 1.8004, "step": 1315 }, { "epoch": 0.6284622731614136, "grad_norm": 4.997530460357666, "learning_rate": 1.605704934670358e-06, "loss": 1.1224, "step": 1316 }, { "epoch": 0.6289398280802292, "grad_norm": 5.185428619384766, "learning_rate": 1.6020948386164905e-06, "loss": 1.6443, "step": 1317 }, { "epoch": 0.6294173829990449, "grad_norm": 4.136307239532471, "learning_rate": 1.5984868909385632e-06, "loss": 2.284, "step": 1318 }, { "epoch": 0.6298949379178606, "grad_norm": 4.9033589363098145, "learning_rate": 1.5948811002691445e-06, "loss": 1.7021, "step": 1319 }, { "epoch": 0.6303724928366762, "grad_norm": 5.019993305206299, "learning_rate": 1.5912774752356425e-06, "loss": 1.5704, "step": 1320 }, { "epoch": 0.6308500477554919, "grad_norm": 5.321417808532715, "learning_rate": 1.5876760244602823e-06, "loss": 2.0003, "step": 1321 }, { "epoch": 0.6313276026743075, "grad_norm": 3.893937587738037, "learning_rate": 1.5840767565600886e-06, "loss": 1.4685, "step": 1322 }, { "epoch": 0.6318051575931232, "grad_norm": 4.4527482986450195, "learning_rate": 1.5804796801468625e-06, "loss": 1.5907, "step": 1323 }, { "epoch": 0.6322827125119389, "grad_norm": 4.550018787384033, "learning_rate": 1.576884803827161e-06, "loss": 1.0624, "step": 1324 }, { "epoch": 0.6327602674307545, "grad_norm": 5.0203423500061035, "learning_rate": 1.573292136202278e-06, "loss": 0.9323, "step": 1325 }, { "epoch": 0.6332378223495702, "grad_norm": 4.200802326202393, "learning_rate": 1.5697016858682217e-06, "loss": 1.3467, "step": 1326 }, { "epoch": 0.6337153772683859, "grad_norm": 4.711724281311035, "learning_rate": 1.5661134614156962e-06, "loss": 1.4547, "step": 1327 }, { "epoch": 0.6341929321872015, "grad_norm": 4.66978645324707, "learning_rate": 1.5625274714300792e-06, 
"loss": 0.8861, "step": 1328 }, { "epoch": 0.6346704871060171, "grad_norm": 4.666149616241455, "learning_rate": 1.558943724491403e-06, "loss": 1.0294, "step": 1329 }, { "epoch": 0.6351480420248329, "grad_norm": 5.055423736572266, "learning_rate": 1.5553622291743309e-06, "loss": 1.5887, "step": 1330 }, { "epoch": 0.6356255969436485, "grad_norm": 4.126704692840576, "learning_rate": 1.5517829940481407e-06, "loss": 0.7301, "step": 1331 }, { "epoch": 0.6361031518624641, "grad_norm": 4.873147487640381, "learning_rate": 1.5482060276767025e-06, "loss": 1.6116, "step": 1332 }, { "epoch": 0.6365807067812799, "grad_norm": 5.697768211364746, "learning_rate": 1.5446313386184575e-06, "loss": 1.645, "step": 1333 }, { "epoch": 0.6370582617000955, "grad_norm": 5.1532769203186035, "learning_rate": 1.5410589354263972e-06, "loss": 1.2274, "step": 1334 }, { "epoch": 0.6375358166189111, "grad_norm": 4.345121383666992, "learning_rate": 1.5374888266480452e-06, "loss": 1.144, "step": 1335 }, { "epoch": 0.6380133715377269, "grad_norm": 5.162374019622803, "learning_rate": 1.5339210208254345e-06, "loss": 0.9567, "step": 1336 }, { "epoch": 0.6384909264565425, "grad_norm": 4.381374359130859, "learning_rate": 1.5303555264950887e-06, "loss": 0.997, "step": 1337 }, { "epoch": 0.6389684813753582, "grad_norm": 5.745780944824219, "learning_rate": 1.5267923521880008e-06, "loss": 1.5618, "step": 1338 }, { "epoch": 0.6394460362941738, "grad_norm": 4.2585954666137695, "learning_rate": 1.5232315064296094e-06, "loss": 1.3984, "step": 1339 }, { "epoch": 0.6399235912129895, "grad_norm": 4.717815399169922, "learning_rate": 1.5196729977397884e-06, "loss": 1.5151, "step": 1340 }, { "epoch": 0.6404011461318052, "grad_norm": 4.661210060119629, "learning_rate": 1.5161168346328148e-06, "loss": 1.6341, "step": 1341 }, { "epoch": 0.6408787010506208, "grad_norm": 4.756438255310059, "learning_rate": 1.5125630256173542e-06, "loss": 1.6522, "step": 1342 }, { "epoch": 0.6413562559694365, "grad_norm": 4.602748394012451, 
"learning_rate": 1.5090115791964405e-06, "loss": 1.1289, "step": 1343 }, { "epoch": 0.6418338108882522, "grad_norm": 4.666916847229004, "learning_rate": 1.5054625038674548e-06, "loss": 1.3419, "step": 1344 }, { "epoch": 0.6423113658070678, "grad_norm": 4.687642574310303, "learning_rate": 1.5019158081221046e-06, "loss": 1.0137, "step": 1345 }, { "epoch": 0.6427889207258835, "grad_norm": 5.313892364501953, "learning_rate": 1.4983715004464034e-06, "loss": 1.1861, "step": 1346 }, { "epoch": 0.6432664756446992, "grad_norm": 5.469970226287842, "learning_rate": 1.4948295893206512e-06, "loss": 1.5522, "step": 1347 }, { "epoch": 0.6437440305635148, "grad_norm": 4.091184139251709, "learning_rate": 1.4912900832194151e-06, "loss": 1.8134, "step": 1348 }, { "epoch": 0.6442215854823304, "grad_norm": 4.073487281799316, "learning_rate": 1.4877529906115062e-06, "loss": 1.7151, "step": 1349 }, { "epoch": 0.6446991404011462, "grad_norm": 4.337020397186279, "learning_rate": 1.4842183199599625e-06, "loss": 1.8044, "step": 1350 }, { "epoch": 0.6451766953199618, "grad_norm": 5.233493804931641, "learning_rate": 1.4806860797220245e-06, "loss": 2.1434, "step": 1351 }, { "epoch": 0.6456542502387774, "grad_norm": 4.981617450714111, "learning_rate": 1.4771562783491201e-06, "loss": 0.8677, "step": 1352 }, { "epoch": 0.6461318051575932, "grad_norm": 4.93593692779541, "learning_rate": 1.4736289242868412e-06, "loss": 1.7329, "step": 1353 }, { "epoch": 0.6466093600764088, "grad_norm": 4.01775598526001, "learning_rate": 1.470104025974924e-06, "loss": 0.7851, "step": 1354 }, { "epoch": 0.6470869149952244, "grad_norm": 4.788083076477051, "learning_rate": 1.4665815918472277e-06, "loss": 1.5721, "step": 1355 }, { "epoch": 0.6475644699140402, "grad_norm": 3.886212110519409, "learning_rate": 1.463061630331718e-06, "loss": 1.2985, "step": 1356 }, { "epoch": 0.6480420248328558, "grad_norm": 5.082647800445557, "learning_rate": 1.459544149850442e-06, "loss": 1.759, "step": 1357 }, { "epoch": 
0.6485195797516714, "grad_norm": 4.685556411743164, "learning_rate": 1.456029158819513e-06, "loss": 0.9629, "step": 1358 }, { "epoch": 0.6489971346704871, "grad_norm": 5.46451997756958, "learning_rate": 1.4525166656490859e-06, "loss": 1.4606, "step": 1359 }, { "epoch": 0.6494746895893028, "grad_norm": 5.869678974151611, "learning_rate": 1.4490066787433397e-06, "loss": 1.2451, "step": 1360 }, { "epoch": 0.6499522445081184, "grad_norm": 5.2629923820495605, "learning_rate": 1.445499206500458e-06, "loss": 1.4511, "step": 1361 }, { "epoch": 0.6504297994269341, "grad_norm": 4.309606075286865, "learning_rate": 1.441994257312605e-06, "loss": 1.2329, "step": 1362 }, { "epoch": 0.6509073543457498, "grad_norm": 4.907747745513916, "learning_rate": 1.4384918395659108e-06, "loss": 1.3956, "step": 1363 }, { "epoch": 0.6513849092645654, "grad_norm": 5.122812271118164, "learning_rate": 1.4349919616404467e-06, "loss": 0.8988, "step": 1364 }, { "epoch": 0.6518624641833811, "grad_norm": 5.255880832672119, "learning_rate": 1.4314946319102086e-06, "loss": 0.9818, "step": 1365 }, { "epoch": 0.6523400191021967, "grad_norm": 3.683654546737671, "learning_rate": 1.4279998587430944e-06, "loss": 1.1334, "step": 1366 }, { "epoch": 0.6528175740210124, "grad_norm": 4.149895191192627, "learning_rate": 1.4245076505008858e-06, "loss": 1.1418, "step": 1367 }, { "epoch": 0.6532951289398281, "grad_norm": 5.53774881362915, "learning_rate": 1.4210180155392267e-06, "loss": 1.8129, "step": 1368 }, { "epoch": 0.6537726838586437, "grad_norm": 4.944370269775391, "learning_rate": 1.4175309622076047e-06, "loss": 1.5372, "step": 1369 }, { "epoch": 0.6542502387774594, "grad_norm": 5.499810695648193, "learning_rate": 1.4140464988493301e-06, "loss": 1.9413, "step": 1370 }, { "epoch": 0.6547277936962751, "grad_norm": 4.564394950866699, "learning_rate": 1.4105646338015177e-06, "loss": 1.1931, "step": 1371 }, { "epoch": 0.6552053486150907, "grad_norm": 4.022624492645264, "learning_rate": 1.4070853753950615e-06, 
"loss": 1.2446, "step": 1372 }, { "epoch": 0.6556829035339063, "grad_norm": 4.255367279052734, "learning_rate": 1.4036087319546233e-06, "loss": 1.377, "step": 1373 }, { "epoch": 0.6561604584527221, "grad_norm": 4.46012020111084, "learning_rate": 1.4001347117986053e-06, "loss": 0.8899, "step": 1374 }, { "epoch": 0.6566380133715377, "grad_norm": 4.6805572509765625, "learning_rate": 1.396663323239134e-06, "loss": 1.7551, "step": 1375 }, { "epoch": 0.6571155682903533, "grad_norm": 5.0894975662231445, "learning_rate": 1.3931945745820407e-06, "loss": 1.7448, "step": 1376 }, { "epoch": 0.6575931232091691, "grad_norm": 4.1829833984375, "learning_rate": 1.3897284741268376e-06, "loss": 2.0927, "step": 1377 }, { "epoch": 0.6580706781279847, "grad_norm": 5.269598484039307, "learning_rate": 1.386265030166703e-06, "loss": 1.5281, "step": 1378 }, { "epoch": 0.6585482330468004, "grad_norm": 4.006648063659668, "learning_rate": 1.382804250988458e-06, "loss": 1.5431, "step": 1379 }, { "epoch": 0.6590257879656161, "grad_norm": 5.079704761505127, "learning_rate": 1.3793461448725494e-06, "loss": 1.6014, "step": 1380 }, { "epoch": 0.6595033428844317, "grad_norm": 5.154187202453613, "learning_rate": 1.3758907200930244e-06, "loss": 0.9359, "step": 1381 }, { "epoch": 0.6599808978032474, "grad_norm": 5.838554382324219, "learning_rate": 1.3724379849175174e-06, "loss": 1.1044, "step": 1382 }, { "epoch": 0.660458452722063, "grad_norm": 5.147481441497803, "learning_rate": 1.3689879476072296e-06, "loss": 0.8365, "step": 1383 }, { "epoch": 0.6609360076408787, "grad_norm": 4.821963310241699, "learning_rate": 1.3655406164169035e-06, "loss": 0.7149, "step": 1384 }, { "epoch": 0.6614135625596944, "grad_norm": 3.6714906692504883, "learning_rate": 1.3620959995948086e-06, "loss": 1.4536, "step": 1385 }, { "epoch": 0.66189111747851, "grad_norm": 6.214732646942139, "learning_rate": 1.358654105382719e-06, "loss": 0.9585, "step": 1386 }, { "epoch": 0.6623686723973257, "grad_norm": 5.2055487632751465, 
"learning_rate": 1.3552149420158966e-06, "loss": 1.5222, "step": 1387 }, { "epoch": 0.6628462273161414, "grad_norm": 4.872779369354248, "learning_rate": 1.3517785177230652e-06, "loss": 1.1413, "step": 1388 }, { "epoch": 0.663323782234957, "grad_norm": 5.17338752746582, "learning_rate": 1.3483448407263988e-06, "loss": 1.5445, "step": 1389 }, { "epoch": 0.6638013371537727, "grad_norm": 4.616220951080322, "learning_rate": 1.344913919241496e-06, "loss": 1.3202, "step": 1390 }, { "epoch": 0.6642788920725884, "grad_norm": 3.996260643005371, "learning_rate": 1.3414857614773646e-06, "loss": 1.3582, "step": 1391 }, { "epoch": 0.664756446991404, "grad_norm": 5.382381439208984, "learning_rate": 1.338060375636397e-06, "loss": 1.6282, "step": 1392 }, { "epoch": 0.6652340019102196, "grad_norm": 4.346777439117432, "learning_rate": 1.3346377699143547e-06, "loss": 1.8367, "step": 1393 }, { "epoch": 0.6657115568290354, "grad_norm": 7.835202693939209, "learning_rate": 1.3312179525003483e-06, "loss": 0.8887, "step": 1394 }, { "epoch": 0.666189111747851, "grad_norm": 3.970013380050659, "learning_rate": 1.3278009315768147e-06, "loss": 0.833, "step": 1395 }, { "epoch": 0.6666666666666666, "grad_norm": 4.512973785400391, "learning_rate": 1.3243867153195033e-06, "loss": 1.0996, "step": 1396 }, { "epoch": 0.6671442215854824, "grad_norm": 4.302812576293945, "learning_rate": 1.3209753118974478e-06, "loss": 1.0952, "step": 1397 }, { "epoch": 0.667621776504298, "grad_norm": 4.397500514984131, "learning_rate": 1.3175667294729557e-06, "loss": 0.8585, "step": 1398 }, { "epoch": 0.6680993314231136, "grad_norm": 4.902798175811768, "learning_rate": 1.3141609762015838e-06, "loss": 1.1069, "step": 1399 }, { "epoch": 0.6685768863419294, "grad_norm": 4.781280517578125, "learning_rate": 1.31075806023212e-06, "loss": 0.8822, "step": 1400 }, { "epoch": 0.669054441260745, "grad_norm": 4.717187881469727, "learning_rate": 1.3073579897065624e-06, "loss": 1.001, "step": 1401 }, { "epoch": 0.6695319961795606, 
"grad_norm": 4.166487693786621, "learning_rate": 1.3039607727601023e-06, "loss": 1.5963, "step": 1402 }, { "epoch": 0.6700095510983763, "grad_norm": 4.866250038146973, "learning_rate": 1.3005664175211024e-06, "loss": 1.5191, "step": 1403 }, { "epoch": 0.670487106017192, "grad_norm": 4.169666290283203, "learning_rate": 1.297174932111079e-06, "loss": 1.4852, "step": 1404 }, { "epoch": 0.6709646609360076, "grad_norm": 7.143589973449707, "learning_rate": 1.2937863246446824e-06, "loss": 1.2295, "step": 1405 }, { "epoch": 0.6714422158548233, "grad_norm": 3.8113584518432617, "learning_rate": 1.290400603229674e-06, "loss": 1.303, "step": 1406 }, { "epoch": 0.671919770773639, "grad_norm": 4.575613975524902, "learning_rate": 1.2870177759669134e-06, "loss": 1.3793, "step": 1407 }, { "epoch": 0.6723973256924546, "grad_norm": 5.343461036682129, "learning_rate": 1.283637850950334e-06, "loss": 1.0979, "step": 1408 }, { "epoch": 0.6728748806112703, "grad_norm": 5.123992919921875, "learning_rate": 1.2802608362669256e-06, "loss": 0.94, "step": 1409 }, { "epoch": 0.673352435530086, "grad_norm": 6.319722652435303, "learning_rate": 1.2768867399967133e-06, "loss": 1.2498, "step": 1410 }, { "epoch": 0.6738299904489016, "grad_norm": 5.081634521484375, "learning_rate": 1.2735155702127411e-06, "loss": 0.9467, "step": 1411 }, { "epoch": 0.6743075453677173, "grad_norm": 4.147507667541504, "learning_rate": 1.2701473349810506e-06, "loss": 1.1314, "step": 1412 }, { "epoch": 0.6747851002865329, "grad_norm": 5.011214733123779, "learning_rate": 1.2667820423606609e-06, "loss": 1.8826, "step": 1413 }, { "epoch": 0.6752626552053486, "grad_norm": 4.568010330200195, "learning_rate": 1.2634197004035512e-06, "loss": 1.4824, "step": 1414 }, { "epoch": 0.6757402101241643, "grad_norm": 4.315800666809082, "learning_rate": 1.2600603171546421e-06, "loss": 1.5648, "step": 1415 }, { "epoch": 0.6762177650429799, "grad_norm": 4.358720779418945, "learning_rate": 1.2567039006517723e-06, "loss": 0.9054, "step": 1416 
}, { "epoch": 0.6766953199617957, "grad_norm": 4.742010593414307, "learning_rate": 1.2533504589256833e-06, "loss": 1.2201, "step": 1417 }, { "epoch": 0.6771728748806113, "grad_norm": 4.473992824554443, "learning_rate": 1.2500000000000007e-06, "loss": 1.7572, "step": 1418 }, { "epoch": 0.6776504297994269, "grad_norm": 5.044095039367676, "learning_rate": 1.2466525318912107e-06, "loss": 1.0364, "step": 1419 }, { "epoch": 0.6781279847182426, "grad_norm": 4.467647552490234, "learning_rate": 1.2433080626086447e-06, "loss": 0.9038, "step": 1420 }, { "epoch": 0.6786055396370583, "grad_norm": 4.339723110198975, "learning_rate": 1.2399666001544594e-06, "loss": 1.6966, "step": 1421 }, { "epoch": 0.6790830945558739, "grad_norm": 5.6728925704956055, "learning_rate": 1.2366281525236165e-06, "loss": 0.6617, "step": 1422 }, { "epoch": 0.6795606494746896, "grad_norm": 4.681081771850586, "learning_rate": 1.2332927277038641e-06, "loss": 0.8966, "step": 1423 }, { "epoch": 0.6800382043935053, "grad_norm": 5.76029634475708, "learning_rate": 1.2299603336757196e-06, "loss": 1.41, "step": 1424 }, { "epoch": 0.6805157593123209, "grad_norm": 4.421393871307373, "learning_rate": 1.2266309784124453e-06, "loss": 1.7917, "step": 1425 }, { "epoch": 0.6809933142311366, "grad_norm": 3.6917643547058105, "learning_rate": 1.2233046698800343e-06, "loss": 0.7876, "step": 1426 }, { "epoch": 0.6814708691499523, "grad_norm": 4.381502151489258, "learning_rate": 1.219981416037193e-06, "loss": 1.3932, "step": 1427 }, { "epoch": 0.6819484240687679, "grad_norm": 4.129003047943115, "learning_rate": 1.2166612248353145e-06, "loss": 1.8062, "step": 1428 }, { "epoch": 0.6824259789875836, "grad_norm": 6.795845985412598, "learning_rate": 1.2133441042184667e-06, "loss": 1.5764, "step": 1429 }, { "epoch": 0.6829035339063992, "grad_norm": 5.152336597442627, "learning_rate": 1.2100300621233702e-06, "loss": 1.7225, "step": 1430 }, { "epoch": 0.6833810888252149, "grad_norm": 5.064678192138672, "learning_rate": 
1.2067191064793792e-06, "loss": 1.6414, "step": 1431 }, { "epoch": 0.6838586437440306, "grad_norm": 4.824307918548584, "learning_rate": 1.2034112452084627e-06, "loss": 1.0408, "step": 1432 }, { "epoch": 0.6843361986628462, "grad_norm": 5.403545379638672, "learning_rate": 1.2001064862251869e-06, "loss": 1.4404, "step": 1433 }, { "epoch": 0.6848137535816619, "grad_norm": 3.7860655784606934, "learning_rate": 1.196804837436695e-06, "loss": 1.2915, "step": 1434 }, { "epoch": 0.6852913085004776, "grad_norm": 4.275695323944092, "learning_rate": 1.1935063067426885e-06, "loss": 2.2038, "step": 1435 }, { "epoch": 0.6857688634192932, "grad_norm": 3.908977508544922, "learning_rate": 1.1902109020354092e-06, "loss": 1.8973, "step": 1436 }, { "epoch": 0.6862464183381088, "grad_norm": 4.049367904663086, "learning_rate": 1.1869186311996179e-06, "loss": 0.9681, "step": 1437 }, { "epoch": 0.6867239732569246, "grad_norm": 4.881453037261963, "learning_rate": 1.183629502112578e-06, "loss": 1.7841, "step": 1438 }, { "epoch": 0.6872015281757402, "grad_norm": 4.828319549560547, "learning_rate": 1.1803435226440363e-06, "loss": 1.5398, "step": 1439 }, { "epoch": 0.6876790830945558, "grad_norm": 4.487203121185303, "learning_rate": 1.1770607006562041e-06, "loss": 2.262, "step": 1440 }, { "epoch": 0.6881566380133716, "grad_norm": 4.378173828125, "learning_rate": 1.1737810440037345e-06, "loss": 2.0023, "step": 1441 }, { "epoch": 0.6886341929321872, "grad_norm": 4.399718284606934, "learning_rate": 1.1705045605337107e-06, "loss": 1.9581, "step": 1442 }, { "epoch": 0.6891117478510028, "grad_norm": 5.169201850891113, "learning_rate": 1.1672312580856227e-06, "loss": 1.3738, "step": 1443 }, { "epoch": 0.6895893027698186, "grad_norm": 5.178003787994385, "learning_rate": 1.1639611444913486e-06, "loss": 1.1225, "step": 1444 }, { "epoch": 0.6900668576886342, "grad_norm": 5.3259806632995605, "learning_rate": 1.1606942275751376e-06, "loss": 1.5052, "step": 1445 }, { "epoch": 0.6905444126074498, "grad_norm": 
5.658997058868408, "learning_rate": 1.1574305151535897e-06, "loss": 1.3704, "step": 1446 }, { "epoch": 0.6910219675262655, "grad_norm": 4.347021102905273, "learning_rate": 1.154170015035638e-06, "loss": 2.1729, "step": 1447 }, { "epoch": 0.6914995224450812, "grad_norm": 4.746464252471924, "learning_rate": 1.1509127350225296e-06, "loss": 1.6563, "step": 1448 }, { "epoch": 0.6919770773638968, "grad_norm": 4.785487651824951, "learning_rate": 1.1476586829078079e-06, "loss": 1.6738, "step": 1449 }, { "epoch": 0.6924546322827125, "grad_norm": 4.70460319519043, "learning_rate": 1.14440786647729e-06, "loss": 1.1252, "step": 1450 }, { "epoch": 0.6929321872015282, "grad_norm": 5.207343101501465, "learning_rate": 1.1411602935090546e-06, "loss": 1.6196, "step": 1451 }, { "epoch": 0.6934097421203438, "grad_norm": 4.695589542388916, "learning_rate": 1.1379159717734183e-06, "loss": 1.0519, "step": 1452 }, { "epoch": 0.6938872970391595, "grad_norm": 4.811611175537109, "learning_rate": 1.134674909032919e-06, "loss": 1.3178, "step": 1453 }, { "epoch": 0.6943648519579751, "grad_norm": 4.456569194793701, "learning_rate": 1.131437113042297e-06, "loss": 0.8492, "step": 1454 }, { "epoch": 0.6948424068767909, "grad_norm": 4.2948832511901855, "learning_rate": 1.1282025915484757e-06, "loss": 1.1546, "step": 1455 }, { "epoch": 0.6953199617956065, "grad_norm": 3.9289231300354004, "learning_rate": 1.124971352290545e-06, "loss": 1.5432, "step": 1456 }, { "epoch": 0.6957975167144221, "grad_norm": 3.4965667724609375, "learning_rate": 1.1217434029997404e-06, "loss": 0.8263, "step": 1457 }, { "epoch": 0.6962750716332379, "grad_norm": 3.7221903800964355, "learning_rate": 1.1185187513994263e-06, "loss": 0.8115, "step": 1458 }, { "epoch": 0.6967526265520535, "grad_norm": 4.80462646484375, "learning_rate": 1.115297405205078e-06, "loss": 1.848, "step": 1459 }, { "epoch": 0.6972301814708691, "grad_norm": 4.702608108520508, "learning_rate": 1.112079372124258e-06, "loss": 1.3289, "step": 1460 }, { "epoch": 
0.6977077363896849, "grad_norm": 4.096073150634766, "learning_rate": 1.1088646598566064e-06, "loss": 2.0959, "step": 1461 }, { "epoch": 0.6981852913085005, "grad_norm": 5.059751510620117, "learning_rate": 1.1056532760938148e-06, "loss": 0.9773, "step": 1462 }, { "epoch": 0.6986628462273161, "grad_norm": 4.160397529602051, "learning_rate": 1.1024452285196128e-06, "loss": 1.4168, "step": 1463 }, { "epoch": 0.6991404011461319, "grad_norm": 5.012479782104492, "learning_rate": 1.0992405248097462e-06, "loss": 1.7554, "step": 1464 }, { "epoch": 0.6996179560649475, "grad_norm": 4.197133541107178, "learning_rate": 1.0960391726319605e-06, "loss": 1.3236, "step": 1465 }, { "epoch": 0.7000955109837631, "grad_norm": 4.568011283874512, "learning_rate": 1.0928411796459826e-06, "loss": 1.8122, "step": 1466 }, { "epoch": 0.7005730659025788, "grad_norm": 4.416896343231201, "learning_rate": 1.0896465535035015e-06, "loss": 1.651, "step": 1467 }, { "epoch": 0.7010506208213945, "grad_norm": 3.819755792617798, "learning_rate": 1.0864553018481513e-06, "loss": 1.8698, "step": 1468 }, { "epoch": 0.7015281757402101, "grad_norm": 5.0370025634765625, "learning_rate": 1.0832674323154894e-06, "loss": 1.1759, "step": 1469 }, { "epoch": 0.7020057306590258, "grad_norm": 5.394114017486572, "learning_rate": 1.0800829525329851e-06, "loss": 0.8247, "step": 1470 }, { "epoch": 0.7024832855778415, "grad_norm": 3.716905117034912, "learning_rate": 1.0769018701199943e-06, "loss": 1.3513, "step": 1471 }, { "epoch": 0.7029608404966571, "grad_norm": 4.491416931152344, "learning_rate": 1.0737241926877457e-06, "loss": 1.7576, "step": 1472 }, { "epoch": 0.7034383954154728, "grad_norm": 4.9028544425964355, "learning_rate": 1.0705499278393193e-06, "loss": 1.4646, "step": 1473 }, { "epoch": 0.7039159503342884, "grad_norm": 4.973870277404785, "learning_rate": 1.0673790831696323e-06, "loss": 1.2356, "step": 1474 }, { "epoch": 0.7043935052531041, "grad_norm": 5.244875907897949, "learning_rate": 1.0642116662654172e-06, 
"loss": 1.2848, "step": 1475 }, { "epoch": 0.7048710601719198, "grad_norm": 4.285227298736572, "learning_rate": 1.061047684705204e-06, "loss": 0.9779, "step": 1476 }, { "epoch": 0.7053486150907354, "grad_norm": 3.8212080001831055, "learning_rate": 1.0578871460593048e-06, "loss": 1.0223, "step": 1477 }, { "epoch": 0.7058261700095511, "grad_norm": 4.8436055183410645, "learning_rate": 1.0547300578897936e-06, "loss": 0.9373, "step": 1478 }, { "epoch": 0.7063037249283668, "grad_norm": 4.620527744293213, "learning_rate": 1.0515764277504887e-06, "loss": 1.3182, "step": 1479 }, { "epoch": 0.7067812798471824, "grad_norm": 5.405435562133789, "learning_rate": 1.0484262631869343e-06, "loss": 1.808, "step": 1480 }, { "epoch": 0.707258834765998, "grad_norm": 4.6564249992370605, "learning_rate": 1.0452795717363826e-06, "loss": 1.483, "step": 1481 }, { "epoch": 0.7077363896848138, "grad_norm": 4.656008720397949, "learning_rate": 1.0421363609277756e-06, "loss": 2.0519, "step": 1482 }, { "epoch": 0.7082139446036294, "grad_norm": 5.509772300720215, "learning_rate": 1.0389966382817285e-06, "loss": 1.5827, "step": 1483 }, { "epoch": 0.708691499522445, "grad_norm": 5.708768367767334, "learning_rate": 1.03586041131051e-06, "loss": 1.1779, "step": 1484 }, { "epoch": 0.7091690544412608, "grad_norm": 4.705165386199951, "learning_rate": 1.0327276875180232e-06, "loss": 0.9783, "step": 1485 }, { "epoch": 0.7096466093600764, "grad_norm": 5.039867877960205, "learning_rate": 1.0295984743997911e-06, "loss": 1.1026, "step": 1486 }, { "epoch": 0.710124164278892, "grad_norm": 4.480222702026367, "learning_rate": 1.026472779442937e-06, "loss": 1.181, "step": 1487 }, { "epoch": 0.7106017191977078, "grad_norm": 4.686654090881348, "learning_rate": 1.0233506101261656e-06, "loss": 0.772, "step": 1488 }, { "epoch": 0.7110792741165234, "grad_norm": 4.707120895385742, "learning_rate": 1.020231973919746e-06, "loss": 1.2349, "step": 1489 }, { "epoch": 0.711556829035339, "grad_norm": 5.109123229980469, 
"learning_rate": 1.0171168782854948e-06, "loss": 1.6251, "step": 1490 }, { "epoch": 0.7120343839541547, "grad_norm": 4.417863845825195, "learning_rate": 1.014005330676756e-06, "loss": 1.8463, "step": 1491 }, { "epoch": 0.7125119388729704, "grad_norm": 5.041341781616211, "learning_rate": 1.0108973385383852e-06, "loss": 1.0521, "step": 1492 }, { "epoch": 0.7129894937917861, "grad_norm": 4.591705799102783, "learning_rate": 1.0077929093067313e-06, "loss": 1.4454, "step": 1493 }, { "epoch": 0.7134670487106017, "grad_norm": 5.01251220703125, "learning_rate": 1.0046920504096163e-06, "loss": 1.1288, "step": 1494 }, { "epoch": 0.7139446036294174, "grad_norm": 5.022948265075684, "learning_rate": 1.001594769266322e-06, "loss": 1.1697, "step": 1495 }, { "epoch": 0.7144221585482331, "grad_norm": 4.472610950469971, "learning_rate": 9.985010732875686e-07, "loss": 1.2016, "step": 1496 }, { "epoch": 0.7148997134670487, "grad_norm": 4.808659076690674, "learning_rate": 9.954109698754993e-07, "loss": 1.8441, "step": 1497 }, { "epoch": 0.7153772683858644, "grad_norm": 5.014040470123291, "learning_rate": 9.923244664236603e-07, "loss": 0.9167, "step": 1498 }, { "epoch": 0.7158548233046801, "grad_norm": 4.599469184875488, "learning_rate": 9.892415703169856e-07, "loss": 1.4527, "step": 1499 }, { "epoch": 0.7163323782234957, "grad_norm": 4.6738152503967285, "learning_rate": 9.861622889317769e-07, "loss": 1.5674, "step": 1500 }, { "epoch": 0.7168099331423113, "grad_norm": 4.210339069366455, "learning_rate": 9.830866296356875e-07, "loss": 1.7236, "step": 1501 }, { "epoch": 0.7172874880611271, "grad_norm": 4.880158424377441, "learning_rate": 9.800145997877047e-07, "loss": 1.2149, "step": 1502 }, { "epoch": 0.7177650429799427, "grad_norm": 4.30711555480957, "learning_rate": 9.769462067381327e-07, "loss": 1.6535, "step": 1503 }, { "epoch": 0.7182425978987583, "grad_norm": 4.5115790367126465, "learning_rate": 9.738814578285705e-07, "loss": 1.7221, "step": 1504 }, { "epoch": 0.7187201528175741, 
"grad_norm": 4.371654033660889, "learning_rate": 9.708203603919016e-07, "loss": 1.3543, "step": 1505 }, { "epoch": 0.7191977077363897, "grad_norm": 4.336892127990723, "learning_rate": 9.677629217522717e-07, "loss": 1.6696, "step": 1506 }, { "epoch": 0.7196752626552053, "grad_norm": 5.033395290374756, "learning_rate": 9.647091492250716e-07, "loss": 1.4511, "step": 1507 }, { "epoch": 0.720152817574021, "grad_norm": 4.743730545043945, "learning_rate": 9.616590501169212e-07, "loss": 1.9581, "step": 1508 }, { "epoch": 0.7206303724928367, "grad_norm": 3.999676465988159, "learning_rate": 9.586126317256512e-07, "loss": 0.7151, "step": 1509 }, { "epoch": 0.7211079274116523, "grad_norm": 3.999169111251831, "learning_rate": 9.555699013402845e-07, "loss": 0.9818, "step": 1510 }, { "epoch": 0.721585482330468, "grad_norm": 5.072868347167969, "learning_rate": 9.525308662410209e-07, "loss": 1.8876, "step": 1511 }, { "epoch": 0.7220630372492837, "grad_norm": 4.083013534545898, "learning_rate": 9.494955336992187e-07, "loss": 1.6371, "step": 1512 }, { "epoch": 0.7225405921680993, "grad_norm": 4.039516448974609, "learning_rate": 9.464639109773768e-07, "loss": 1.3366, "step": 1513 }, { "epoch": 0.723018147086915, "grad_norm": 3.573850154876709, "learning_rate": 9.43436005329118e-07, "loss": 1.3131, "step": 1514 }, { "epoch": 0.7234957020057307, "grad_norm": 4.1265997886657715, "learning_rate": 9.404118239991713e-07, "loss": 2.1315, "step": 1515 }, { "epoch": 0.7239732569245463, "grad_norm": 4.2829976081848145, "learning_rate": 9.37391374223355e-07, "loss": 1.8102, "step": 1516 }, { "epoch": 0.724450811843362, "grad_norm": 4.247711658477783, "learning_rate": 9.343746632285588e-07, "loss": 1.6789, "step": 1517 }, { "epoch": 0.7249283667621776, "grad_norm": 4.519783973693848, "learning_rate": 9.313616982327264e-07, "loss": 1.9935, "step": 1518 }, { "epoch": 0.7254059216809933, "grad_norm": 4.436924934387207, "learning_rate": 9.283524864448406e-07, "loss": 1.0137, "step": 1519 }, { 
"epoch": 0.725883476599809, "grad_norm": 3.8087387084960938, "learning_rate": 9.253470350649007e-07, "loss": 1.1474, "step": 1520 }, { "epoch": 0.7263610315186246, "grad_norm": 4.623415470123291, "learning_rate": 9.223453512839109e-07, "loss": 1.3386, "step": 1521 }, { "epoch": 0.7268385864374403, "grad_norm": 4.818606376647949, "learning_rate": 9.19347442283861e-07, "loss": 1.1169, "step": 1522 }, { "epoch": 0.727316141356256, "grad_norm": 4.151885509490967, "learning_rate": 9.163533152377082e-07, "loss": 1.4354, "step": 1523 }, { "epoch": 0.7277936962750716, "grad_norm": 4.481689453125, "learning_rate": 9.133629773093614e-07, "loss": 2.1068, "step": 1524 }, { "epoch": 0.7282712511938872, "grad_norm": 3.5279688835144043, "learning_rate": 9.103764356536626e-07, "loss": 1.5545, "step": 1525 }, { "epoch": 0.728748806112703, "grad_norm": 4.237649917602539, "learning_rate": 9.073936974163716e-07, "loss": 1.2988, "step": 1526 }, { "epoch": 0.7292263610315186, "grad_norm": 3.62979793548584, "learning_rate": 9.044147697341477e-07, "loss": 1.6422, "step": 1527 }, { "epoch": 0.7297039159503342, "grad_norm": 5.13606595993042, "learning_rate": 9.014396597345331e-07, "loss": 2.0696, "step": 1528 }, { "epoch": 0.73018147086915, "grad_norm": 5.3140034675598145, "learning_rate": 8.984683745359337e-07, "loss": 1.1765, "step": 1529 }, { "epoch": 0.7306590257879656, "grad_norm": 4.788644313812256, "learning_rate": 8.955009212476063e-07, "loss": 1.555, "step": 1530 }, { "epoch": 0.7311365807067812, "grad_norm": 4.193361282348633, "learning_rate": 8.925373069696386e-07, "loss": 2.11, "step": 1531 }, { "epoch": 0.731614135625597, "grad_norm": 4.870258331298828, "learning_rate": 8.895775387929323e-07, "loss": 1.1273, "step": 1532 }, { "epoch": 0.7320916905444126, "grad_norm": 5.236242294311523, "learning_rate": 8.866216237991875e-07, "loss": 1.2166, "step": 1533 }, { "epoch": 0.7325692454632283, "grad_norm": 4.146707057952881, "learning_rate": 8.836695690608846e-07, "loss": 1.397, 
"step": 1534 }, { "epoch": 0.733046800382044, "grad_norm": 5.782552719116211, "learning_rate": 8.807213816412674e-07, "loss": 2.1707, "step": 1535 }, { "epoch": 0.7335243553008596, "grad_norm": 5.384231090545654, "learning_rate": 8.777770685943274e-07, "loss": 1.2756, "step": 1536 }, { "epoch": 0.7340019102196753, "grad_norm": 3.801945209503174, "learning_rate": 8.748366369647862e-07, "loss": 0.9415, "step": 1537 }, { "epoch": 0.7344794651384909, "grad_norm": 4.740548133850098, "learning_rate": 8.719000937880758e-07, "loss": 1.413, "step": 1538 }, { "epoch": 0.7349570200573066, "grad_norm": 4.9530110359191895, "learning_rate": 8.68967446090328e-07, "loss": 1.2405, "step": 1539 }, { "epoch": 0.7354345749761223, "grad_norm": 4.895037651062012, "learning_rate": 8.66038700888352e-07, "loss": 1.5504, "step": 1540 }, { "epoch": 0.7359121298949379, "grad_norm": 6.102834701538086, "learning_rate": 8.63113865189621e-07, "loss": 1.2938, "step": 1541 }, { "epoch": 0.7363896848137536, "grad_norm": 5.350526332855225, "learning_rate": 8.601929459922528e-07, "loss": 1.3867, "step": 1542 }, { "epoch": 0.7368672397325693, "grad_norm": 4.127810001373291, "learning_rate": 8.572759502849953e-07, "loss": 0.922, "step": 1543 }, { "epoch": 0.7373447946513849, "grad_norm": 3.714482069015503, "learning_rate": 8.543628850472085e-07, "loss": 1.2161, "step": 1544 }, { "epoch": 0.7378223495702005, "grad_norm": 4.209648132324219, "learning_rate": 8.514537572488479e-07, "loss": 1.8269, "step": 1545 }, { "epoch": 0.7382999044890163, "grad_norm": 5.242568492889404, "learning_rate": 8.48548573850449e-07, "loss": 1.249, "step": 1546 }, { "epoch": 0.7387774594078319, "grad_norm": 7.4550251960754395, "learning_rate": 8.456473418031091e-07, "loss": 1.5071, "step": 1547 }, { "epoch": 0.7392550143266475, "grad_norm": 5.021182060241699, "learning_rate": 8.427500680484704e-07, "loss": 1.6093, "step": 1548 }, { "epoch": 0.7397325692454633, "grad_norm": 4.546686172485352, "learning_rate": 
8.398567595187051e-07, "loss": 1.1493, "step": 1549 }, { "epoch": 0.7402101241642789, "grad_norm": 6.3026885986328125, "learning_rate": 8.369674231364988e-07, "loss": 1.2838, "step": 1550 }, { "epoch": 0.7406876790830945, "grad_norm": 4.062827110290527, "learning_rate": 8.340820658150317e-07, "loss": 1.2646, "step": 1551 }, { "epoch": 0.7411652340019103, "grad_norm": 3.952110767364502, "learning_rate": 8.312006944579648e-07, "loss": 1.3326, "step": 1552 }, { "epoch": 0.7416427889207259, "grad_norm": 4.022990703582764, "learning_rate": 8.283233159594209e-07, "loss": 1.529, "step": 1553 }, { "epoch": 0.7421203438395415, "grad_norm": 5.089108467102051, "learning_rate": 8.254499372039698e-07, "loss": 1.9838, "step": 1554 }, { "epoch": 0.7425978987583572, "grad_norm": 4.606695652008057, "learning_rate": 8.225805650666116e-07, "loss": 1.3077, "step": 1555 }, { "epoch": 0.7430754536771729, "grad_norm": 4.6830573081970215, "learning_rate": 8.197152064127592e-07, "loss": 1.3504, "step": 1556 }, { "epoch": 0.7435530085959885, "grad_norm": 4.696037769317627, "learning_rate": 8.168538680982235e-07, "loss": 2.2137, "step": 1557 }, { "epoch": 0.7440305635148042, "grad_norm": 4.799436569213867, "learning_rate": 8.139965569691955e-07, "loss": 1.3653, "step": 1558 }, { "epoch": 0.7445081184336199, "grad_norm": 4.6056647300720215, "learning_rate": 8.111432798622304e-07, "loss": 1.7909, "step": 1559 }, { "epoch": 0.7449856733524355, "grad_norm": 5.008808135986328, "learning_rate": 8.082940436042322e-07, "loss": 1.7379, "step": 1560 }, { "epoch": 0.7454632282712512, "grad_norm": 3.5271825790405273, "learning_rate": 8.05448855012436e-07, "loss": 1.5462, "step": 1561 }, { "epoch": 0.7459407831900668, "grad_norm": 4.470743179321289, "learning_rate": 8.026077208943916e-07, "loss": 1.6501, "step": 1562 }, { "epoch": 0.7464183381088825, "grad_norm": 4.895837783813477, "learning_rate": 7.997706480479503e-07, "loss": 1.063, "step": 1563 }, { "epoch": 0.7468958930276982, "grad_norm": 
3.992450475692749, "learning_rate": 7.969376432612419e-07, "loss": 1.3026, "step": 1564 }, { "epoch": 0.7473734479465138, "grad_norm": 4.861988067626953, "learning_rate": 7.941087133126669e-07, "loss": 0.9077, "step": 1565 }, { "epoch": 0.7478510028653295, "grad_norm": 5.169985771179199, "learning_rate": 7.91283864970874e-07, "loss": 1.7227, "step": 1566 }, { "epoch": 0.7483285577841452, "grad_norm": 4.899754047393799, "learning_rate": 7.884631049947463e-07, "loss": 1.5294, "step": 1567 }, { "epoch": 0.7488061127029608, "grad_norm": 4.892877578735352, "learning_rate": 7.856464401333857e-07, "loss": 0.8967, "step": 1568 }, { "epoch": 0.7492836676217765, "grad_norm": 4.563969135284424, "learning_rate": 7.828338771260948e-07, "loss": 1.1261, "step": 1569 }, { "epoch": 0.7497612225405922, "grad_norm": 5.472959995269775, "learning_rate": 7.800254227023629e-07, "loss": 1.9644, "step": 1570 }, { "epoch": 0.7502387774594078, "grad_norm": 5.182770729064941, "learning_rate": 7.772210835818475e-07, "loss": 1.4513, "step": 1571 }, { "epoch": 0.7507163323782235, "grad_norm": 4.7353291511535645, "learning_rate": 7.744208664743619e-07, "loss": 0.8022, "step": 1572 }, { "epoch": 0.7511938872970392, "grad_norm": 4.326401710510254, "learning_rate": 7.716247780798535e-07, "loss": 1.0387, "step": 1573 }, { "epoch": 0.7516714422158548, "grad_norm": 5.534078121185303, "learning_rate": 7.688328250883941e-07, "loss": 2.3541, "step": 1574 }, { "epoch": 0.7521489971346705, "grad_norm": 4.415371417999268, "learning_rate": 7.660450141801599e-07, "loss": 1.4296, "step": 1575 }, { "epoch": 0.7526265520534862, "grad_norm": 4.182026386260986, "learning_rate": 7.632613520254159e-07, "loss": 1.5142, "step": 1576 }, { "epoch": 0.7531041069723018, "grad_norm": 4.76546573638916, "learning_rate": 7.604818452845014e-07, "loss": 1.4418, "step": 1577 }, { "epoch": 0.7535816618911175, "grad_norm": 4.731568813323975, "learning_rate": 7.577065006078133e-07, "loss": 1.2695, "step": 1578 }, { "epoch": 
0.7540592168099332, "grad_norm": 4.328670501708984, "learning_rate": 7.549353246357896e-07, "loss": 0.8022, "step": 1579 }, { "epoch": 0.7545367717287488, "grad_norm": 5.172923564910889, "learning_rate": 7.521683239988939e-07, "loss": 0.93, "step": 1580 }, { "epoch": 0.7550143266475645, "grad_norm": 5.530806541442871, "learning_rate": 7.494055053176014e-07, "loss": 1.3518, "step": 1581 }, { "epoch": 0.7554918815663801, "grad_norm": 4.717085361480713, "learning_rate": 7.466468752023778e-07, "loss": 1.1869, "step": 1582 }, { "epoch": 0.7559694364851958, "grad_norm": 4.688042163848877, "learning_rate": 7.438924402536702e-07, "loss": 0.844, "step": 1583 }, { "epoch": 0.7564469914040115, "grad_norm": 5.805454254150391, "learning_rate": 7.411422070618868e-07, "loss": 1.3697, "step": 1584 }, { "epoch": 0.7569245463228271, "grad_norm": 3.9898247718811035, "learning_rate": 7.383961822073832e-07, "loss": 1.3409, "step": 1585 }, { "epoch": 0.7574021012416428, "grad_norm": 4.2808756828308105, "learning_rate": 7.35654372260445e-07, "loss": 1.2413, "step": 1586 }, { "epoch": 0.7578796561604585, "grad_norm": 4.584319114685059, "learning_rate": 7.329167837812737e-07, "loss": 1.7374, "step": 1587 }, { "epoch": 0.7583572110792741, "grad_norm": 3.8650591373443604, "learning_rate": 7.301834233199698e-07, "loss": 1.2591, "step": 1588 }, { "epoch": 0.7588347659980897, "grad_norm": 4.255753517150879, "learning_rate": 7.274542974165177e-07, "loss": 0.9004, "step": 1589 }, { "epoch": 0.7593123209169055, "grad_norm": 5.122644901275635, "learning_rate": 7.247294126007704e-07, "loss": 1.6909, "step": 1590 }, { "epoch": 0.7597898758357211, "grad_norm": 4.647463321685791, "learning_rate": 7.220087753924334e-07, "loss": 1.3089, "step": 1591 }, { "epoch": 0.7602674307545367, "grad_norm": 3.800870418548584, "learning_rate": 7.192923923010481e-07, "loss": 1.195, "step": 1592 }, { "epoch": 0.7607449856733525, "grad_norm": 4.687005519866943, "learning_rate": 7.16580269825978e-07, "loss": 1.6943, 
"step": 1593 }, { "epoch": 0.7612225405921681, "grad_norm": 3.6567559242248535, "learning_rate": 7.138724144563933e-07, "loss": 0.7733, "step": 1594 }, { "epoch": 0.7617000955109837, "grad_norm": 4.723353385925293, "learning_rate": 7.111688326712529e-07, "loss": 2.0925, "step": 1595 }, { "epoch": 0.7621776504297995, "grad_norm": 4.447376728057861, "learning_rate": 7.084695309392916e-07, "loss": 1.2798, "step": 1596 }, { "epoch": 0.7626552053486151, "grad_norm": 5.576816082000732, "learning_rate": 7.057745157190032e-07, "loss": 1.8017, "step": 1597 }, { "epoch": 0.7631327602674307, "grad_norm": 5.947661876678467, "learning_rate": 7.030837934586254e-07, "loss": 1.014, "step": 1598 }, { "epoch": 0.7636103151862464, "grad_norm": 5.113620281219482, "learning_rate": 7.003973705961242e-07, "loss": 2.0231, "step": 1599 }, { "epoch": 0.7640878701050621, "grad_norm": 5.216801166534424, "learning_rate": 6.977152535591786e-07, "loss": 1.6055, "step": 1600 }, { "epoch": 0.7645654250238777, "grad_norm": 4.786805152893066, "learning_rate": 6.950374487651654e-07, "loss": 1.0622, "step": 1601 }, { "epoch": 0.7650429799426934, "grad_norm": 4.163427829742432, "learning_rate": 6.923639626211437e-07, "loss": 1.698, "step": 1602 }, { "epoch": 0.7655205348615091, "grad_norm": 3.5758397579193115, "learning_rate": 6.896948015238397e-07, "loss": 1.0622, "step": 1603 }, { "epoch": 0.7659980897803247, "grad_norm": 4.483384132385254, "learning_rate": 6.870299718596307e-07, "loss": 2.1517, "step": 1604 }, { "epoch": 0.7664756446991404, "grad_norm": 4.212930202484131, "learning_rate": 6.843694800045309e-07, "loss": 0.7799, "step": 1605 }, { "epoch": 0.766953199617956, "grad_norm": 7.407425880432129, "learning_rate": 6.817133323241757e-07, "loss": 1.5184, "step": 1606 }, { "epoch": 0.7674307545367717, "grad_norm": 4.783387184143066, "learning_rate": 6.790615351738064e-07, "loss": 1.9077, "step": 1607 }, { "epoch": 0.7679083094555874, "grad_norm": 3.754189968109131, "learning_rate": 
6.76414094898254e-07, "loss": 1.276, "step": 1608 }, { "epoch": 0.768385864374403, "grad_norm": 4.8138532638549805, "learning_rate": 6.737710178319259e-07, "loss": 0.7626, "step": 1609 }, { "epoch": 0.7688634192932188, "grad_norm": 4.742074489593506, "learning_rate": 6.711323102987901e-07, "loss": 1.8808, "step": 1610 }, { "epoch": 0.7693409742120344, "grad_norm": 3.9724607467651367, "learning_rate": 6.684979786123596e-07, "loss": 1.7224, "step": 1611 }, { "epoch": 0.76981852913085, "grad_norm": 5.065790176391602, "learning_rate": 6.658680290756769e-07, "loss": 1.9428, "step": 1612 }, { "epoch": 0.7702960840496658, "grad_norm": 4.6834797859191895, "learning_rate": 6.632424679813001e-07, "loss": 1.8375, "step": 1613 }, { "epoch": 0.7707736389684814, "grad_norm": 5.136268138885498, "learning_rate": 6.606213016112875e-07, "loss": 1.3623, "step": 1614 }, { "epoch": 0.771251193887297, "grad_norm": 4.613232612609863, "learning_rate": 6.580045362371818e-07, "loss": 1.8366, "step": 1615 }, { "epoch": 0.7717287488061128, "grad_norm": 4.902231693267822, "learning_rate": 6.553921781199965e-07, "loss": 1.3598, "step": 1616 }, { "epoch": 0.7722063037249284, "grad_norm": 4.118681907653809, "learning_rate": 6.52784233510198e-07, "loss": 1.2446, "step": 1617 }, { "epoch": 0.772683858643744, "grad_norm": 5.077035427093506, "learning_rate": 6.501807086476952e-07, "loss": 1.4385, "step": 1618 }, { "epoch": 0.7731614135625597, "grad_norm": 4.566662311553955, "learning_rate": 6.475816097618206e-07, "loss": 1.5501, "step": 1619 }, { "epoch": 0.7736389684813754, "grad_norm": 5.179864406585693, "learning_rate": 6.449869430713177e-07, "loss": 1.6714, "step": 1620 }, { "epoch": 0.774116523400191, "grad_norm": 5.659124851226807, "learning_rate": 6.423967147843244e-07, "loss": 1.6005, "step": 1621 }, { "epoch": 0.7745940783190067, "grad_norm": 4.823684215545654, "learning_rate": 6.398109310983596e-07, "loss": 0.7502, "step": 1622 }, { "epoch": 0.7750716332378224, "grad_norm": 
5.705247402191162, "learning_rate": 6.372295982003082e-07, "loss": 1.3125, "step": 1623 }, { "epoch": 0.775549188156638, "grad_norm": 8.578139305114746, "learning_rate": 6.346527222664047e-07, "loss": 1.2098, "step": 1624 }, { "epoch": 0.7760267430754537, "grad_norm": 4.621988296508789, "learning_rate": 6.320803094622218e-07, "loss": 1.3522, "step": 1625 }, { "epoch": 0.7765042979942693, "grad_norm": 4.939689636230469, "learning_rate": 6.295123659426503e-07, "loss": 1.0281, "step": 1626 }, { "epoch": 0.776981852913085, "grad_norm": 4.983333587646484, "learning_rate": 6.269488978518897e-07, "loss": 1.5455, "step": 1627 }, { "epoch": 0.7774594078319007, "grad_norm": 4.825756549835205, "learning_rate": 6.243899113234317e-07, "loss": 0.8094, "step": 1628 }, { "epoch": 0.7779369627507163, "grad_norm": 4.433287620544434, "learning_rate": 6.218354124800436e-07, "loss": 2.0221, "step": 1629 }, { "epoch": 0.778414517669532, "grad_norm": 4.493676662445068, "learning_rate": 6.192854074337562e-07, "loss": 1.3329, "step": 1630 }, { "epoch": 0.7788920725883477, "grad_norm": 5.016502380371094, "learning_rate": 6.167399022858484e-07, "loss": 1.6497, "step": 1631 }, { "epoch": 0.7793696275071633, "grad_norm": 4.083900451660156, "learning_rate": 6.141989031268317e-07, "loss": 1.7457, "step": 1632 }, { "epoch": 0.779847182425979, "grad_norm": 5.041534900665283, "learning_rate": 6.116624160364371e-07, "loss": 0.908, "step": 1633 }, { "epoch": 0.7803247373447947, "grad_norm": 4.197653770446777, "learning_rate": 6.091304470835988e-07, "loss": 0.9673, "step": 1634 }, { "epoch": 0.7808022922636103, "grad_norm": 5.202861309051514, "learning_rate": 6.066030023264422e-07, "loss": 1.7509, "step": 1635 }, { "epoch": 0.7812798471824259, "grad_norm": 3.8662710189819336, "learning_rate": 6.040800878122655e-07, "loss": 1.7202, "step": 1636 }, { "epoch": 0.7817574021012417, "grad_norm": 4.5169219970703125, "learning_rate": 6.015617095775295e-07, "loss": 1.4608, "step": 1637 }, { "epoch": 
0.7822349570200573, "grad_norm": 4.089737415313721, "learning_rate": 5.990478736478409e-07, "loss": 0.9499, "step": 1638 }, { "epoch": 0.7827125119388729, "grad_norm": 5.610507488250732, "learning_rate": 5.96538586037938e-07, "loss": 1.1364, "step": 1639 }, { "epoch": 0.7831900668576887, "grad_norm": 3.9222373962402344, "learning_rate": 5.940338527516768e-07, "loss": 0.7614, "step": 1640 }, { "epoch": 0.7836676217765043, "grad_norm": 3.933983564376831, "learning_rate": 5.915336797820159e-07, "loss": 1.8465, "step": 1641 }, { "epoch": 0.7841451766953199, "grad_norm": 4.9664530754089355, "learning_rate": 5.890380731110032e-07, "loss": 1.5264, "step": 1642 }, { "epoch": 0.7846227316141356, "grad_norm": 5.167954444885254, "learning_rate": 5.865470387097605e-07, "loss": 1.1984, "step": 1643 }, { "epoch": 0.7851002865329513, "grad_norm": 5.391510486602783, "learning_rate": 5.840605825384704e-07, "loss": 1.6073, "step": 1644 }, { "epoch": 0.7855778414517669, "grad_norm": 3.933400869369507, "learning_rate": 5.815787105463607e-07, "loss": 1.0387, "step": 1645 }, { "epoch": 0.7860553963705826, "grad_norm": 4.077244758605957, "learning_rate": 5.791014286716912e-07, "loss": 1.1196, "step": 1646 }, { "epoch": 0.7865329512893983, "grad_norm": 4.785174369812012, "learning_rate": 5.766287428417392e-07, "loss": 1.2598, "step": 1647 }, { "epoch": 0.7870105062082139, "grad_norm": 5.108158588409424, "learning_rate": 5.741606589727847e-07, "loss": 1.0809, "step": 1648 }, { "epoch": 0.7874880611270296, "grad_norm": 5.594452857971191, "learning_rate": 5.71697182970098e-07, "loss": 1.211, "step": 1649 }, { "epoch": 0.7879656160458453, "grad_norm": 4.789851188659668, "learning_rate": 5.692383207279228e-07, "loss": 2.2221, "step": 1650 }, { "epoch": 0.788443170964661, "grad_norm": 4.373819351196289, "learning_rate": 5.667840781294659e-07, "loss": 1.7461, "step": 1651 }, { "epoch": 0.7889207258834766, "grad_norm": 4.592148780822754, "learning_rate": 5.643344610468776e-07, "loss": 1.1277, 
"step": 1652 }, { "epoch": 0.7893982808022922, "grad_norm": 4.616485595703125, "learning_rate": 5.618894753412438e-07, "loss": 1.7971, "step": 1653 }, { "epoch": 0.789875835721108, "grad_norm": 4.634642601013184, "learning_rate": 5.59449126862568e-07, "loss": 1.3743, "step": 1654 }, { "epoch": 0.7903533906399236, "grad_norm": 4.453817367553711, "learning_rate": 5.570134214497585e-07, "loss": 1.2915, "step": 1655 }, { "epoch": 0.7908309455587392, "grad_norm": 4.887211322784424, "learning_rate": 5.545823649306145e-07, "loss": 1.724, "step": 1656 }, { "epoch": 0.791308500477555, "grad_norm": 4.89791202545166, "learning_rate": 5.52155963121812e-07, "loss": 1.2349, "step": 1657 }, { "epoch": 0.7917860553963706, "grad_norm": 5.789148330688477, "learning_rate": 5.497342218288898e-07, "loss": 1.5348, "step": 1658 }, { "epoch": 0.7922636103151862, "grad_norm": 4.298985958099365, "learning_rate": 5.473171468462354e-07, "loss": 1.6805, "step": 1659 }, { "epoch": 0.792741165234002, "grad_norm": 4.651891708374023, "learning_rate": 5.449047439570729e-07, "loss": 1.3613, "step": 1660 }, { "epoch": 0.7932187201528176, "grad_norm": 5.180197238922119, "learning_rate": 5.42497018933445e-07, "loss": 2.0539, "step": 1661 }, { "epoch": 0.7936962750716332, "grad_norm": 4.6201066970825195, "learning_rate": 5.40093977536204e-07, "loss": 1.3728, "step": 1662 }, { "epoch": 0.7941738299904489, "grad_norm": 4.392631530761719, "learning_rate": 5.376956255149957e-07, "loss": 1.8137, "step": 1663 }, { "epoch": 0.7946513849092646, "grad_norm": 4.740032196044922, "learning_rate": 5.35301968608245e-07, "loss": 1.7207, "step": 1664 }, { "epoch": 0.7951289398280802, "grad_norm": 4.94846773147583, "learning_rate": 5.329130125431437e-07, "loss": 1.1682, "step": 1665 }, { "epoch": 0.7956064947468959, "grad_norm": 5.055766582489014, "learning_rate": 5.305287630356363e-07, "loss": 1.131, "step": 1666 }, { "epoch": 0.7960840496657116, "grad_norm": 4.685311317443848, "learning_rate": 5.28149225790405e-07, 
"loss": 1.3918, "step": 1667 }, { "epoch": 0.7965616045845272, "grad_norm": 5.187441349029541, "learning_rate": 5.257744065008585e-07, "loss": 1.0385, "step": 1668 }, { "epoch": 0.7970391595033429, "grad_norm": 4.361275672912598, "learning_rate": 5.234043108491174e-07, "loss": 1.5097, "step": 1669 }, { "epoch": 0.7975167144221585, "grad_norm": 4.36018180847168, "learning_rate": 5.210389445059974e-07, "loss": 1.9649, "step": 1670 }, { "epoch": 0.7979942693409742, "grad_norm": 5.459512233734131, "learning_rate": 5.186783131310016e-07, "loss": 1.4704, "step": 1671 }, { "epoch": 0.7984718242597899, "grad_norm": 4.589227199554443, "learning_rate": 5.163224223723032e-07, "loss": 1.0878, "step": 1672 }, { "epoch": 0.7989493791786055, "grad_norm": 4.509504795074463, "learning_rate": 5.139712778667319e-07, "loss": 1.4135, "step": 1673 }, { "epoch": 0.7994269340974212, "grad_norm": 4.303913593292236, "learning_rate": 5.116248852397626e-07, "loss": 1.7786, "step": 1674 }, { "epoch": 0.7999044890162369, "grad_norm": 4.177727699279785, "learning_rate": 5.092832501054993e-07, "loss": 1.5672, "step": 1675 }, { "epoch": 0.8003820439350525, "grad_norm": 5.086904525756836, "learning_rate": 5.069463780666637e-07, "loss": 1.2394, "step": 1676 }, { "epoch": 0.8008595988538681, "grad_norm": 4.692572593688965, "learning_rate": 5.046142747145807e-07, "loss": 1.2696, "step": 1677 }, { "epoch": 0.8013371537726839, "grad_norm": 4.475303649902344, "learning_rate": 5.022869456291659e-07, "loss": 2.0579, "step": 1678 }, { "epoch": 0.8018147086914995, "grad_norm": 4.484958171844482, "learning_rate": 4.999643963789122e-07, "loss": 2.0833, "step": 1679 }, { "epoch": 0.8022922636103151, "grad_norm": 4.41753625869751, "learning_rate": 4.976466325208737e-07, "loss": 1.3981, "step": 1680 }, { "epoch": 0.8027698185291309, "grad_norm": 4.374972343444824, "learning_rate": 4.953336596006566e-07, "loss": 1.3141, "step": 1681 }, { "epoch": 0.8032473734479465, "grad_norm": 4.444657802581787, "learning_rate": 
4.93025483152404e-07, "loss": 2.2327, "step": 1682 }, { "epoch": 0.8037249283667621, "grad_norm": 4.911617279052734, "learning_rate": 4.907221086987823e-07, "loss": 1.3881, "step": 1683 }, { "epoch": 0.8042024832855779, "grad_norm": 4.757000923156738, "learning_rate": 4.884235417509686e-07, "loss": 1.3617, "step": 1684 }, { "epoch": 0.8046800382043935, "grad_norm": 4.1789631843566895, "learning_rate": 4.861297878086366e-07, "loss": 2.0872, "step": 1685 }, { "epoch": 0.8051575931232091, "grad_norm": 4.237489700317383, "learning_rate": 4.838408523599464e-07, "loss": 1.5154, "step": 1686 }, { "epoch": 0.8056351480420249, "grad_norm": 4.859968185424805, "learning_rate": 4.815567408815258e-07, "loss": 0.9238, "step": 1687 }, { "epoch": 0.8061127029608405, "grad_norm": 5.114818572998047, "learning_rate": 4.792774588384624e-07, "loss": 1.2171, "step": 1688 }, { "epoch": 0.8065902578796562, "grad_norm": 4.10369873046875, "learning_rate": 4.770030116842889e-07, "loss": 0.7367, "step": 1689 }, { "epoch": 0.8070678127984718, "grad_norm": 4.849875450134277, "learning_rate": 4.747334048609689e-07, "loss": 1.6158, "step": 1690 }, { "epoch": 0.8075453677172875, "grad_norm": 4.691784858703613, "learning_rate": 4.7246864379888516e-07, "loss": 1.4494, "step": 1691 }, { "epoch": 0.8080229226361032, "grad_norm": 4.634671688079834, "learning_rate": 4.7020873391682654e-07, "loss": 2.0754, "step": 1692 }, { "epoch": 0.8085004775549188, "grad_norm": 4.440530300140381, "learning_rate": 4.679536806219739e-07, "loss": 1.396, "step": 1693 }, { "epoch": 0.8089780324737345, "grad_norm": 4.321910858154297, "learning_rate": 4.657034893098886e-07, "loss": 1.265, "step": 1694 }, { "epoch": 0.8094555873925502, "grad_norm": 5.179622173309326, "learning_rate": 4.6345816536449957e-07, "loss": 1.1783, "step": 1695 }, { "epoch": 0.8099331423113658, "grad_norm": 4.186709880828857, "learning_rate": 4.612177141580876e-07, "loss": 1.0064, "step": 1696 }, { "epoch": 0.8104106972301814, "grad_norm": 
3.633190155029297, "learning_rate": 4.589821410512771e-07, "loss": 0.7724, "step": 1697 }, { "epoch": 0.8108882521489972, "grad_norm": 4.224778652191162, "learning_rate": 4.5675145139301935e-07, "loss": 1.2286, "step": 1698 }, { "epoch": 0.8113658070678128, "grad_norm": 4.481283187866211, "learning_rate": 4.5452565052058224e-07, "loss": 1.2375, "step": 1699 }, { "epoch": 0.8118433619866284, "grad_norm": 4.748173236846924, "learning_rate": 4.5230474375953597e-07, "loss": 1.7692, "step": 1700 }, { "epoch": 0.8123209169054442, "grad_norm": 4.822946071624756, "learning_rate": 4.500887364237408e-07, "loss": 1.5156, "step": 1701 }, { "epoch": 0.8127984718242598, "grad_norm": 4.8066887855529785, "learning_rate": 4.4787763381533454e-07, "loss": 1.7394, "step": 1702 }, { "epoch": 0.8132760267430754, "grad_norm": 4.495865345001221, "learning_rate": 4.4567144122471957e-07, "loss": 1.3978, "step": 1703 }, { "epoch": 0.8137535816618912, "grad_norm": 4.854004383087158, "learning_rate": 4.434701639305511e-07, "loss": 1.5606, "step": 1704 }, { "epoch": 0.8142311365807068, "grad_norm": 4.745909214019775, "learning_rate": 4.412738071997216e-07, "loss": 1.9095, "step": 1705 }, { "epoch": 0.8147086914995224, "grad_norm": 4.480419158935547, "learning_rate": 4.390823762873525e-07, "loss": 1.7343, "step": 1706 }, { "epoch": 0.8151862464183381, "grad_norm": 4.937084197998047, "learning_rate": 4.368958764367784e-07, "loss": 1.0092, "step": 1707 }, { "epoch": 0.8156638013371538, "grad_norm": 4.269734859466553, "learning_rate": 4.347143128795361e-07, "loss": 1.5898, "step": 1708 }, { "epoch": 0.8161413562559694, "grad_norm": 4.5274739265441895, "learning_rate": 4.325376908353512e-07, "loss": 1.3496, "step": 1709 }, { "epoch": 0.8166189111747851, "grad_norm": 5.184662342071533, "learning_rate": 4.3036601551212613e-07, "loss": 1.304, "step": 1710 }, { "epoch": 0.8170964660936008, "grad_norm": 6.661823749542236, "learning_rate": 4.2819929210592746e-07, "loss": 1.5495, "step": 1711 }, { "epoch": 
0.8175740210124164, "grad_norm": 5.403781890869141, "learning_rate": 4.2603752580097356e-07, "loss": 0.9967, "step": 1712 }, { "epoch": 0.8180515759312321, "grad_norm": 4.371469974517822, "learning_rate": 4.238807217696228e-07, "loss": 1.3868, "step": 1713 }, { "epoch": 0.8185291308500477, "grad_norm": 4.482807159423828, "learning_rate": 4.217288851723586e-07, "loss": 2.0448, "step": 1714 }, { "epoch": 0.8190066857688634, "grad_norm": 3.889740228652954, "learning_rate": 4.1958202115778114e-07, "loss": 1.7113, "step": 1715 }, { "epoch": 0.8194842406876791, "grad_norm": 4.85420036315918, "learning_rate": 4.174401348625923e-07, "loss": 1.6651, "step": 1716 }, { "epoch": 0.8199617956064947, "grad_norm": 5.075958251953125, "learning_rate": 4.1530323141158346e-07, "loss": 1.434, "step": 1717 }, { "epoch": 0.8204393505253104, "grad_norm": 5.00612211227417, "learning_rate": 4.131713159176243e-07, "loss": 1.7957, "step": 1718 }, { "epoch": 0.8209169054441261, "grad_norm": 4.5813703536987305, "learning_rate": 4.1104439348164994e-07, "loss": 1.8906, "step": 1719 }, { "epoch": 0.8213944603629417, "grad_norm": 4.78256368637085, "learning_rate": 4.0892246919264885e-07, "loss": 1.1005, "step": 1720 }, { "epoch": 0.8218720152817574, "grad_norm": 3.7493603229522705, "learning_rate": 4.068055481276506e-07, "loss": 1.5754, "step": 1721 }, { "epoch": 0.8223495702005731, "grad_norm": 4.690057277679443, "learning_rate": 4.046936353517139e-07, "loss": 1.6016, "step": 1722 }, { "epoch": 0.8228271251193887, "grad_norm": 4.806550025939941, "learning_rate": 4.0258673591791475e-07, "loss": 1.1216, "step": 1723 }, { "epoch": 0.8233046800382043, "grad_norm": 5.5739359855651855, "learning_rate": 4.0048485486733233e-07, "loss": 1.3695, "step": 1724 }, { "epoch": 0.8237822349570201, "grad_norm": 4.634397029876709, "learning_rate": 3.983879972290405e-07, "loss": 1.8172, "step": 1725 }, { "epoch": 0.8242597898758357, "grad_norm": 5.844332218170166, "learning_rate": 3.962961680200927e-07, "loss": 
1.3054, "step": 1726 }, { "epoch": 0.8247373447946514, "grad_norm": 4.119032382965088, "learning_rate": 3.9420937224551174e-07, "loss": 1.2361, "step": 1727 }, { "epoch": 0.8252148997134671, "grad_norm": 4.602118492126465, "learning_rate": 3.921276148982764e-07, "loss": 1.2873, "step": 1728 }, { "epoch": 0.8256924546322827, "grad_norm": 4.272244453430176, "learning_rate": 3.900509009593112e-07, "loss": 2.0348, "step": 1729 }, { "epoch": 0.8261700095510984, "grad_norm": 4.121852397918701, "learning_rate": 3.87979235397474e-07, "loss": 1.2263, "step": 1730 }, { "epoch": 0.826647564469914, "grad_norm": 4.643444061279297, "learning_rate": 3.8591262316954066e-07, "loss": 1.4154, "step": 1731 }, { "epoch": 0.8271251193887297, "grad_norm": 4.750778675079346, "learning_rate": 3.838510692201994e-07, "loss": 1.106, "step": 1732 }, { "epoch": 0.8276026743075454, "grad_norm": 5.1018967628479, "learning_rate": 3.8179457848203397e-07, "loss": 1.5423, "step": 1733 }, { "epoch": 0.828080229226361, "grad_norm": 4.026932716369629, "learning_rate": 3.7974315587551464e-07, "loss": 1.0533, "step": 1734 }, { "epoch": 0.8285577841451767, "grad_norm": 5.242611885070801, "learning_rate": 3.7769680630898476e-07, "loss": 1.2657, "step": 1735 }, { "epoch": 0.8290353390639924, "grad_norm": 4.017070293426514, "learning_rate": 3.7565553467864975e-07, "loss": 1.0183, "step": 1736 }, { "epoch": 0.829512893982808, "grad_norm": 3.9290740489959717, "learning_rate": 3.7361934586856525e-07, "loss": 1.8112, "step": 1737 }, { "epoch": 0.8299904489016237, "grad_norm": 5.744692325592041, "learning_rate": 3.715882447506258e-07, "loss": 0.8312, "step": 1738 }, { "epoch": 0.8304680038204394, "grad_norm": 4.616058826446533, "learning_rate": 3.695622361845533e-07, "loss": 1.1687, "step": 1739 }, { "epoch": 0.830945558739255, "grad_norm": 4.0054521560668945, "learning_rate": 3.675413250178828e-07, "loss": 1.5116, "step": 1740 }, { "epoch": 0.8314231136580706, "grad_norm": 4.3560709953308105, "learning_rate": 
3.65525516085955e-07, "loss": 1.4915, "step": 1741 }, { "epoch": 0.8319006685768864, "grad_norm": 4.568493366241455, "learning_rate": 3.6351481421190244e-07, "loss": 0.8731, "step": 1742 }, { "epoch": 0.832378223495702, "grad_norm": 4.514004707336426, "learning_rate": 3.6150922420663814e-07, "loss": 0.8473, "step": 1743 }, { "epoch": 0.8328557784145176, "grad_norm": 4.377969741821289, "learning_rate": 3.5950875086884414e-07, "loss": 2.005, "step": 1744 }, { "epoch": 0.8333333333333334, "grad_norm": 5.062459468841553, "learning_rate": 3.575133989849597e-07, "loss": 1.598, "step": 1745 }, { "epoch": 0.833810888252149, "grad_norm": 4.346215724945068, "learning_rate": 3.555231733291706e-07, "loss": 1.4245, "step": 1746 }, { "epoch": 0.8342884431709646, "grad_norm": 4.002199649810791, "learning_rate": 3.5353807866339715e-07, "loss": 1.109, "step": 1747 }, { "epoch": 0.8347659980897804, "grad_norm": 4.35430383682251, "learning_rate": 3.5155811973728385e-07, "loss": 1.8954, "step": 1748 }, { "epoch": 0.835243553008596, "grad_norm": 3.863189697265625, "learning_rate": 3.495833012881852e-07, "loss": 1.3992, "step": 1749 }, { "epoch": 0.8357211079274116, "grad_norm": 4.0198588371276855, "learning_rate": 3.4761362804115797e-07, "loss": 1.6531, "step": 1750 }, { "epoch": 0.8361986628462273, "grad_norm": 5.7762322425842285, "learning_rate": 3.45649104708948e-07, "loss": 1.2657, "step": 1751 }, { "epoch": 0.836676217765043, "grad_norm": 4.907438278198242, "learning_rate": 3.4368973599197857e-07, "loss": 1.919, "step": 1752 }, { "epoch": 0.8371537726838586, "grad_norm": 4.101246356964111, "learning_rate": 3.4173552657834066e-07, "loss": 1.3019, "step": 1753 }, { "epoch": 0.8376313276026743, "grad_norm": 4.440704345703125, "learning_rate": 3.3978648114377955e-07, "loss": 2.0144, "step": 1754 }, { "epoch": 0.83810888252149, "grad_norm": 4.744243144989014, "learning_rate": 3.3784260435168637e-07, "loss": 1.8495, "step": 1755 }, { "epoch": 0.8385864374403056, "grad_norm": 
3.686429738998413, "learning_rate": 3.3590390085308457e-07, "loss": 0.7315, "step": 1756 }, { "epoch": 0.8390639923591213, "grad_norm": 4.477352142333984, "learning_rate": 3.3397037528662023e-07, "loss": 2.017, "step": 1757 }, { "epoch": 0.839541547277937, "grad_norm": 4.250000476837158, "learning_rate": 3.320420322785489e-07, "loss": 0.9453, "step": 1758 }, { "epoch": 0.8400191021967526, "grad_norm": 5.021346092224121, "learning_rate": 3.3011887644272823e-07, "loss": 1.4765, "step": 1759 }, { "epoch": 0.8404966571155683, "grad_norm": 4.067678928375244, "learning_rate": 3.2820091238060347e-07, "loss": 1.571, "step": 1760 }, { "epoch": 0.8409742120343839, "grad_norm": 5.339605331420898, "learning_rate": 3.262881446811983e-07, "loss": 0.9515, "step": 1761 }, { "epoch": 0.8414517669531996, "grad_norm": 4.612814426422119, "learning_rate": 3.243805779211029e-07, "loss": 1.8214, "step": 1762 }, { "epoch": 0.8419293218720153, "grad_norm": 5.120960712432861, "learning_rate": 3.2247821666446357e-07, "loss": 2.2018, "step": 1763 }, { "epoch": 0.8424068767908309, "grad_norm": 3.7073988914489746, "learning_rate": 3.205810654629715e-07, "loss": 1.216, "step": 1764 }, { "epoch": 0.8428844317096467, "grad_norm": 5.399084568023682, "learning_rate": 3.1868912885585275e-07, "loss": 1.4474, "step": 1765 }, { "epoch": 0.8433619866284623, "grad_norm": 5.052760601043701, "learning_rate": 3.168024113698556e-07, "loss": 2.0036, "step": 1766 }, { "epoch": 0.8438395415472779, "grad_norm": 5.426851749420166, "learning_rate": 3.149209175192419e-07, "loss": 2.4607, "step": 1767 }, { "epoch": 0.8443170964660937, "grad_norm": 5.112466812133789, "learning_rate": 3.130446518057734e-07, "loss": 1.413, "step": 1768 }, { "epoch": 0.8447946513849093, "grad_norm": 4.737201690673828, "learning_rate": 3.1117361871870475e-07, "loss": 0.8943, "step": 1769 }, { "epoch": 0.8452722063037249, "grad_norm": 5.149539947509766, "learning_rate": 3.0930782273476926e-07, "loss": 1.8904, "step": 1770 }, { "epoch": 
0.8457497612225406, "grad_norm": 5.280756950378418, "learning_rate": 3.0744726831817036e-07, "loss": 2.0399, "step": 1771 }, { "epoch": 0.8462273161413563, "grad_norm": 4.760005950927734, "learning_rate": 3.0559195992057064e-07, "loss": 1.2239, "step": 1772 }, { "epoch": 0.8467048710601719, "grad_norm": 4.715394496917725, "learning_rate": 3.0374190198108e-07, "loss": 1.1444, "step": 1773 }, { "epoch": 0.8471824259789876, "grad_norm": 3.589200735092163, "learning_rate": 3.0189709892624655e-07, "loss": 1.46, "step": 1774 }, { "epoch": 0.8476599808978033, "grad_norm": 4.947655200958252, "learning_rate": 3.000575551700438e-07, "loss": 1.2118, "step": 1775 }, { "epoch": 0.8481375358166189, "grad_norm": 4.110767364501953, "learning_rate": 2.9822327511386304e-07, "loss": 1.5332, "step": 1776 }, { "epoch": 0.8486150907354346, "grad_norm": 3.959704637527466, "learning_rate": 2.9639426314650083e-07, "loss": 2.0227, "step": 1777 }, { "epoch": 0.8490926456542502, "grad_norm": 4.084011554718018, "learning_rate": 2.94570523644149e-07, "loss": 1.1548, "step": 1778 }, { "epoch": 0.8495702005730659, "grad_norm": 5.003361701965332, "learning_rate": 2.927520609703846e-07, "loss": 0.7823, "step": 1779 }, { "epoch": 0.8500477554918816, "grad_norm": 4.0405964851379395, "learning_rate": 2.909388794761586e-07, "loss": 2.1382, "step": 1780 }, { "epoch": 0.8505253104106972, "grad_norm": 4.243015766143799, "learning_rate": 2.8913098349978577e-07, "loss": 1.544, "step": 1781 }, { "epoch": 0.8510028653295129, "grad_norm": 5.845482349395752, "learning_rate": 2.8732837736693526e-07, "loss": 1.2668, "step": 1782 }, { "epoch": 0.8514804202483286, "grad_norm": 5.443756580352783, "learning_rate": 2.855310653906193e-07, "loss": 1.0462, "step": 1783 }, { "epoch": 0.8519579751671442, "grad_norm": 4.132325649261475, "learning_rate": 2.837390518711816e-07, "loss": 0.7479, "step": 1784 }, { "epoch": 0.8524355300859598, "grad_norm": 3.8180594444274902, "learning_rate": 2.8195234109629015e-07, "loss": 
1.0983, "step": 1785 }, { "epoch": 0.8529130850047756, "grad_norm": 6.074723720550537, "learning_rate": 2.801709373409248e-07, "loss": 1.6576, "step": 1786 }, { "epoch": 0.8533906399235912, "grad_norm": 3.3697400093078613, "learning_rate": 2.7839484486736734e-07, "loss": 0.5057, "step": 1787 }, { "epoch": 0.8538681948424068, "grad_norm": 4.754940032958984, "learning_rate": 2.7662406792519194e-07, "loss": 2.0405, "step": 1788 }, { "epoch": 0.8543457497612226, "grad_norm": 4.465798377990723, "learning_rate": 2.7485861075125406e-07, "loss": 1.2672, "step": 1789 }, { "epoch": 0.8548233046800382, "grad_norm": 4.801855087280273, "learning_rate": 2.730984775696813e-07, "loss": 1.9327, "step": 1790 }, { "epoch": 0.8553008595988538, "grad_norm": 4.0686259269714355, "learning_rate": 2.7134367259186197e-07, "loss": 1.9658, "step": 1791 }, { "epoch": 0.8557784145176696, "grad_norm": 4.476724624633789, "learning_rate": 2.6959420001643727e-07, "loss": 1.0439, "step": 1792 }, { "epoch": 0.8562559694364852, "grad_norm": 3.6586802005767822, "learning_rate": 2.6785006402928804e-07, "loss": 1.4278, "step": 1793 }, { "epoch": 0.8567335243553008, "grad_norm": 5.031531810760498, "learning_rate": 2.661112688035275e-07, "loss": 1.829, "step": 1794 }, { "epoch": 0.8572110792741165, "grad_norm": 5.884171009063721, "learning_rate": 2.643778184994905e-07, "loss": 1.2942, "step": 1795 }, { "epoch": 0.8576886341929322, "grad_norm": 5.2067790031433105, "learning_rate": 2.626497172647227e-07, "loss": 1.7838, "step": 1796 }, { "epoch": 0.8581661891117478, "grad_norm": 4.9938483238220215, "learning_rate": 2.6092696923397163e-07, "loss": 1.4381, "step": 1797 }, { "epoch": 0.8586437440305635, "grad_norm": 4.701908588409424, "learning_rate": 2.592095785291765e-07, "loss": 1.3941, "step": 1798 }, { "epoch": 0.8591212989493792, "grad_norm": 4.805972576141357, "learning_rate": 2.5749754925945834e-07, "loss": 1.513, "step": 1799 }, { "epoch": 0.8595988538681948, "grad_norm": 4.475127220153809, 
"learning_rate": 2.5579088552111e-07, "loss": 1.2681, "step": 1800 }, { "epoch": 0.8600764087870105, "grad_norm": 4.898892879486084, "learning_rate": 2.540895913975866e-07, "loss": 1.6902, "step": 1801 }, { "epoch": 0.8605539637058262, "grad_norm": 4.951174736022949, "learning_rate": 2.523936709594951e-07, "loss": 1.2134, "step": 1802 }, { "epoch": 0.8610315186246418, "grad_norm": 4.680129051208496, "learning_rate": 2.5070312826458525e-07, "loss": 1.0096, "step": 1803 }, { "epoch": 0.8615090735434575, "grad_norm": 4.526504993438721, "learning_rate": 2.490179673577406e-07, "loss": 2.2063, "step": 1804 }, { "epoch": 0.8619866284622731, "grad_norm": 3.7660071849823, "learning_rate": 2.473381922709672e-07, "loss": 1.0085, "step": 1805 }, { "epoch": 0.8624641833810889, "grad_norm": 4.789629936218262, "learning_rate": 2.456638070233844e-07, "loss": 1.2357, "step": 1806 }, { "epoch": 0.8629417382999045, "grad_norm": 4.6714606285095215, "learning_rate": 2.4399481562121656e-07, "loss": 1.176, "step": 1807 }, { "epoch": 0.8634192932187201, "grad_norm": 5.056303024291992, "learning_rate": 2.423312220577814e-07, "loss": 1.4109, "step": 1808 }, { "epoch": 0.8638968481375359, "grad_norm": 4.395562171936035, "learning_rate": 2.406730303134819e-07, "loss": 0.9802, "step": 1809 }, { "epoch": 0.8643744030563515, "grad_norm": 4.336032867431641, "learning_rate": 2.390202443557968e-07, "loss": 1.8814, "step": 1810 }, { "epoch": 0.8648519579751671, "grad_norm": 4.696137428283691, "learning_rate": 2.3737286813927007e-07, "loss": 1.5936, "step": 1811 }, { "epoch": 0.8653295128939829, "grad_norm": 4.970513343811035, "learning_rate": 2.3573090560550154e-07, "loss": 1.0399, "step": 1812 }, { "epoch": 0.8658070678127985, "grad_norm": 4.015129089355469, "learning_rate": 2.3409436068313944e-07, "loss": 1.561, "step": 1813 }, { "epoch": 0.8662846227316141, "grad_norm": 4.777578353881836, "learning_rate": 2.3246323728786824e-07, "loss": 1.5662, "step": 1814 }, { "epoch": 0.8667621776504298, 
"grad_norm": 4.743537425994873, "learning_rate": 2.3083753932240106e-07, "loss": 1.8866, "step": 1815 }, { "epoch": 0.8672397325692455, "grad_norm": 3.9787492752075195, "learning_rate": 2.2921727067647032e-07, "loss": 1.2108, "step": 1816 }, { "epoch": 0.8677172874880611, "grad_norm": 5.038753986358643, "learning_rate": 2.2760243522681718e-07, "loss": 1.209, "step": 1817 }, { "epoch": 0.8681948424068768, "grad_norm": 5.198272228240967, "learning_rate": 2.2599303683718376e-07, "loss": 1.3369, "step": 1818 }, { "epoch": 0.8686723973256925, "grad_norm": 4.993763446807861, "learning_rate": 2.2438907935830228e-07, "loss": 1.0487, "step": 1819 }, { "epoch": 0.8691499522445081, "grad_norm": 4.0358991622924805, "learning_rate": 2.2279056662788706e-07, "loss": 0.6877, "step": 1820 }, { "epoch": 0.8696275071633238, "grad_norm": 4.10168981552124, "learning_rate": 2.211975024706256e-07, "loss": 1.1834, "step": 1821 }, { "epoch": 0.8701050620821394, "grad_norm": 4.83210563659668, "learning_rate": 2.1960989069816835e-07, "loss": 0.9045, "step": 1822 }, { "epoch": 0.8705826170009551, "grad_norm": 4.804450035095215, "learning_rate": 2.1802773510912033e-07, "loss": 1.1909, "step": 1823 }, { "epoch": 0.8710601719197708, "grad_norm": 3.981300115585327, "learning_rate": 2.164510394890315e-07, "loss": 1.3505, "step": 1824 }, { "epoch": 0.8715377268385864, "grad_norm": 5.746732711791992, "learning_rate": 2.1487980761038834e-07, "loss": 1.0668, "step": 1825 }, { "epoch": 0.8720152817574021, "grad_norm": 4.432000637054443, "learning_rate": 2.133140432326039e-07, "loss": 1.051, "step": 1826 }, { "epoch": 0.8724928366762178, "grad_norm": 4.487666606903076, "learning_rate": 2.1175375010201088e-07, "loss": 1.4609, "step": 1827 }, { "epoch": 0.8729703915950334, "grad_norm": 4.722477436065674, "learning_rate": 2.101989319518488e-07, "loss": 1.925, "step": 1828 }, { "epoch": 0.873447946513849, "grad_norm": 4.714040279388428, "learning_rate": 2.0864959250225942e-07, "loss": 1.3879, "step": 1829 
}, { "epoch": 0.8739255014326648, "grad_norm": 4.175288677215576, "learning_rate": 2.0710573546027513e-07, "loss": 1.4922, "step": 1830 }, { "epoch": 0.8744030563514804, "grad_norm": 5.325249671936035, "learning_rate": 2.0556736451981086e-07, "loss": 1.2637, "step": 1831 }, { "epoch": 0.874880611270296, "grad_norm": 5.063163757324219, "learning_rate": 2.0403448336165532e-07, "loss": 1.4604, "step": 1832 }, { "epoch": 0.8753581661891118, "grad_norm": 4.433059215545654, "learning_rate": 2.0250709565346194e-07, "loss": 2.2617, "step": 1833 }, { "epoch": 0.8758357211079274, "grad_norm": 4.04313850402832, "learning_rate": 2.0098520504974045e-07, "loss": 0.8336, "step": 1834 }, { "epoch": 0.876313276026743, "grad_norm": 4.104918003082275, "learning_rate": 1.994688151918478e-07, "loss": 1.2973, "step": 1835 }, { "epoch": 0.8767908309455588, "grad_norm": 4.765949249267578, "learning_rate": 1.979579297079795e-07, "loss": 2.0416, "step": 1836 }, { "epoch": 0.8772683858643744, "grad_norm": 5.252695083618164, "learning_rate": 1.964525522131605e-07, "loss": 2.1165, "step": 1837 }, { "epoch": 0.87774594078319, "grad_norm": 3.980583429336548, "learning_rate": 1.9495268630923798e-07, "loss": 1.0635, "step": 1838 }, { "epoch": 0.8782234957020058, "grad_norm": 4.944131851196289, "learning_rate": 1.9345833558487126e-07, "loss": 1.8253, "step": 1839 }, { "epoch": 0.8787010506208214, "grad_norm": 4.8832902908325195, "learning_rate": 1.919695036155239e-07, "loss": 1.5908, "step": 1840 }, { "epoch": 0.879178605539637, "grad_norm": 3.6236624717712402, "learning_rate": 1.9048619396345502e-07, "loss": 1.1423, "step": 1841 }, { "epoch": 0.8796561604584527, "grad_norm": 4.329488277435303, "learning_rate": 1.8900841017771066e-07, "loss": 1.2654, "step": 1842 }, { "epoch": 0.8801337153772684, "grad_norm": 4.656091213226318, "learning_rate": 1.8753615579411522e-07, "loss": 1.7989, "step": 1843 }, { "epoch": 0.8806112702960841, "grad_norm": 4.736489295959473, "learning_rate": 
1.8606943433526343e-07, "loss": 1.1712, "step": 1844 }, { "epoch": 0.8810888252148997, "grad_norm": 4.346367835998535, "learning_rate": 1.8460824931051197e-07, "loss": 2.0078, "step": 1845 }, { "epoch": 0.8815663801337154, "grad_norm": 5.270775318145752, "learning_rate": 1.8315260421596925e-07, "loss": 1.2601, "step": 1846 }, { "epoch": 0.8820439350525311, "grad_norm": 4.949428081512451, "learning_rate": 1.8170250253449067e-07, "loss": 1.3144, "step": 1847 }, { "epoch": 0.8825214899713467, "grad_norm": 3.914212942123413, "learning_rate": 1.802579477356664e-07, "loss": 1.1926, "step": 1848 }, { "epoch": 0.8829990448901623, "grad_norm": 5.574470043182373, "learning_rate": 1.7881894327581612e-07, "loss": 1.0381, "step": 1849 }, { "epoch": 0.8834765998089781, "grad_norm": 5.225996494293213, "learning_rate": 1.7738549259797843e-07, "loss": 1.7549, "step": 1850 }, { "epoch": 0.8839541547277937, "grad_norm": 4.652002334594727, "learning_rate": 1.7595759913190457e-07, "loss": 1.4756, "step": 1851 }, { "epoch": 0.8844317096466093, "grad_norm": 5.426699638366699, "learning_rate": 1.7453526629404831e-07, "loss": 2.0966, "step": 1852 }, { "epoch": 0.8849092645654251, "grad_norm": 6.046806335449219, "learning_rate": 1.7311849748755993e-07, "loss": 1.0931, "step": 1853 }, { "epoch": 0.8853868194842407, "grad_norm": 5.215230464935303, "learning_rate": 1.7170729610227616e-07, "loss": 1.1477, "step": 1854 }, { "epoch": 0.8858643744030563, "grad_norm": 4.596120834350586, "learning_rate": 1.70301665514713e-07, "loss": 1.0148, "step": 1855 }, { "epoch": 0.8863419293218721, "grad_norm": 3.8564343452453613, "learning_rate": 1.689016090880566e-07, "loss": 1.3548, "step": 1856 }, { "epoch": 0.8868194842406877, "grad_norm": 4.367040157318115, "learning_rate": 1.6750713017215787e-07, "loss": 1.1242, "step": 1857 }, { "epoch": 0.8872970391595033, "grad_norm": 5.239226341247559, "learning_rate": 1.661182321035207e-07, "loss": 0.8537, "step": 1858 }, { "epoch": 0.887774594078319, "grad_norm": 
5.1966633796691895, "learning_rate": 1.647349182052982e-07, "loss": 1.277, "step": 1859 }, { "epoch": 0.8882521489971347, "grad_norm": 5.194746017456055, "learning_rate": 1.6335719178728033e-07, "loss": 1.9275, "step": 1860 }, { "epoch": 0.8887297039159503, "grad_norm": 4.254364013671875, "learning_rate": 1.6198505614588967e-07, "loss": 1.9244, "step": 1861 }, { "epoch": 0.889207258834766, "grad_norm": 5.427789688110352, "learning_rate": 1.6061851456417117e-07, "loss": 1.9553, "step": 1862 }, { "epoch": 0.8896848137535817, "grad_norm": 5.022941589355469, "learning_rate": 1.5925757031178463e-07, "loss": 1.9402, "step": 1863 }, { "epoch": 0.8901623686723973, "grad_norm": 5.901017189025879, "learning_rate": 1.5790222664499895e-07, "loss": 1.5916, "step": 1864 }, { "epoch": 0.890639923591213, "grad_norm": 5.962346076965332, "learning_rate": 1.5655248680668173e-07, "loss": 1.6842, "step": 1865 }, { "epoch": 0.8911174785100286, "grad_norm": 4.637740612030029, "learning_rate": 1.5520835402629247e-07, "loss": 1.7114, "step": 1866 }, { "epoch": 0.8915950334288443, "grad_norm": 4.626884937286377, "learning_rate": 1.5386983151987527e-07, "loss": 1.813, "step": 1867 }, { "epoch": 0.89207258834766, "grad_norm": 4.88906717300415, "learning_rate": 1.525369224900511e-07, "loss": 1.0962, "step": 1868 }, { "epoch": 0.8925501432664756, "grad_norm": 5.213235855102539, "learning_rate": 1.512096301260088e-07, "loss": 1.9853, "step": 1869 }, { "epoch": 0.8930276981852913, "grad_norm": 4.625570297241211, "learning_rate": 1.4988795760349978e-07, "loss": 1.5201, "step": 1870 }, { "epoch": 0.893505253104107, "grad_norm": 6.049704551696777, "learning_rate": 1.485719080848283e-07, "loss": 1.6123, "step": 1871 }, { "epoch": 0.8939828080229226, "grad_norm": 4.393548488616943, "learning_rate": 1.4726148471884443e-07, "loss": 1.2059, "step": 1872 }, { "epoch": 0.8944603629417383, "grad_norm": 5.259336471557617, "learning_rate": 1.4595669064093737e-07, "loss": 1.8937, "step": 1873 }, { "epoch": 
0.894937917860554, "grad_norm": 6.023671627044678, "learning_rate": 1.446575289730273e-07, "loss": 1.1309, "step": 1874 }, { "epoch": 0.8954154727793696, "grad_norm": 3.925832986831665, "learning_rate": 1.4336400282355832e-07, "loss": 1.1099, "step": 1875 }, { "epoch": 0.8958930276981852, "grad_norm": 4.363374710083008, "learning_rate": 1.4207611528749e-07, "loss": 1.1467, "step": 1876 }, { "epoch": 0.896370582617001, "grad_norm": 5.333053112030029, "learning_rate": 1.4079386944629069e-07, "loss": 1.7233, "step": 1877 }, { "epoch": 0.8968481375358166, "grad_norm": 5.328066825866699, "learning_rate": 1.3951726836793106e-07, "loss": 1.0692, "step": 1878 }, { "epoch": 0.8973256924546322, "grad_norm": 4.231527805328369, "learning_rate": 1.3824631510687468e-07, "loss": 1.2641, "step": 1879 }, { "epoch": 0.897803247373448, "grad_norm": 3.790458917617798, "learning_rate": 1.369810127040727e-07, "loss": 1.0317, "step": 1880 }, { "epoch": 0.8982808022922636, "grad_norm": 4.294885158538818, "learning_rate": 1.3572136418695507e-07, "loss": 1.2742, "step": 1881 }, { "epoch": 0.8987583572110793, "grad_norm": 4.326846599578857, "learning_rate": 1.3446737256942426e-07, "loss": 0.6941, "step": 1882 }, { "epoch": 0.899235912129895, "grad_norm": 4.579689979553223, "learning_rate": 1.3321904085184773e-07, "loss": 0.7801, "step": 1883 }, { "epoch": 0.8997134670487106, "grad_norm": 4.973845958709717, "learning_rate": 1.319763720210507e-07, "loss": 1.4939, "step": 1884 }, { "epoch": 0.9001910219675263, "grad_norm": 4.610194683074951, "learning_rate": 1.3073936905030947e-07, "loss": 2.2352, "step": 1885 }, { "epoch": 0.9006685768863419, "grad_norm": 4.39163064956665, "learning_rate": 1.2950803489934328e-07, "loss": 2.133, "step": 1886 }, { "epoch": 0.9011461318051576, "grad_norm": 4.580910682678223, "learning_rate": 1.2828237251430785e-07, "loss": 0.8098, "step": 1887 }, { "epoch": 0.9016236867239733, "grad_norm": 4.559267520904541, "learning_rate": 1.270623848277891e-07, "loss": 0.7889, 
"step": 1888 }, { "epoch": 0.9021012416427889, "grad_norm": 6.220565319061279, "learning_rate": 1.2584807475879478e-07, "loss": 1.1253, "step": 1889 }, { "epoch": 0.9025787965616046, "grad_norm": 4.314452648162842, "learning_rate": 1.246394452127478e-07, "loss": 1.5035, "step": 1890 }, { "epoch": 0.9030563514804203, "grad_norm": 5.102320671081543, "learning_rate": 1.2343649908148015e-07, "loss": 1.0031, "step": 1891 }, { "epoch": 0.9035339063992359, "grad_norm": 4.642854690551758, "learning_rate": 1.2223923924322478e-07, "loss": 1.2687, "step": 1892 }, { "epoch": 0.9040114613180515, "grad_norm": 5.470934867858887, "learning_rate": 1.2104766856261024e-07, "loss": 1.5153, "step": 1893 }, { "epoch": 0.9044890162368673, "grad_norm": 4.981511116027832, "learning_rate": 1.1986178989065178e-07, "loss": 1.6634, "step": 1894 }, { "epoch": 0.9049665711556829, "grad_norm": 5.087421894073486, "learning_rate": 1.1868160606474638e-07, "loss": 1.7137, "step": 1895 }, { "epoch": 0.9054441260744985, "grad_norm": 4.600591659545898, "learning_rate": 1.1750711990866525e-07, "loss": 1.4144, "step": 1896 }, { "epoch": 0.9059216809933143, "grad_norm": 4.7324442863464355, "learning_rate": 1.1633833423254637e-07, "loss": 1.2583, "step": 1897 }, { "epoch": 0.9063992359121299, "grad_norm": 4.637341022491455, "learning_rate": 1.1517525183288913e-07, "loss": 1.088, "step": 1898 }, { "epoch": 0.9068767908309455, "grad_norm": 4.929794788360596, "learning_rate": 1.1401787549254695e-07, "loss": 1.822, "step": 1899 }, { "epoch": 0.9073543457497613, "grad_norm": 5.193170070648193, "learning_rate": 1.1286620798071995e-07, "loss": 1.0602, "step": 1900 }, { "epoch": 0.9078319006685769, "grad_norm": 4.935697078704834, "learning_rate": 1.1172025205294951e-07, "loss": 1.3836, "step": 1901 }, { "epoch": 0.9083094555873925, "grad_norm": 3.6956772804260254, "learning_rate": 1.1058001045111122e-07, "loss": 0.7405, "step": 1902 }, { "epoch": 0.9087870105062082, "grad_norm": 4.060792922973633, "learning_rate": 
1.0944548590340831e-07, "loss": 1.2824, "step": 1903 }, { "epoch": 0.9092645654250239, "grad_norm": 4.516481399536133, "learning_rate": 1.0831668112436495e-07, "loss": 1.5684, "step": 1904 }, { "epoch": 0.9097421203438395, "grad_norm": 4.162242412567139, "learning_rate": 1.0719359881481955e-07, "loss": 1.5459, "step": 1905 }, { "epoch": 0.9102196752626552, "grad_norm": 5.568398952484131, "learning_rate": 1.060762416619196e-07, "loss": 1.2594, "step": 1906 }, { "epoch": 0.9106972301814709, "grad_norm": 4.66248893737793, "learning_rate": 1.0496461233911265e-07, "loss": 1.1952, "step": 1907 }, { "epoch": 0.9111747851002865, "grad_norm": 4.602356910705566, "learning_rate": 1.038587135061428e-07, "loss": 2.0775, "step": 1908 }, { "epoch": 0.9116523400191022, "grad_norm": 4.780706405639648, "learning_rate": 1.0275854780904315e-07, "loss": 1.2421, "step": 1909 }, { "epoch": 0.9121298949379179, "grad_norm": 4.321216106414795, "learning_rate": 1.0166411788012892e-07, "loss": 1.774, "step": 1910 }, { "epoch": 0.9126074498567335, "grad_norm": 5.077922821044922, "learning_rate": 1.0057542633799155e-07, "loss": 1.8508, "step": 1911 }, { "epoch": 0.9130850047755492, "grad_norm": 4.731849670410156, "learning_rate": 9.949247578749294e-08, "loss": 1.4742, "step": 1912 }, { "epoch": 0.9135625596943648, "grad_norm": 5.151985168457031, "learning_rate": 9.841526881975816e-08, "loss": 0.9405, "step": 1913 }, { "epoch": 0.9140401146131805, "grad_norm": 4.161082744598389, "learning_rate": 9.734380801217053e-08, "loss": 1.5469, "step": 1914 }, { "epoch": 0.9145176695319962, "grad_norm": 3.7338144779205322, "learning_rate": 9.627809592836462e-08, "loss": 1.1333, "step": 1915 }, { "epoch": 0.9149952244508118, "grad_norm": 4.189218044281006, "learning_rate": 9.521813511821992e-08, "loss": 1.9014, "step": 1916 }, { "epoch": 0.9154727793696275, "grad_norm": 4.221567153930664, "learning_rate": 9.416392811785524e-08, "loss": 2.128, "step": 1917 }, { "epoch": 0.9159503342884432, "grad_norm": 
4.393245220184326, "learning_rate": 9.311547744962291e-08, "loss": 1.5716, "step": 1918 }, { "epoch": 0.9164278892072588, "grad_norm": 4.907567501068115, "learning_rate": 9.207278562210236e-08, "loss": 1.3955, "step": 1919 }, { "epoch": 0.9169054441260746, "grad_norm": 3.8443729877471924, "learning_rate": 9.103585513009327e-08, "loss": 1.776, "step": 1920 }, { "epoch": 0.9173829990448902, "grad_norm": 4.198541641235352, "learning_rate": 9.000468845461158e-08, "loss": 0.9206, "step": 1921 }, { "epoch": 0.9178605539637058, "grad_norm": 4.565410137176514, "learning_rate": 8.897928806288176e-08, "loss": 0.8683, "step": 1922 }, { "epoch": 0.9183381088825215, "grad_norm": 4.908368110656738, "learning_rate": 8.795965640833126e-08, "loss": 1.9634, "step": 1923 }, { "epoch": 0.9188156638013372, "grad_norm": 4.630736827850342, "learning_rate": 8.694579593058583e-08, "loss": 0.8907, "step": 1924 }, { "epoch": 0.9192932187201528, "grad_norm": 4.463395118713379, "learning_rate": 8.593770905546195e-08, "loss": 1.4228, "step": 1925 }, { "epoch": 0.9197707736389685, "grad_norm": 5.2711567878723145, "learning_rate": 8.493539819496243e-08, "loss": 1.5696, "step": 1926 }, { "epoch": 0.9202483285577842, "grad_norm": 5.236730098724365, "learning_rate": 8.393886574726979e-08, "loss": 2.0057, "step": 1927 }, { "epoch": 0.9207258834765998, "grad_norm": 4.793929100036621, "learning_rate": 8.294811409674086e-08, "loss": 1.5279, "step": 1928 }, { "epoch": 0.9212034383954155, "grad_norm": 4.622511863708496, "learning_rate": 8.196314561390112e-08, "loss": 1.0851, "step": 1929 }, { "epoch": 0.9216809933142311, "grad_norm": 4.6666154861450195, "learning_rate": 8.098396265543873e-08, "loss": 1.2831, "step": 1930 }, { "epoch": 0.9221585482330468, "grad_norm": 4.627532482147217, "learning_rate": 8.001056756419933e-08, "loss": 1.3361, "step": 1931 }, { "epoch": 0.9226361031518625, "grad_norm": 5.148006916046143, "learning_rate": 7.90429626691805e-08, "loss": 1.5478, "step": 1932 }, { "epoch": 
0.9231136580706781, "grad_norm": 6.060977935791016, "learning_rate": 7.808115028552498e-08, "loss": 1.3051, "step": 1933 }, { "epoch": 0.9235912129894938, "grad_norm": 5.027886867523193, "learning_rate": 7.712513271451672e-08, "loss": 2.0222, "step": 1934 }, { "epoch": 0.9240687679083095, "grad_norm": 5.360738277435303, "learning_rate": 7.617491224357427e-08, "loss": 1.0222, "step": 1935 }, { "epoch": 0.9245463228271251, "grad_norm": 5.642928600311279, "learning_rate": 7.523049114624647e-08, "loss": 1.0464, "step": 1936 }, { "epoch": 0.9250238777459407, "grad_norm": 4.5053791999816895, "learning_rate": 7.429187168220526e-08, "loss": 1.5674, "step": 1937 }, { "epoch": 0.9255014326647565, "grad_norm": 4.781852722167969, "learning_rate": 7.33590560972422e-08, "loss": 1.3417, "step": 1938 }, { "epoch": 0.9259789875835721, "grad_norm": 4.094886779785156, "learning_rate": 7.24320466232617e-08, "loss": 1.6142, "step": 1939 }, { "epoch": 0.9264565425023877, "grad_norm": 5.062748908996582, "learning_rate": 7.151084547827619e-08, "loss": 1.6717, "step": 1940 }, { "epoch": 0.9269340974212035, "grad_norm": 4.518438816070557, "learning_rate": 7.059545486640063e-08, "loss": 0.889, "step": 1941 }, { "epoch": 0.9274116523400191, "grad_norm": 5.465991973876953, "learning_rate": 6.968587697784801e-08, "loss": 1.503, "step": 1942 }, { "epoch": 0.9278892072588347, "grad_norm": 4.978146553039551, "learning_rate": 6.878211398892276e-08, "loss": 1.5685, "step": 1943 }, { "epoch": 0.9283667621776505, "grad_norm": 3.893017530441284, "learning_rate": 6.788416806201626e-08, "loss": 1.4475, "step": 1944 }, { "epoch": 0.9288443170964661, "grad_norm": 4.326538562774658, "learning_rate": 6.699204134560266e-08, "loss": 1.905, "step": 1945 }, { "epoch": 0.9293218720152817, "grad_norm": 4.6111979484558105, "learning_rate": 6.610573597423175e-08, "loss": 1.2542, "step": 1946 }, { "epoch": 0.9297994269340975, "grad_norm": 5.056522369384766, "learning_rate": 6.522525406852525e-08, "loss": 1.5773, 
"step": 1947 }, { "epoch": 0.9302769818529131, "grad_norm": 5.050508975982666, "learning_rate": 6.435059773517133e-08, "loss": 0.964, "step": 1948 }, { "epoch": 0.9307545367717287, "grad_norm": 4.646449565887451, "learning_rate": 6.348176906691928e-08, "loss": 1.776, "step": 1949 }, { "epoch": 0.9312320916905444, "grad_norm": 5.053176403045654, "learning_rate": 6.261877014257567e-08, "loss": 1.6807, "step": 1950 }, { "epoch": 0.9317096466093601, "grad_norm": 4.885444641113281, "learning_rate": 6.176160302699713e-08, "loss": 1.6078, "step": 1951 }, { "epoch": 0.9321872015281757, "grad_norm": 4.299678802490234, "learning_rate": 6.091026977108783e-08, "loss": 1.8217, "step": 1952 }, { "epoch": 0.9326647564469914, "grad_norm": 4.279200553894043, "learning_rate": 6.006477241179365e-08, "loss": 1.1873, "step": 1953 }, { "epoch": 0.933142311365807, "grad_norm": 4.553717136383057, "learning_rate": 5.9225112972096375e-08, "loss": 1.2049, "step": 1954 }, { "epoch": 0.9336198662846227, "grad_norm": 4.365349292755127, "learning_rate": 5.8391293461010355e-08, "loss": 0.9472, "step": 1955 }, { "epoch": 0.9340974212034384, "grad_norm": 3.930738687515259, "learning_rate": 5.7563315873576665e-08, "loss": 1.1603, "step": 1956 }, { "epoch": 0.934574976122254, "grad_norm": 4.760618209838867, "learning_rate": 5.674118219085867e-08, "loss": 1.1817, "step": 1957 }, { "epoch": 0.9350525310410697, "grad_norm": 5.481564044952393, "learning_rate": 5.5924894379937865e-08, "loss": 2.0948, "step": 1958 }, { "epoch": 0.9355300859598854, "grad_norm": 4.832895278930664, "learning_rate": 5.511445439390778e-08, "loss": 1.807, "step": 1959 }, { "epoch": 0.936007640878701, "grad_norm": 4.487087726593018, "learning_rate": 5.430986417187062e-08, "loss": 1.8288, "step": 1960 }, { "epoch": 0.9364851957975168, "grad_norm": 4.5076141357421875, "learning_rate": 5.351112563893174e-08, "loss": 1.2819, "step": 1961 }, { "epoch": 0.9369627507163324, "grad_norm": 5.209587097167969, "learning_rate": 
5.27182407061963e-08, "loss": 0.9936, "step": 1962 }, { "epoch": 0.937440305635148, "grad_norm": 5.170856475830078, "learning_rate": 5.193121127076234e-08, "loss": 1.0956, "step": 1963 }, { "epoch": 0.9379178605539638, "grad_norm": 4.041329383850098, "learning_rate": 5.1150039215719374e-08, "loss": 1.0619, "step": 1964 }, { "epoch": 0.9383954154727794, "grad_norm": 4.888769626617432, "learning_rate": 5.0374726410141186e-08, "loss": 1.4852, "step": 1965 }, { "epoch": 0.938872970391595, "grad_norm": 5.521396160125732, "learning_rate": 4.9605274709082774e-08, "loss": 1.2681, "step": 1966 }, { "epoch": 0.9393505253104107, "grad_norm": 4.531164646148682, "learning_rate": 4.8841685953575356e-08, "loss": 1.3463, "step": 1967 }, { "epoch": 0.9398280802292264, "grad_norm": 5.33794641494751, "learning_rate": 4.8083961970622485e-08, "loss": 1.3276, "step": 1968 }, { "epoch": 0.940305635148042, "grad_norm": 4.757003307342529, "learning_rate": 4.7332104573194485e-08, "loss": 1.4977, "step": 1969 }, { "epoch": 0.9407831900668577, "grad_norm": 4.748263359069824, "learning_rate": 4.658611556022624e-08, "loss": 1.2734, "step": 1970 }, { "epoch": 0.9412607449856734, "grad_norm": 5.040325164794922, "learning_rate": 4.584599671661055e-08, "loss": 0.7417, "step": 1971 }, { "epoch": 0.941738299904489, "grad_norm": 5.456332206726074, "learning_rate": 4.5111749813195606e-08, "loss": 1.1551, "step": 1972 }, { "epoch": 0.9422158548233047, "grad_norm": 4.385427951812744, "learning_rate": 4.4383376606779995e-08, "loss": 1.5596, "step": 1973 }, { "epoch": 0.9426934097421203, "grad_norm": 5.934171676635742, "learning_rate": 4.366087884010828e-08, "loss": 0.9657, "step": 1974 }, { "epoch": 0.943170964660936, "grad_norm": 3.6707611083984375, "learning_rate": 4.2944258241867095e-08, "loss": 1.7616, "step": 1975 }, { "epoch": 0.9436485195797517, "grad_norm": 4.454045295715332, "learning_rate": 4.2233516526681286e-08, "loss": 1.4641, "step": 1976 }, { "epoch": 0.9441260744985673, "grad_norm": 
4.442991733551025, "learning_rate": 4.152865539510997e-08, "loss": 1.5038, "step": 1977 }, { "epoch": 0.944603629417383, "grad_norm": 5.286077499389648, "learning_rate": 4.082967653364106e-08, "loss": 1.7087, "step": 1978 }, { "epoch": 0.9450811843361987, "grad_norm": 3.9384427070617676, "learning_rate": 4.01365816146887e-08, "loss": 1.5981, "step": 1979 }, { "epoch": 0.9455587392550143, "grad_norm": 4.8193159103393555, "learning_rate": 3.9449372296589405e-08, "loss": 2.0771, "step": 1980 }, { "epoch": 0.94603629417383, "grad_norm": 4.080482006072998, "learning_rate": 3.876805022359681e-08, "loss": 1.1512, "step": 1981 }, { "epoch": 0.9465138490926457, "grad_norm": 5.827645301818848, "learning_rate": 3.8092617025878295e-08, "loss": 1.2662, "step": 1982 }, { "epoch": 0.9469914040114613, "grad_norm": 4.895304203033447, "learning_rate": 3.742307431951198e-08, "loss": 1.2981, "step": 1983 }, { "epoch": 0.9474689589302769, "grad_norm": 4.78440523147583, "learning_rate": 3.675942370648084e-08, "loss": 1.259, "step": 1984 }, { "epoch": 0.9479465138490927, "grad_norm": 4.6514692306518555, "learning_rate": 3.610166677467136e-08, "loss": 1.5346, "step": 1985 }, { "epoch": 0.9484240687679083, "grad_norm": 4.771570205688477, "learning_rate": 3.5449805097868275e-08, "loss": 1.5018, "step": 1986 }, { "epoch": 0.9489016236867239, "grad_norm": 5.133676052093506, "learning_rate": 3.4803840235749784e-08, "loss": 1.2655, "step": 1987 }, { "epoch": 0.9493791786055397, "grad_norm": 5.038180351257324, "learning_rate": 3.4163773733886794e-08, "loss": 1.4964, "step": 1988 }, { "epoch": 0.9498567335243553, "grad_norm": 4.514855861663818, "learning_rate": 3.352960712373621e-08, "loss": 1.6508, "step": 1989 }, { "epoch": 0.9503342884431709, "grad_norm": 4.957178115844727, "learning_rate": 3.290134192263927e-08, "loss": 1.156, "step": 1990 }, { "epoch": 0.9508118433619867, "grad_norm": 3.8762965202331543, "learning_rate": 3.227897963381688e-08, "loss": 1.1921, "step": 1991 }, { "epoch": 
0.9512893982808023, "grad_norm": 4.4491424560546875, "learning_rate": 3.166252174636647e-08, "loss": 1.1539, "step": 1992 }, { "epoch": 0.9517669531996179, "grad_norm": 5.313320636749268, "learning_rate": 3.1051969735257645e-08, "loss": 1.4086, "step": 1993 }, { "epoch": 0.9522445081184336, "grad_norm": 4.239004611968994, "learning_rate": 3.0447325061330746e-08, "loss": 1.3864, "step": 1994 }, { "epoch": 0.9527220630372493, "grad_norm": 5.58085823059082, "learning_rate": 2.984858917128991e-08, "loss": 1.3724, "step": 1995 }, { "epoch": 0.9531996179560649, "grad_norm": 4.391365051269531, "learning_rate": 2.9255763497703373e-08, "loss": 0.941, "step": 1996 }, { "epoch": 0.9536771728748806, "grad_norm": 5.233076572418213, "learning_rate": 2.866884945899734e-08, "loss": 1.7239, "step": 1997 }, { "epoch": 0.9541547277936963, "grad_norm": 4.5262041091918945, "learning_rate": 2.8087848459453505e-08, "loss": 1.8568, "step": 1998 }, { "epoch": 0.954632282712512, "grad_norm": 4.288240909576416, "learning_rate": 2.7512761889206542e-08, "loss": 1.4095, "step": 1999 }, { "epoch": 0.9551098376313276, "grad_norm": 4.01613712310791, "learning_rate": 2.6943591124238834e-08, "loss": 1.9149, "step": 2000 }, { "epoch": 0.9555873925501432, "grad_norm": 3.655914545059204, "learning_rate": 2.6380337526379086e-08, "loss": 1.6939, "step": 2001 }, { "epoch": 0.956064947468959, "grad_norm": 3.6653099060058594, "learning_rate": 2.5823002443298163e-08, "loss": 1.668, "step": 2002 }, { "epoch": 0.9565425023877746, "grad_norm": 5.357335567474365, "learning_rate": 2.527158720850548e-08, "loss": 1.7178, "step": 2003 }, { "epoch": 0.9570200573065902, "grad_norm": 4.772735595703125, "learning_rate": 2.4726093141346498e-08, "loss": 1.4631, "step": 2004 }, { "epoch": 0.957497612225406, "grad_norm": 4.847362518310547, "learning_rate": 2.4186521546999964e-08, "loss": 0.9362, "step": 2005 }, { "epoch": 0.9579751671442216, "grad_norm": 4.5885725021362305, "learning_rate": 2.3652873716473455e-08, "loss": 
1.4706, "step": 2006 }, { "epoch": 0.9584527220630372, "grad_norm": 4.13776969909668, "learning_rate": 2.312515092660117e-08, "loss": 1.6504, "step": 2007 }, { "epoch": 0.958930276981853, "grad_norm": 5.218382835388184, "learning_rate": 2.2603354440041412e-08, "loss": 1.1673, "step": 2008 }, { "epoch": 0.9594078319006686, "grad_norm": 4.020388603210449, "learning_rate": 2.20874855052719e-08, "loss": 1.0894, "step": 2009 }, { "epoch": 0.9598853868194842, "grad_norm": 4.329903602600098, "learning_rate": 2.1577545356588625e-08, "loss": 1.7364, "step": 2010 }, { "epoch": 0.9603629417383, "grad_norm": 4.914790153503418, "learning_rate": 2.1073535214101436e-08, "loss": 1.6499, "step": 2011 }, { "epoch": 0.9608404966571156, "grad_norm": 4.253540992736816, "learning_rate": 2.0575456283732086e-08, "loss": 1.311, "step": 2012 }, { "epoch": 0.9613180515759312, "grad_norm": 6.226879119873047, "learning_rate": 2.00833097572109e-08, "loss": 2.0382, "step": 2013 }, { "epoch": 0.9617956064947469, "grad_norm": 5.070218563079834, "learning_rate": 1.9597096812073725e-08, "loss": 2.0847, "step": 2014 }, { "epoch": 0.9622731614135626, "grad_norm": 3.873964309692383, "learning_rate": 1.9116818611659703e-08, "loss": 1.3499, "step": 2015 }, { "epoch": 0.9627507163323782, "grad_norm": 4.883249759674072, "learning_rate": 1.8642476305108794e-08, "loss": 1.19, "step": 2016 }, { "epoch": 0.9632282712511939, "grad_norm": 4.980648517608643, "learning_rate": 1.817407102735702e-08, "loss": 1.2568, "step": 2017 }, { "epoch": 0.9637058261700095, "grad_norm": 5.162411212921143, "learning_rate": 1.7711603899136233e-08, "loss": 1.106, "step": 2018 }, { "epoch": 0.9641833810888252, "grad_norm": 4.110729217529297, "learning_rate": 1.725507602696991e-08, "loss": 1.1844, "step": 2019 }, { "epoch": 0.9646609360076409, "grad_norm": 4.54630184173584, "learning_rate": 1.6804488503171233e-08, "loss": 1.5459, "step": 2020 }, { "epoch": 0.9651384909264565, "grad_norm": 4.839606285095215, "learning_rate": 
1.635984240584032e-08, "loss": 1.3916, "step": 2021 }, { "epoch": 0.9656160458452722, "grad_norm": 4.208919048309326, "learning_rate": 1.592113879886059e-08, "loss": 0.7994, "step": 2022 }, { "epoch": 0.9660936007640879, "grad_norm": 5.688157081604004, "learning_rate": 1.5488378731897957e-08, "loss": 0.8008, "step": 2023 }, { "epoch": 0.9665711556829035, "grad_norm": 5.372372150421143, "learning_rate": 1.5061563240397482e-08, "loss": 1.7116, "step": 2024 }, { "epoch": 0.9670487106017192, "grad_norm": 4.9363627433776855, "learning_rate": 1.4640693345580603e-08, "loss": 1.5353, "step": 2025 }, { "epoch": 0.9675262655205349, "grad_norm": 5.201495170593262, "learning_rate": 1.42257700544432e-08, "loss": 1.0658, "step": 2026 }, { "epoch": 0.9680038204393505, "grad_norm": 5.200769424438477, "learning_rate": 1.3816794359752806e-08, "loss": 1.883, "step": 2027 }, { "epoch": 0.9684813753581661, "grad_norm": 4.4885334968566895, "learning_rate": 1.3413767240046949e-08, "loss": 1.3214, "step": 2028 }, { "epoch": 0.9689589302769819, "grad_norm": 4.3479204177856445, "learning_rate": 1.3016689659629545e-08, "loss": 1.793, "step": 2029 }, { "epoch": 0.9694364851957975, "grad_norm": 4.509471893310547, "learning_rate": 1.262556256857006e-08, "loss": 1.2172, "step": 2030 }, { "epoch": 0.9699140401146131, "grad_norm": 4.267385482788086, "learning_rate": 1.2240386902699353e-08, "loss": 1.118, "step": 2031 }, { "epoch": 0.9703915950334289, "grad_norm": 4.339064121246338, "learning_rate": 1.1861163583609948e-08, "loss": 1.6117, "step": 2032 }, { "epoch": 0.9708691499522445, "grad_norm": 4.153913497924805, "learning_rate": 1.1487893518651871e-08, "loss": 1.2485, "step": 2033 }, { "epoch": 0.9713467048710601, "grad_norm": 4.864563465118408, "learning_rate": 1.1120577600930716e-08, "loss": 1.0407, "step": 2034 }, { "epoch": 0.9718242597898759, "grad_norm": 4.94449520111084, "learning_rate": 1.0759216709306242e-08, "loss": 1.557, "step": 2035 }, { "epoch": 0.9723018147086915, "grad_norm": 
4.448370456695557, "learning_rate": 1.0403811708390165e-08, "loss": 1.7955, "step": 2036 }, { "epoch": 0.9727793696275072, "grad_norm": 5.577011585235596, "learning_rate": 1.0054363448543103e-08, "loss": 1.2425, "step": 2037 }, { "epoch": 0.9732569245463228, "grad_norm": 4.193991661071777, "learning_rate": 9.71087276587429e-09, "loss": 1.8259, "step": 2038 }, { "epoch": 0.9737344794651385, "grad_norm": 4.820496082305908, "learning_rate": 9.373340482237148e-09, "loss": 1.8384, "step": 2039 }, { "epoch": 0.9742120343839542, "grad_norm": 4.608538627624512, "learning_rate": 9.041767405229829e-09, "loss": 1.2179, "step": 2040 }, { "epoch": 0.9746895893027698, "grad_norm": 4.862077713012695, "learning_rate": 8.716154328192173e-09, "loss": 1.3559, "step": 2041 }, { "epoch": 0.9751671442215855, "grad_norm": 4.448031425476074, "learning_rate": 8.396502030202646e-09, "loss": 1.027, "step": 2042 }, { "epoch": 0.9756446991404012, "grad_norm": 5.102543830871582, "learning_rate": 8.082811276079184e-09, "loss": 1.5825, "step": 2043 }, { "epoch": 0.9761222540592168, "grad_norm": 3.781738758087158, "learning_rate": 7.775082816374735e-09, "loss": 1.1892, "step": 2044 }, { "epoch": 0.9765998089780324, "grad_norm": 4.259345531463623, "learning_rate": 7.473317387377e-09, "loss": 1.6128, "step": 2045 }, { "epoch": 0.9770773638968482, "grad_norm": null, "learning_rate": 7.17751571110592e-09, "loss": 1.2994, "step": 2046 }, { "epoch": 0.9775549188156638, "grad_norm": 5.708920001983643, "learning_rate": 7.17751571110592e-09, "loss": 1.3329, "step": 2047 }, { "epoch": 0.9780324737344794, "grad_norm": 4.51529598236084, "learning_rate": 6.887678495312578e-09, "loss": 1.9155, "step": 2048 }, { "epoch": 0.9785100286532952, "grad_norm": 4.769768714904785, "learning_rate": 6.603806433476967e-09, "loss": 1.4738, "step": 2049 }, { "epoch": 0.9789875835721108, "grad_norm": 4.824976444244385, "learning_rate": 6.325900204806612e-09, "loss": 1.6707, "step": 2050 }, { "epoch": 0.9794651384909264, 
"grad_norm": 5.2741923332214355, "learning_rate": 6.0539604742346216e-09, "loss": 2.072, "step": 2051 }, { "epoch": 0.9799426934097422, "grad_norm": 5.99643611907959, "learning_rate": 5.787987892418856e-09, "loss": 1.0336, "step": 2052 }, { "epoch": 0.9804202483285578, "grad_norm": 5.193567752838135, "learning_rate": 5.527983095739431e-09, "loss": 1.1738, "step": 2053 }, { "epoch": 0.9808978032473734, "grad_norm": 5.036185264587402, "learning_rate": 5.273946706297606e-09, "loss": 1.6744, "step": 2054 }, { "epoch": 0.9813753581661891, "grad_norm": 4.673053741455078, "learning_rate": 5.025879331914396e-09, "loss": 0.9919, "step": 2055 }, { "epoch": 0.9818529130850048, "grad_norm": 5.152437210083008, "learning_rate": 4.783781566129464e-09, "loss": 1.0891, "step": 2056 }, { "epoch": 0.9823304680038204, "grad_norm": 4.375527858734131, "learning_rate": 4.547653988198619e-09, "loss": 1.9347, "step": 2057 }, { "epoch": 0.9828080229226361, "grad_norm": 3.890050172805786, "learning_rate": 4.317497163093265e-09, "loss": 1.8128, "step": 2058 }, { "epoch": 0.9832855778414518, "grad_norm": 4.778881549835205, "learning_rate": 4.093311641499009e-09, "loss": 1.4586, "step": 2059 }, { "epoch": 0.9837631327602674, "grad_norm": 4.171104431152344, "learning_rate": 3.8750979598140006e-09, "loss": 1.2397, "step": 2060 }, { "epoch": 0.9842406876790831, "grad_norm": 5.122347354888916, "learning_rate": 3.6628566401483712e-09, "loss": 1.6013, "step": 2061 }, { "epoch": 0.9847182425978988, "grad_norm": 4.823969841003418, "learning_rate": 3.4565881903217414e-09, "loss": 1.4571, "step": 2062 }, { "epoch": 0.9851957975167144, "grad_norm": 4.397780895233154, "learning_rate": 3.2562931038629395e-09, "loss": 0.9823, "step": 2063 }, { "epoch": 0.9856733524355301, "grad_norm": 4.821561336517334, "learning_rate": 3.0619718600091717e-09, "loss": 1.4069, "step": 2064 }, { "epoch": 0.9861509073543457, "grad_norm": 4.7277116775512695, "learning_rate": 2.8736249237032444e-09, "loss": 1.4566, "step": 2065 
}, { "epoch": 0.9866284622731614, "grad_norm": 4.476099967956543, "learning_rate": 2.6912527455946745e-09, "loss": 1.0647, "step": 2066 }, { "epoch": 0.9871060171919771, "grad_norm": 4.993412971496582, "learning_rate": 2.514855762036639e-09, "loss": 1.3284, "step": 2067 }, { "epoch": 0.9875835721107927, "grad_norm": 5.218790054321289, "learning_rate": 2.3444343950859703e-09, "loss": 1.4102, "step": 2068 }, { "epoch": 0.9880611270296084, "grad_norm": 4.291370868682861, "learning_rate": 2.179989052501774e-09, "loss": 2.0856, "step": 2069 }, { "epoch": 0.9885386819484241, "grad_norm": 5.492192268371582, "learning_rate": 2.021520127745147e-09, "loss": 1.9195, "step": 2070 }, { "epoch": 0.9890162368672397, "grad_norm": 4.574422359466553, "learning_rate": 1.8690279999772355e-09, "loss": 1.327, "step": 2071 }, { "epoch": 0.9894937917860553, "grad_norm": 6.406982898712158, "learning_rate": 1.7225130340586815e-09, "loss": 1.5749, "step": 2072 }, { "epoch": 0.9899713467048711, "grad_norm": 5.211240291595459, "learning_rate": 1.5819755805490667e-09, "loss": 1.3367, "step": 2073 }, { "epoch": 0.9904489016236867, "grad_norm": 4.226899147033691, "learning_rate": 1.447415975706079e-09, "loss": 1.2819, "step": 2074 }, { "epoch": 0.9909264565425024, "grad_norm": 4.05258321762085, "learning_rate": 1.3188345414838487e-09, "loss": 1.5488, "step": 2075 }, { "epoch": 0.9914040114613181, "grad_norm": 4.930428981781006, "learning_rate": 1.1962315855335028e-09, "loss": 1.9734, "step": 2076 }, { "epoch": 0.9918815663801337, "grad_norm": 4.142110347747803, "learning_rate": 1.079607401201499e-09, "loss": 1.1493, "step": 2077 }, { "epoch": 0.9923591212989494, "grad_norm": 4.631879806518555, "learning_rate": 9.689622675287947e-10, "loss": 0.9029, "step": 2078 }, { "epoch": 0.9928366762177651, "grad_norm": 5.496512413024902, "learning_rate": 8.642964492508455e-10, "loss": 0.9594, "step": 2079 }, { "epoch": 0.9933142311365807, "grad_norm": 5.671935081481934, "learning_rate": 
7.656101967970509e-10, "loss": 1.5715, "step": 2080 }, { "epoch": 0.9937917860553964, "grad_norm": 4.859366416931152, "learning_rate": 6.729037462890886e-10, "loss": 1.0306, "step": 2081 }, { "epoch": 0.994269340974212, "grad_norm": 4.688819408416748, "learning_rate": 5.8617731954147e-10, "loss": 1.4429, "step": 2082 }, { "epoch": 0.9947468958930277, "grad_norm": 5.08845329284668, "learning_rate": 5.054311240604292e-10, "loss": 1.3806, "step": 2083 }, { "epoch": 0.9952244508118434, "grad_norm": 4.913906097412109, "learning_rate": 4.306653530439242e-10, "loss": 1.5041, "step": 2084 }, { "epoch": 0.995702005730659, "grad_norm": 4.508459568023682, "learning_rate": 3.6188018538024784e-10, "loss": 2.0708, "step": 2085 }, { "epoch": 0.9961795606494747, "grad_norm": 4.158637523651123, "learning_rate": 2.9907578564858374e-10, "loss": 1.4236, "step": 2086 }, { "epoch": 0.9966571155682904, "grad_norm": 4.701148509979248, "learning_rate": 2.422523041178959e-10, "loss": 1.4894, "step": 2087 }, { "epoch": 0.997134670487106, "grad_norm": 4.8486647605896, "learning_rate": 1.9140987674748367e-10, "loss": 1.336, "step": 2088 }, { "epoch": 0.9976122254059216, "grad_norm": 3.2829272747039795, "learning_rate": 1.4654862518531653e-10, "loss": 0.6804, "step": 2089 }, { "epoch": 0.9980897803247374, "grad_norm": 4.794292449951172, "learning_rate": 1.0766865676886673e-10, "loss": 1.8295, "step": 2090 }, { "epoch": 0.998567335243553, "grad_norm": 4.592539310455322, "learning_rate": 7.477006452455416e-11, "loss": 1.4372, "step": 2091 }, { "epoch": 0.9990448901623686, "grad_norm": 4.298410415649414, "learning_rate": 4.785292716746881e-11, "loss": 1.1748, "step": 2092 }, { "epoch": 0.9995224450811844, "grad_norm": 4.077103137969971, "learning_rate": 2.6917309100538136e-11, "loss": 0.8696, "step": 2093 }, { "epoch": 1.0, "grad_norm": 5.0203537940979, "learning_rate": 1.1963260415637224e-11, "loss": 1.2283, "step": 2094 } ], "logging_steps": 1, "max_steps": 2094, "num_input_tokens_seen": 0, 
"num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.278240588110234e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }