{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997860047078965, "eval_steps": 500, "global_step": 2920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00034239246736571797, "grad_norm": 0.044989924877882004, "learning_rate": 4.998287671232877e-05, "loss": 1.8288, "step": 1 }, { "epoch": 0.0006847849347314359, "grad_norm": 0.04361194744706154, "learning_rate": 4.9965753424657535e-05, "loss": 1.8895, "step": 2 }, { "epoch": 0.001027177402097154, "grad_norm": 0.04726405069231987, "learning_rate": 4.99486301369863e-05, "loss": 1.9231, "step": 3 }, { "epoch": 0.0013695698694628719, "grad_norm": 0.05400270223617554, "learning_rate": 4.993150684931507e-05, "loss": 1.8677, "step": 4 }, { "epoch": 0.0017119623368285898, "grad_norm": 0.05831163004040718, "learning_rate": 4.991438356164384e-05, "loss": 1.894, "step": 5 }, { "epoch": 0.002054354804194308, "grad_norm": 0.060255009680986404, "learning_rate": 4.989726027397261e-05, "loss": 1.8533, "step": 6 }, { "epoch": 0.0023967472715600256, "grad_norm": 0.06046730652451515, "learning_rate": 4.9880136986301375e-05, "loss": 1.8439, "step": 7 }, { "epoch": 0.0027391397389257438, "grad_norm": 0.06515906006097794, "learning_rate": 4.986301369863014e-05, "loss": 1.8311, "step": 8 }, { "epoch": 0.0030815322062914615, "grad_norm": 0.06696534901857376, "learning_rate": 4.984589041095891e-05, "loss": 1.7224, "step": 9 }, { "epoch": 0.0034239246736571796, "grad_norm": 0.0728447362780571, "learning_rate": 4.9828767123287674e-05, "loss": 1.8323, "step": 10 }, { "epoch": 0.0037663171410228973, "grad_norm": 0.06521787494421005, "learning_rate": 4.981164383561644e-05, "loss": 1.8446, "step": 11 }, { "epoch": 0.004108709608388616, "grad_norm": 0.06902988255023956, "learning_rate": 4.979452054794521e-05, "loss": 1.9744, "step": 12 }, { "epoch": 0.0044511020757543335, "grad_norm": 0.08189862221479416, "learning_rate": 4.9777397260273974e-05, "loss": 1.8201, "step": 13 }, { "epoch": 0.004793494543120051, "grad_norm": 0.06420566141605377, "learning_rate": 4.976027397260275e-05, "loss": 1.7122, "step": 14 }, { "epoch": 0.005135887010485769, "grad_norm": 0.062082137912511826, "learning_rate": 4.9743150684931514e-05, "loss": 1.7122, "step": 15 }, { "epoch": 0.0054782794778514875, "grad_norm": 0.06442763656377792, "learning_rate": 4.972602739726028e-05, "loss": 1.6049, "step": 16 }, { "epoch": 0.005820671945217205, "grad_norm": 0.06138802319765091, "learning_rate": 4.970890410958905e-05, "loss": 1.7511, "step": 17 }, { "epoch": 0.006163064412582923, "grad_norm": 0.0657878965139389, "learning_rate": 4.969178082191781e-05, "loss": 1.8273, "step": 18 }, { "epoch": 0.0065054568799486415, "grad_norm": 0.06280089914798737, "learning_rate": 4.967465753424658e-05, "loss": 1.6886, "step": 19 }, { "epoch": 0.006847849347314359, "grad_norm": 0.051142092794179916, "learning_rate": 4.9657534246575346e-05, "loss": 1.7099, "step": 20 }, { "epoch": 0.007190241814680077, "grad_norm": 0.04863899201154709, "learning_rate": 4.964041095890411e-05, "loss": 1.7435, "step": 21 }, { "epoch": 0.0075326342820457946, "grad_norm": 0.04535438492894173, "learning_rate": 4.962328767123288e-05, "loss": 1.8073, "step": 22 }, { "epoch": 0.007875026749411512, "grad_norm": 0.041395559906959534, "learning_rate": 4.9606164383561646e-05, "loss": 1.7856, "step": 23 }, { "epoch": 0.008217419216777232, "grad_norm": 0.042886726558208466, "learning_rate": 4.958904109589041e-05, "loss": 1.8595, "step": 24 }, { "epoch": 0.00855981168414295, "grad_norm": 0.03878549858927727, "learning_rate": 4.957191780821918e-05, "loss": 1.7362, "step": 25 }, { "epoch": 0.008902204151508667, "grad_norm": 0.047440607100725174, "learning_rate": 4.9554794520547946e-05, "loss": 1.7164, "step": 26 }, { "epoch": 0.009244596618874385, "grad_norm": 0.03871932253241539, "learning_rate": 4.953767123287671e-05, "loss": 1.7105, "step": 27 }, { "epoch": 0.009586989086240102, "grad_norm": 0.04232623428106308, "learning_rate": 4.952054794520548e-05, "loss": 1.646, "step": 28 }, { "epoch": 0.00992938155360582, "grad_norm": 0.04165391996502876, "learning_rate": 4.9503424657534245e-05, "loss": 1.6921, "step": 29 }, { "epoch": 0.010271774020971538, "grad_norm": 0.03977831453084946, "learning_rate": 4.948630136986301e-05, "loss": 1.6299, "step": 30 }, { "epoch": 0.010614166488337257, "grad_norm": 0.0354389026761055, "learning_rate": 4.946917808219178e-05, "loss": 1.711, "step": 31 }, { "epoch": 0.010956558955702975, "grad_norm": 0.036333948373794556, "learning_rate": 4.945205479452055e-05, "loss": 1.7151, "step": 32 }, { "epoch": 0.011298951423068693, "grad_norm": 0.04054725542664528, "learning_rate": 4.943493150684932e-05, "loss": 1.6772, "step": 33 }, { "epoch": 0.01164134389043441, "grad_norm": 0.037162214517593384, "learning_rate": 4.9417808219178084e-05, "loss": 1.6444, "step": 34 }, { "epoch": 0.011983736357800128, "grad_norm": 0.05832388252019882, "learning_rate": 4.940068493150685e-05, "loss": 1.622, "step": 35 }, { "epoch": 0.012326128825165846, "grad_norm": 0.04889669269323349, "learning_rate": 4.938356164383562e-05, "loss": 1.7307, "step": 36 }, { "epoch": 0.012668521292531564, "grad_norm": 0.039188310503959656, "learning_rate": 4.9366438356164384e-05, "loss": 1.7082, "step": 37 }, { "epoch": 0.013010913759897283, "grad_norm": 0.037781357765197754, "learning_rate": 4.934931506849315e-05, "loss": 1.7741, "step": 38 }, { "epoch": 0.013353306227263, "grad_norm": 0.04408564418554306, "learning_rate": 4.933219178082192e-05, "loss": 1.6136, "step": 39 }, { "epoch": 0.013695698694628718, "grad_norm": 0.03870445489883423, "learning_rate": 4.9315068493150684e-05, "loss": 1.5931, "step": 40 }, { "epoch": 0.014038091161994436, "grad_norm": 0.04507477954030037, "learning_rate": 4.929794520547946e-05, "loss": 1.7544, "step": 41 }, { "epoch": 0.014380483629360154, "grad_norm": 0.039748795330524445, "learning_rate": 4.9280821917808223e-05, "loss": 1.6324, "step": 42 }, { "epoch": 0.014722876096725871, "grad_norm": 0.036341771483421326, "learning_rate": 4.926369863013699e-05, "loss": 1.7343, "step": 43 }, { "epoch": 0.015065268564091589, "grad_norm": 0.03865649178624153, "learning_rate": 4.9246575342465756e-05, "loss": 1.6902, "step": 44 }, { "epoch": 0.015407661031457309, "grad_norm": 0.038440950214862823, "learning_rate": 4.922945205479452e-05, "loss": 1.6704, "step": 45 }, { "epoch": 0.015750053498823025, "grad_norm": 0.0386311411857605, "learning_rate": 4.921232876712329e-05, "loss": 1.6987, "step": 46 }, { "epoch": 0.016092445966188744, "grad_norm": 0.04216955229640007, "learning_rate": 4.9195205479452056e-05, "loss": 1.8411, "step": 47 }, { "epoch": 0.016434838433554463, "grad_norm": 0.038235925137996674, "learning_rate": 4.917808219178082e-05, "loss": 1.7007, "step": 48 }, { "epoch": 0.01677723090092018, "grad_norm": 0.04005911946296692, "learning_rate": 4.9160958904109596e-05, "loss": 1.6546, "step": 49 }, { "epoch": 0.0171196233682859, "grad_norm": 0.04313301667571068, "learning_rate": 4.914383561643836e-05, "loss": 1.7734, "step": 50 }, { "epoch": 0.017462015835651615, "grad_norm": 0.041396427899599075, "learning_rate": 4.912671232876713e-05, "loss": 1.6739, "step": 51 }, { "epoch": 0.017804408303017334, "grad_norm": 0.04136648029088974, "learning_rate": 4.9109589041095895e-05, "loss": 1.6749, "step": 52 }, { "epoch": 0.01814680077038305, "grad_norm": 0.03975985571742058, "learning_rate": 4.909246575342466e-05, "loss": 1.7457, "step": 53 }, { "epoch": 0.01848919323774877, "grad_norm": 0.06187222898006439, "learning_rate": 4.907534246575343e-05, "loss": 1.7686, "step": 54 }, { "epoch": 0.01883158570511449, "grad_norm": 0.05079127103090286, "learning_rate": 4.9058219178082195e-05, "loss": 1.7249, "step": 55 }, { "epoch": 0.019173978172480205, "grad_norm": 0.043534740805625916, "learning_rate": 4.904109589041096e-05, "loss": 1.718, "step": 56 }, { "epoch": 0.019516370639845924, "grad_norm": 0.052709322422742844, "learning_rate": 4.902397260273973e-05, "loss": 1.6289, "step": 57 }, { "epoch": 0.01985876310721164, "grad_norm": 0.03929607942700386, "learning_rate": 4.90068493150685e-05, "loss": 1.8459, "step": 58 }, { "epoch": 0.02020115557457736, "grad_norm": 0.04129493609070778, "learning_rate": 4.898972602739727e-05, "loss": 1.8419, "step": 59 }, { "epoch": 0.020543548041943076, "grad_norm": 0.04822686314582825, "learning_rate": 4.8972602739726034e-05, "loss": 1.7731, "step": 60 }, { "epoch": 0.020885940509308795, "grad_norm": 0.05497492104768753, "learning_rate": 4.89554794520548e-05, "loss": 1.6869, "step": 61 }, { "epoch": 0.021228332976674515, "grad_norm": 0.05401976779103279, "learning_rate": 4.893835616438357e-05, "loss": 1.7714, "step": 62 }, { "epoch": 0.02157072544404023, "grad_norm": 0.061098527163267136, "learning_rate": 4.8921232876712334e-05, "loss": 1.7206, "step": 63 }, { "epoch": 0.02191311791140595, "grad_norm": 0.04993405565619469, "learning_rate": 4.89041095890411e-05, "loss": 1.7562, "step": 64 }, { "epoch": 0.022255510378771666, "grad_norm": 0.047674760222435, "learning_rate": 4.888698630136987e-05, "loss": 1.598, "step": 65 }, { "epoch": 0.022597902846137385, "grad_norm": 0.04686673730611801, "learning_rate": 4.8869863013698633e-05, "loss": 1.6847, "step": 66 }, { "epoch": 0.0229402953135031, "grad_norm": 0.05058549344539642, "learning_rate": 4.88527397260274e-05, "loss": 1.6378, "step": 67 }, { "epoch": 0.02328268778086882, "grad_norm": 0.062044333666563034, "learning_rate": 4.8835616438356167e-05, "loss": 1.6524, "step": 68 }, { "epoch": 0.02362508024823454, "grad_norm": 0.04315562546253204, "learning_rate": 4.881849315068493e-05, "loss": 1.6563, "step": 69 }, { "epoch": 0.023967472715600256, "grad_norm": 0.04709568992257118, "learning_rate": 4.88013698630137e-05, "loss": 1.5854, "step": 70 }, { "epoch": 0.024309865182965976, "grad_norm": 0.04821576923131943, "learning_rate": 4.8784246575342466e-05, "loss": 1.6929, "step": 71 }, { "epoch": 0.02465225765033169, "grad_norm": 0.04530227184295654, "learning_rate": 4.876712328767123e-05, "loss": 1.7395, "step": 72 }, { "epoch": 0.02499465011769741, "grad_norm": 0.05112622678279877, "learning_rate": 4.875e-05, "loss": 1.7844, "step": 73 }, { "epoch": 0.025337042585063127, "grad_norm": 0.046875759959220886, "learning_rate": 4.8732876712328766e-05, "loss": 1.7809, "step": 74 }, { "epoch": 0.025679435052428846, "grad_norm": 0.04776829108595848, "learning_rate": 4.871575342465753e-05, "loss": 1.6859, "step": 75 }, { "epoch": 0.026021827519794566, "grad_norm": 0.05074959993362427, "learning_rate": 4.8698630136986305e-05, "loss": 1.6416, "step": 76 }, { "epoch": 0.026364219987160282, "grad_norm": 0.04781273007392883, "learning_rate": 4.868150684931507e-05, "loss": 1.7135, "step": 77 }, { "epoch": 0.026706612454526, "grad_norm": 0.051603902131319046, "learning_rate": 4.866438356164384e-05, "loss": 1.5924, "step": 78 }, { "epoch": 0.027049004921891717, "grad_norm": 0.05113664269447327, "learning_rate": 4.8647260273972605e-05, "loss": 1.7243, "step": 79 }, { "epoch": 0.027391397389257437, "grad_norm": 0.048848897218704224, "learning_rate": 4.863013698630137e-05, "loss": 1.6385, "step": 80 }, { "epoch": 0.027733789856623153, "grad_norm": 0.049598220735788345, "learning_rate": 4.861301369863014e-05, "loss": 1.675, "step": 81 }, { "epoch": 0.028076182323988872, "grad_norm": 0.047608450055122375, "learning_rate": 4.8595890410958905e-05, "loss": 1.6868, "step": 82 }, { "epoch": 0.02841857479135459, "grad_norm": 0.04578407108783722, "learning_rate": 4.857876712328767e-05, "loss": 1.7441, "step": 83 }, { "epoch": 0.028760967258720307, "grad_norm": 0.05291769281029701, "learning_rate": 4.856164383561644e-05, "loss": 1.6903, "step": 84 }, { "epoch": 0.029103359726086027, "grad_norm": 0.051181238144636154, "learning_rate": 4.854452054794521e-05, "loss": 1.6583, "step": 85 }, { "epoch": 0.029445752193451743, "grad_norm": 0.05467492341995239, "learning_rate": 4.852739726027398e-05, "loss": 1.6411, "step": 86 }, { "epoch": 0.029788144660817462, "grad_norm": 0.056172505021095276, "learning_rate": 4.8510273972602744e-05, "loss": 1.6881, "step": 87 }, { "epoch": 0.030130537128183178, "grad_norm": 0.05360399931669235, "learning_rate": 4.849315068493151e-05, "loss": 1.7309, "step": 88 }, { "epoch": 0.030472929595548898, "grad_norm": 0.05739091336727142, "learning_rate": 4.847602739726028e-05, "loss": 1.6496, "step": 89 }, { "epoch": 0.030815322062914617, "grad_norm": 0.06494379043579102, "learning_rate": 4.8458904109589044e-05, "loss": 1.5089, "step": 90 }, { "epoch": 0.031157714530280333, "grad_norm": 0.05443499982357025, "learning_rate": 4.844178082191781e-05, "loss": 1.6422, "step": 91 }, { "epoch": 0.03150010699764605, "grad_norm": 0.0603407546877861, "learning_rate": 4.8424657534246577e-05, "loss": 1.6706, "step": 92 }, { "epoch": 0.03184249946501177, "grad_norm": 0.058132074773311615, "learning_rate": 4.840753424657534e-05, "loss": 1.7117, "step": 93 }, { "epoch": 0.03218489193237749, "grad_norm": 0.06787832826375961, "learning_rate": 4.8390410958904116e-05, "loss": 1.6994, "step": 94 }, { "epoch": 0.03252728439974321, "grad_norm": 0.051144689321517944, "learning_rate": 4.837328767123288e-05, "loss": 1.6373, "step": 95 }, { "epoch": 0.03286967686710893, "grad_norm": 0.052316680550575256, "learning_rate": 4.835616438356165e-05, "loss": 1.6943, "step": 96 }, { "epoch": 0.03321206933447464, "grad_norm": 0.06200467795133591, "learning_rate": 4.8339041095890416e-05, "loss": 1.7685, "step": 97 }, { "epoch": 0.03355446180184036, "grad_norm": 0.06054258346557617, "learning_rate": 4.832191780821918e-05, "loss": 1.7789, "step": 98 }, { "epoch": 0.03389685426920608, "grad_norm": 0.05892505496740341, "learning_rate": 4.830479452054795e-05, "loss": 1.6286, "step": 99 }, { "epoch": 0.0342392467365718, "grad_norm": 0.06003416329622269, "learning_rate": 4.8287671232876716e-05, "loss": 1.6932, "step": 100 }, { "epoch": 0.03458163920393751, "grad_norm": 0.05738883838057518, "learning_rate": 4.827054794520548e-05, "loss": 1.6171, "step": 101 }, { "epoch": 0.03492403167130323, "grad_norm": 0.053451597690582275, "learning_rate": 4.825342465753425e-05, "loss": 1.7531, "step": 102 }, { "epoch": 0.03526642413866895, "grad_norm": 0.06163148581981659, "learning_rate": 4.8236301369863015e-05, "loss": 1.7133, "step": 103 }, { "epoch": 0.03560881660603467, "grad_norm": 0.16779644787311554, "learning_rate": 4.821917808219178e-05, "loss": 1.6717, "step": 104 }, { "epoch": 0.03595120907340039, "grad_norm": 0.05898606404662132, "learning_rate": 4.820205479452055e-05, "loss": 1.6451, "step": 105 }, { "epoch": 0.0362936015407661, "grad_norm": 0.055019885301589966, "learning_rate": 4.8184931506849315e-05, "loss": 1.7328, "step": 106 }, { "epoch": 0.03663599400813182, "grad_norm": 0.0559951514005661, "learning_rate": 4.816780821917808e-05, "loss": 1.7777, "step": 107 }, { "epoch": 0.03697838647549754, "grad_norm": 0.05110043287277222, "learning_rate": 4.815068493150685e-05, "loss": 1.6522, "step": 108 }, { "epoch": 0.03732077894286326, "grad_norm": 0.06068919971585274, "learning_rate": 4.8133561643835614e-05, "loss": 1.6044, "step": 109 }, { "epoch": 0.03766317141022898, "grad_norm": 0.05809701979160309, "learning_rate": 4.811643835616438e-05, "loss": 1.5902, "step": 110 }, { "epoch": 0.03800556387759469, "grad_norm": 0.05732065439224243, "learning_rate": 4.809931506849315e-05, "loss": 1.7776, "step": 111 }, { "epoch": 0.03834795634496041, "grad_norm": 0.06103517487645149, "learning_rate": 4.808219178082192e-05, "loss": 1.6509, "step": 112 }, { "epoch": 0.03869034881232613, "grad_norm": 0.06855235248804092, "learning_rate": 4.806506849315069e-05, "loss": 1.7284, "step": 113 }, { "epoch": 0.03903274127969185, "grad_norm": 0.058819565922021866, "learning_rate": 4.8047945205479454e-05, "loss": 1.6754, "step": 114 }, { "epoch": 0.03937513374705756, "grad_norm": 0.06472082436084747, "learning_rate": 4.803082191780822e-05, "loss": 1.6985, "step": 115 }, { "epoch": 0.03971752621442328, "grad_norm": 0.08509037643671036, "learning_rate": 4.801369863013699e-05, "loss": 1.5554, "step": 116 }, { "epoch": 0.040059918681789, "grad_norm": 0.05812961235642433, "learning_rate": 4.799657534246575e-05, "loss": 1.6599, "step": 117 }, { "epoch": 0.04040231114915472, "grad_norm": 0.0704309344291687, "learning_rate": 4.797945205479452e-05, "loss": 1.5584, "step": 118 }, { "epoch": 0.04074470361652044, "grad_norm": 0.06815467774868011, "learning_rate": 4.7962328767123286e-05, "loss": 1.7172, "step": 119 }, { "epoch": 0.04108709608388615, "grad_norm": 0.06125577911734581, "learning_rate": 4.794520547945205e-05, "loss": 1.6545, "step": 120 }, { "epoch": 0.04142948855125187, "grad_norm": 0.06108137592673302, "learning_rate": 4.7928082191780826e-05, "loss": 1.7255, "step": 121 }, { "epoch": 0.04177188101861759, "grad_norm": 0.057873982936143875, "learning_rate": 4.791095890410959e-05, "loss": 1.6791, "step": 122 }, { "epoch": 0.04211427348598331, "grad_norm": 0.061619874089956284, "learning_rate": 4.789383561643836e-05, "loss": 1.7557, "step": 123 }, { "epoch": 0.04245666595334903, "grad_norm": 0.06421203911304474, "learning_rate": 4.7876712328767126e-05, "loss": 1.7866, "step": 124 }, { "epoch": 0.04279905842071474, "grad_norm": 0.07434695959091187, "learning_rate": 4.785958904109589e-05, "loss": 1.5494, "step": 125 }, { "epoch": 0.04314145088808046, "grad_norm": 0.06087030470371246, "learning_rate": 4.784246575342466e-05, "loss": 1.5731, "step": 126 }, { "epoch": 0.04348384335544618, "grad_norm": 0.060027774423360825, "learning_rate": 4.7825342465753425e-05, "loss": 1.718, "step": 127 }, { "epoch": 0.0438262358228119, "grad_norm": 0.06149870157241821, "learning_rate": 4.780821917808219e-05, "loss": 1.6425, "step": 128 }, { "epoch": 0.04416862829017762, "grad_norm": 0.06767728179693222, "learning_rate": 4.779109589041096e-05, "loss": 1.6179, "step": 129 }, { "epoch": 0.04451102075754333, "grad_norm": 0.06106947734951973, "learning_rate": 4.777397260273973e-05, "loss": 1.681, "step": 130 }, { "epoch": 0.04485341322490905, "grad_norm": 0.05995972454547882, "learning_rate": 4.77568493150685e-05, "loss": 1.7151, "step": 131 }, { "epoch": 0.04519580569227477, "grad_norm": 0.06545582413673401, "learning_rate": 4.7739726027397265e-05, "loss": 1.6981, "step": 132 }, { "epoch": 0.04553819815964049, "grad_norm": 0.06059694290161133, "learning_rate": 4.772260273972603e-05, "loss": 1.6698, "step": 133 }, { "epoch": 0.0458805906270062, "grad_norm": 0.07256224751472473, "learning_rate": 4.77054794520548e-05, "loss": 1.6813, "step": 134 }, { "epoch": 0.04622298309437192, "grad_norm": 0.06272713840007782, "learning_rate": 4.7688356164383564e-05, "loss": 1.5688, "step": 135 }, { "epoch": 0.04656537556173764, "grad_norm": 0.07193107903003693, "learning_rate": 4.767123287671233e-05, "loss": 1.6795, "step": 136 }, { "epoch": 0.04690776802910336, "grad_norm": 0.0784289538860321, "learning_rate": 4.76541095890411e-05, "loss": 1.731, "step": 137 }, { "epoch": 0.04725016049646908, "grad_norm": 0.06658574938774109, "learning_rate": 4.763698630136987e-05, "loss": 1.7137, "step": 138 }, { "epoch": 0.04759255296383479, "grad_norm": 0.06280863285064697, "learning_rate": 4.761986301369864e-05, "loss": 1.7219, "step": 139 }, { "epoch": 0.04793494543120051, "grad_norm": 0.06461343169212341, "learning_rate": 4.7602739726027403e-05, "loss": 1.6748, "step": 140 }, { "epoch": 0.04827733789856623, "grad_norm": 0.07128650695085526, "learning_rate": 4.758561643835617e-05, "loss": 1.7625, "step": 141 }, { "epoch": 0.04861973036593195, "grad_norm": 0.06003497913479805, "learning_rate": 4.7568493150684937e-05, "loss": 1.7069, "step": 142 }, { "epoch": 0.04896212283329767, "grad_norm": 0.07442709058523178, "learning_rate": 4.75513698630137e-05, "loss": 1.7282, "step": 143 }, { "epoch": 0.04930451530066338, "grad_norm": 0.06686113029718399, "learning_rate": 4.753424657534247e-05, "loss": 1.719, "step": 144 }, { "epoch": 0.0496469077680291, "grad_norm": 0.07473582029342651, "learning_rate": 4.7517123287671236e-05, "loss": 1.5064, "step": 145 }, { "epoch": 0.04998930023539482, "grad_norm": 0.06369954347610474, "learning_rate": 4.75e-05, "loss": 1.7985, "step": 146 }, { "epoch": 0.05033169270276054, "grad_norm": 0.06884881854057312, "learning_rate": 4.748287671232877e-05, "loss": 1.6698, "step": 147 }, { "epoch": 0.050674085170126254, "grad_norm": 0.06404735147953033, "learning_rate": 4.7465753424657536e-05, "loss": 1.7071, "step": 148 }, { "epoch": 0.05101647763749197, "grad_norm": 0.319232702255249, "learning_rate": 4.74486301369863e-05, "loss": 1.5672, "step": 149 }, { "epoch": 0.05135887010485769, "grad_norm": 0.0745416209101677, "learning_rate": 4.743150684931507e-05, "loss": 1.6196, "step": 150 }, { "epoch": 0.05170126257222341, "grad_norm": 0.06818738579750061, "learning_rate": 4.7414383561643835e-05, "loss": 1.5583, "step": 151 }, { "epoch": 0.05204365503958913, "grad_norm": 0.06681168079376221, "learning_rate": 4.73972602739726e-05, "loss": 1.6892, "step": 152 }, { "epoch": 0.052386047506954844, "grad_norm": 0.06684134155511856, "learning_rate": 4.738013698630137e-05, "loss": 1.6204, "step": 153 }, { "epoch": 0.052728439974320564, "grad_norm": 0.06974063068628311, "learning_rate": 4.7363013698630135e-05, "loss": 1.7142, "step": 154 }, { "epoch": 0.05307083244168628, "grad_norm": 0.0697052851319313, "learning_rate": 4.73458904109589e-05, "loss": 1.7092, "step": 155 }, { "epoch": 0.053413224909052, "grad_norm": 0.07622416317462921, "learning_rate": 4.7328767123287675e-05, "loss": 1.7817, "step": 156 }, { "epoch": 0.05375561737641772, "grad_norm": 0.06680024415254593, "learning_rate": 4.731164383561644e-05, "loss": 1.6472, "step": 157 }, { "epoch": 0.054098009843783434, "grad_norm": 0.07718981802463531, "learning_rate": 4.729452054794521e-05, "loss": 1.7127, "step": 158 }, { "epoch": 0.054440402311149154, "grad_norm": 0.08253922313451767, "learning_rate": 4.7277397260273974e-05, "loss": 1.6536, "step": 159 }, { "epoch": 0.05478279477851487, "grad_norm": 0.07616232335567474, "learning_rate": 4.726027397260274e-05, "loss": 1.5759, "step": 160 }, { "epoch": 0.05512518724588059, "grad_norm": 0.07066119462251663, "learning_rate": 4.724315068493151e-05, "loss": 1.6764, "step": 161 }, { "epoch": 0.055467579713246305, "grad_norm": 0.07212837040424347, "learning_rate": 4.7226027397260274e-05, "loss": 1.7029, "step": 162 }, { "epoch": 0.055809972180612025, "grad_norm": 0.0749269500374794, "learning_rate": 4.720890410958904e-05, "loss": 1.6925, "step": 163 }, { "epoch": 0.056152364647977744, "grad_norm": 0.079817034304142, "learning_rate": 4.719178082191781e-05, "loss": 1.6371, "step": 164 }, { "epoch": 0.056494757115343464, "grad_norm": 0.07521949708461761, "learning_rate": 4.717465753424658e-05, "loss": 1.7268, "step": 165 }, { "epoch": 0.05683714958270918, "grad_norm": 0.07110805064439774, "learning_rate": 4.7157534246575347e-05, "loss": 1.5868, "step": 166 }, { "epoch": 0.057179542050074895, "grad_norm": 0.07461591809988022, "learning_rate": 4.714041095890411e-05, "loss": 1.6549, "step": 167 }, { "epoch": 0.057521934517440615, "grad_norm": 0.07838895171880722, "learning_rate": 4.712328767123288e-05, "loss": 1.5814, "step": 168 }, { "epoch": 0.057864326984806334, "grad_norm": 0.07968944311141968, "learning_rate": 4.7106164383561646e-05, "loss": 1.7346, "step": 169 }, { "epoch": 0.058206719452172054, "grad_norm": 0.07520300149917603, "learning_rate": 4.708904109589041e-05, "loss": 1.7332, "step": 170 }, { "epoch": 0.05854911191953777, "grad_norm": 0.08485481142997742, "learning_rate": 4.707191780821918e-05, "loss": 1.5215, "step": 171 }, { "epoch": 0.058891504386903486, "grad_norm": 0.07591280341148376, "learning_rate": 4.7054794520547946e-05, "loss": 1.6778, "step": 172 }, { "epoch": 0.059233896854269205, "grad_norm": 0.0666799545288086, "learning_rate": 4.703767123287671e-05, "loss": 1.6996, "step": 173 }, { "epoch": 0.059576289321634925, "grad_norm": 0.0809159204363823, "learning_rate": 4.7020547945205486e-05, "loss": 1.7008, "step": 174 }, { "epoch": 0.059918681789000644, "grad_norm": 0.07135330140590668, "learning_rate": 4.700342465753425e-05, "loss": 1.689, "step": 175 }, { "epoch": 0.060261074256366357, "grad_norm": 0.07072730362415314, "learning_rate": 4.698630136986302e-05, "loss": 1.6713, "step": 176 }, { "epoch": 0.060603466723732076, "grad_norm": 0.07544711977243423, "learning_rate": 4.6969178082191785e-05, "loss": 1.6209, "step": 177 }, { "epoch": 0.060945859191097795, "grad_norm": 0.0747295618057251, "learning_rate": 4.695205479452055e-05, "loss": 1.6755, "step": 178 }, { "epoch": 0.061288251658463515, "grad_norm": 0.08111903071403503, "learning_rate": 4.693493150684932e-05, "loss": 1.7464, "step": 179 }, { "epoch": 0.061630644125829234, "grad_norm": 0.0905933529138565, "learning_rate": 4.6917808219178085e-05, "loss": 1.5047, "step": 180 }, { "epoch": 0.06197303659319495, "grad_norm": 0.07301922142505646, "learning_rate": 4.690068493150685e-05, "loss": 1.6021, "step": 181 }, { "epoch": 0.062315429060560666, "grad_norm": 0.08234768360853195, "learning_rate": 4.688356164383562e-05, "loss": 1.7168, "step": 182 }, { "epoch": 0.06265782152792639, "grad_norm": 0.07387898862361908, "learning_rate": 4.686643835616439e-05, "loss": 1.7891, "step": 183 }, { "epoch": 0.0630002139952921, "grad_norm": 0.08666734397411346, "learning_rate": 4.684931506849316e-05, "loss": 1.6614, "step": 184 }, { "epoch": 0.06334260646265782, "grad_norm": 0.07451646775007248, "learning_rate": 4.6832191780821924e-05, "loss": 1.7663, "step": 185 }, { "epoch": 0.06368499893002354, "grad_norm": 0.07771901786327362, "learning_rate": 4.681506849315069e-05, "loss": 1.6811, "step": 186 }, { "epoch": 0.06402739139738926, "grad_norm": 0.07586071640253067, "learning_rate": 4.679794520547946e-05, "loss": 1.6689, "step": 187 }, { "epoch": 0.06436978386475498, "grad_norm": 0.0800698846578598, "learning_rate": 4.6780821917808224e-05, "loss": 1.5705, "step": 188 }, { "epoch": 0.06471217633212069, "grad_norm": 0.07333248853683472, "learning_rate": 4.676369863013699e-05, "loss": 1.725, "step": 189 }, { "epoch": 0.06505456879948641, "grad_norm": 0.10060349851846695, "learning_rate": 4.674657534246576e-05, "loss": 1.5723, "step": 190 }, { "epoch": 0.06539696126685213, "grad_norm": 0.07364695519208908, "learning_rate": 4.672945205479452e-05, "loss": 1.5372, "step": 191 }, { "epoch": 0.06573935373421785, "grad_norm": 0.08228003978729248, "learning_rate": 4.671232876712329e-05, "loss": 1.6038, "step": 192 }, { "epoch": 0.06608174620158357, "grad_norm": 0.08212967216968536, "learning_rate": 4.6695205479452056e-05, "loss": 1.6189, "step": 193 }, { "epoch": 0.06642413866894928, "grad_norm": 0.08098340779542923, "learning_rate": 4.667808219178082e-05, "loss": 1.6018, "step": 194 }, { "epoch": 0.066766531136315, "grad_norm": 0.07250115275382996, "learning_rate": 4.666095890410959e-05, "loss": 1.6771, "step": 195 }, { "epoch": 0.06710892360368072, "grad_norm": 0.0775756984949112, "learning_rate": 4.6643835616438356e-05, "loss": 1.6629, "step": 196 }, { "epoch": 0.06745131607104644, "grad_norm": 0.07581441104412079, "learning_rate": 4.662671232876712e-05, "loss": 1.5676, "step": 197 }, { "epoch": 0.06779370853841216, "grad_norm": 0.07663130015134811, "learning_rate": 4.660958904109589e-05, "loss": 1.5879, "step": 198 }, { "epoch": 0.06813610100577787, "grad_norm": 0.08263234049081802, "learning_rate": 4.6592465753424655e-05, "loss": 1.5975, "step": 199 }, { "epoch": 0.0684784934731436, "grad_norm": 0.0770161971449852, "learning_rate": 4.657534246575342e-05, "loss": 1.6743, "step": 200 }, { "epoch": 0.06882088594050931, "grad_norm": 0.08472053706645966, "learning_rate": 4.6558219178082195e-05, "loss": 1.7073, "step": 201 }, { "epoch": 0.06916327840787502, "grad_norm": 0.08434753865003586, "learning_rate": 4.654109589041096e-05, "loss": 1.6879, "step": 202 }, { "epoch": 0.06950567087524075, "grad_norm": 0.0869380533695221, "learning_rate": 4.652397260273973e-05, "loss": 1.6419, "step": 203 }, { "epoch": 0.06984806334260646, "grad_norm": 0.0837620347738266, "learning_rate": 4.6506849315068495e-05, "loss": 1.7286, "step": 204 }, { "epoch": 0.07019045580997219, "grad_norm": 0.08047568053007126, "learning_rate": 4.648972602739726e-05, "loss": 1.7161, "step": 205 }, { "epoch": 0.0705328482773379, "grad_norm": 0.0768761858344078, "learning_rate": 4.647260273972603e-05, "loss": 1.6777, "step": 206 }, { "epoch": 0.07087524074470361, "grad_norm": 0.0845128521323204, "learning_rate": 4.6455479452054794e-05, "loss": 1.6839, "step": 207 }, { "epoch": 0.07121763321206934, "grad_norm": 0.08565199375152588, "learning_rate": 4.643835616438356e-05, "loss": 1.6751, "step": 208 }, { "epoch": 0.07156002567943505, "grad_norm": 0.08827394992113113, "learning_rate": 4.642123287671233e-05, "loss": 1.6061, "step": 209 }, { "epoch": 0.07190241814680078, "grad_norm": 0.08402001857757568, "learning_rate": 4.64041095890411e-05, "loss": 1.7034, "step": 210 }, { "epoch": 0.07224481061416649, "grad_norm": 0.07990819960832596, "learning_rate": 4.638698630136987e-05, "loss": 1.7184, "step": 211 }, { "epoch": 0.0725872030815322, "grad_norm": 0.08350301533937454, "learning_rate": 4.6369863013698634e-05, "loss": 1.759, "step": 212 }, { "epoch": 0.07292959554889793, "grad_norm": 0.07997184991836548, "learning_rate": 4.63527397260274e-05, "loss": 1.6276, "step": 213 }, { "epoch": 0.07327198801626364, "grad_norm": 0.08245536684989929, "learning_rate": 4.633561643835617e-05, "loss": 1.7333, "step": 214 }, { "epoch": 0.07361438048362937, "grad_norm": 0.08825383335351944, "learning_rate": 4.631849315068493e-05, "loss": 1.6193, "step": 215 }, { "epoch": 0.07395677295099508, "grad_norm": 0.08539293706417084, "learning_rate": 4.63013698630137e-05, "loss": 1.6916, "step": 216 }, { "epoch": 0.07429916541836079, "grad_norm": 0.08715063333511353, "learning_rate": 4.6284246575342466e-05, "loss": 1.636, "step": 217 }, { "epoch": 0.07464155788572652, "grad_norm": 0.08656910061836243, "learning_rate": 4.626712328767123e-05, "loss": 1.655, "step": 218 }, { "epoch": 0.07498395035309223, "grad_norm": 0.10153871029615402, "learning_rate": 4.6250000000000006e-05, "loss": 1.7106, "step": 219 }, { "epoch": 0.07532634282045796, "grad_norm": 0.07660843431949615, "learning_rate": 4.623287671232877e-05, "loss": 1.6905, "step": 220 }, { "epoch": 0.07566873528782367, "grad_norm": 0.0857987105846405, "learning_rate": 4.621575342465754e-05, "loss": 1.7161, "step": 221 }, { "epoch": 0.07601112775518938, "grad_norm": 0.08070608228445053, "learning_rate": 4.6198630136986306e-05, "loss": 1.6952, "step": 222 }, { "epoch": 0.07635352022255511, "grad_norm": 0.08411615341901779, "learning_rate": 4.618150684931507e-05, "loss": 1.6654, "step": 223 }, { "epoch": 0.07669591268992082, "grad_norm": 0.0948730930685997, "learning_rate": 4.616438356164384e-05, "loss": 1.7109, "step": 224 }, { "epoch": 0.07703830515728655, "grad_norm": 0.07998296618461609, "learning_rate": 4.6147260273972605e-05, "loss": 1.6495, "step": 225 }, { "epoch": 0.07738069762465226, "grad_norm": 0.08145549148321152, "learning_rate": 4.613013698630137e-05, "loss": 1.6614, "step": 226 }, { "epoch": 0.07772309009201797, "grad_norm": 0.08269551396369934, "learning_rate": 4.611301369863014e-05, "loss": 1.6853, "step": 227 }, { "epoch": 0.0780654825593837, "grad_norm": 0.08959392458200455, "learning_rate": 4.609589041095891e-05, "loss": 1.6599, "step": 228 }, { "epoch": 0.07840787502674941, "grad_norm": 0.0955914631485939, "learning_rate": 4.607876712328768e-05, "loss": 1.6579, "step": 229 }, { "epoch": 0.07875026749411512, "grad_norm": 0.08753375709056854, "learning_rate": 4.6061643835616445e-05, "loss": 1.5608, "step": 230 }, { "epoch": 0.07909265996148085, "grad_norm": 0.08423955738544464, "learning_rate": 4.604452054794521e-05, "loss": 1.6967, "step": 231 }, { "epoch": 0.07943505242884656, "grad_norm": 0.08246470242738724, "learning_rate": 4.602739726027398e-05, "loss": 1.6432, "step": 232 }, { "epoch": 0.07977744489621229, "grad_norm": 0.09250756353139877, "learning_rate": 4.6010273972602744e-05, "loss": 1.714, "step": 233 }, { "epoch": 0.080119837363578, "grad_norm": 0.08269499242305756, "learning_rate": 4.599315068493151e-05, "loss": 1.6738, "step": 234 }, { "epoch": 0.08046222983094371, "grad_norm": 0.1003844365477562, "learning_rate": 4.597602739726028e-05, "loss": 1.6809, "step": 235 }, { "epoch": 0.08080462229830944, "grad_norm": 0.08862510323524475, "learning_rate": 4.5958904109589044e-05, "loss": 1.7032, "step": 236 }, { "epoch": 0.08114701476567515, "grad_norm": 0.07860508561134338, "learning_rate": 4.594178082191781e-05, "loss": 1.6185, "step": 237 }, { "epoch": 0.08148940723304088, "grad_norm": 0.0885242223739624, "learning_rate": 4.592465753424658e-05, "loss": 1.6109, "step": 238 }, { "epoch": 0.08183179970040659, "grad_norm": 0.08526121824979782, "learning_rate": 4.590753424657534e-05, "loss": 1.6212, "step": 239 }, { "epoch": 0.0821741921677723, "grad_norm": 0.09021202474832535, "learning_rate": 4.589041095890411e-05, "loss": 1.7193, "step": 240 }, { "epoch": 0.08251658463513803, "grad_norm": 0.08622259646654129, "learning_rate": 4.5873287671232876e-05, "loss": 1.7525, "step": 241 }, { "epoch": 0.08285897710250374, "grad_norm": 0.08443211019039154, "learning_rate": 4.585616438356164e-05, "loss": 1.6243, "step": 242 }, { "epoch": 0.08320136956986947, "grad_norm": 0.07668071240186691, "learning_rate": 4.583904109589041e-05, "loss": 1.7399, "step": 243 }, { "epoch": 0.08354376203723518, "grad_norm": 0.0885561928153038, "learning_rate": 4.5821917808219176e-05, "loss": 1.6334, "step": 244 }, { "epoch": 0.0838861545046009, "grad_norm": 0.0879499539732933, "learning_rate": 4.580479452054795e-05, "loss": 1.7857, "step": 245 }, { "epoch": 0.08422854697196662, "grad_norm": 0.08238277584314346, "learning_rate": 4.5787671232876716e-05, "loss": 1.6515, "step": 246 }, { "epoch": 0.08457093943933233, "grad_norm": 0.08999462425708771, "learning_rate": 4.577054794520548e-05, "loss": 1.6963, "step": 247 }, { "epoch": 0.08491333190669806, "grad_norm": 0.09063596278429031, "learning_rate": 4.575342465753425e-05, "loss": 1.7948, "step": 248 }, { "epoch": 0.08525572437406377, "grad_norm": 0.08319511264562607, "learning_rate": 4.5736301369863015e-05, "loss": 1.6624, "step": 249 }, { "epoch": 0.08559811684142948, "grad_norm": 0.08494985103607178, "learning_rate": 4.571917808219178e-05, "loss": 1.5952, "step": 250 }, { "epoch": 0.08594050930879521, "grad_norm": 0.08327984064817429, "learning_rate": 4.570205479452055e-05, "loss": 1.5285, "step": 251 }, { "epoch": 0.08628290177616092, "grad_norm": 0.08863556385040283, "learning_rate": 4.5684931506849315e-05, "loss": 1.5871, "step": 252 }, { "epoch": 0.08662529424352665, "grad_norm": 0.09314607828855515, "learning_rate": 4.566780821917808e-05, "loss": 1.6743, "step": 253 }, { "epoch": 0.08696768671089236, "grad_norm": 0.08039966970682144, "learning_rate": 4.5650684931506855e-05, "loss": 1.6403, "step": 254 }, { "epoch": 0.08731007917825807, "grad_norm": 0.09095503389835358, "learning_rate": 4.563356164383562e-05, "loss": 1.6801, "step": 255 }, { "epoch": 0.0876524716456238, "grad_norm": 0.08547984063625336, "learning_rate": 4.561643835616439e-05, "loss": 1.7187, "step": 256 }, { "epoch": 0.08799486411298951, "grad_norm": 0.08756761997938156, "learning_rate": 4.5599315068493154e-05, "loss": 1.6983, "step": 257 }, { "epoch": 0.08833725658035524, "grad_norm": 0.08615265041589737, "learning_rate": 4.558219178082192e-05, "loss": 1.6903, "step": 258 }, { "epoch": 0.08867964904772095, "grad_norm": 0.09364872425794601, "learning_rate": 4.556506849315069e-05, "loss": 1.6351, "step": 259 }, { "epoch": 0.08902204151508666, "grad_norm": 0.09238329529762268, "learning_rate": 4.5547945205479454e-05, "loss": 1.693, "step": 260 }, { "epoch": 0.08936443398245239, "grad_norm": 0.08715078234672546, "learning_rate": 4.553082191780822e-05, "loss": 1.5817, "step": 261 }, { "epoch": 0.0897068264498181, "grad_norm": 0.09295285493135452, "learning_rate": 4.551369863013699e-05, "loss": 1.7116, "step": 262 }, { "epoch": 0.09004921891718382, "grad_norm": 0.08881011605262756, "learning_rate": 4.549657534246576e-05, "loss": 1.713, "step": 263 }, { "epoch": 0.09039161138454954, "grad_norm": 0.10392383486032486, "learning_rate": 4.547945205479453e-05, "loss": 1.6282, "step": 264 }, { "epoch": 0.09073400385191525, "grad_norm": 0.09665607661008835, "learning_rate": 4.546232876712329e-05, "loss": 1.7091, "step": 265 }, { "epoch": 0.09107639631928098, "grad_norm": 0.09402202069759369, "learning_rate": 4.544520547945206e-05, "loss": 1.6034, "step": 266 }, { "epoch": 0.09141878878664669, "grad_norm": 0.09518489241600037, "learning_rate": 4.5428082191780826e-05, "loss": 1.5805, "step": 267 }, { "epoch": 0.0917611812540124, "grad_norm": 0.0898948684334755, "learning_rate": 4.541095890410959e-05, "loss": 1.6471, "step": 268 }, { "epoch": 0.09210357372137813, "grad_norm": 0.09068996459245682, "learning_rate": 4.539383561643836e-05, "loss": 1.6688, "step": 269 }, { "epoch": 0.09244596618874384, "grad_norm": 0.0924811139702797, "learning_rate": 4.5376712328767126e-05, "loss": 1.6209, "step": 270 }, { "epoch": 0.09278835865610957, "grad_norm": 0.10105038434267044, "learning_rate": 4.535958904109589e-05, "loss": 1.6373, "step": 271 }, { "epoch": 0.09313075112347528, "grad_norm": 0.10043451935052872, "learning_rate": 4.534246575342466e-05, "loss": 1.8096, "step": 272 }, { "epoch": 0.093473143590841, "grad_norm": 0.08879895508289337, "learning_rate": 4.5325342465753425e-05, "loss": 1.5759, "step": 273 }, { "epoch": 0.09381553605820672, "grad_norm": 0.0850725769996643, "learning_rate": 4.530821917808219e-05, "loss": 1.6032, "step": 274 }, { "epoch": 0.09415792852557243, "grad_norm": 0.10312818735837936, "learning_rate": 4.529109589041096e-05, "loss": 1.71, "step": 275 }, { "epoch": 0.09450032099293816, "grad_norm": 0.08802473545074463, "learning_rate": 4.5273972602739725e-05, "loss": 1.7072, "step": 276 }, { "epoch": 0.09484271346030387, "grad_norm": 0.08519906550645828, "learning_rate": 4.525684931506849e-05, "loss": 1.6832, "step": 277 }, { "epoch": 0.09518510592766959, "grad_norm": 0.10426424443721771, "learning_rate": 4.523972602739726e-05, "loss": 1.758, "step": 278 }, { "epoch": 0.09552749839503531, "grad_norm": 0.09052019566297531, "learning_rate": 4.5222602739726025e-05, "loss": 1.6818, "step": 279 }, { "epoch": 0.09586989086240102, "grad_norm": 0.08638709783554077, "learning_rate": 4.520547945205479e-05, "loss": 1.6661, "step": 280 }, { "epoch": 0.09621228332976675, "grad_norm": 0.09282395243644714, "learning_rate": 4.5188356164383564e-05, "loss": 1.6368, "step": 281 }, { "epoch": 0.09655467579713246, "grad_norm": 0.09872081130743027, "learning_rate": 4.517123287671233e-05, "loss": 1.6861, "step": 282 }, { "epoch": 0.09689706826449818, "grad_norm": 0.09882810711860657, "learning_rate": 4.51541095890411e-05, "loss": 1.6334, "step": 283 }, { "epoch": 0.0972394607318639, "grad_norm": 0.09081804752349854, "learning_rate": 4.5136986301369864e-05, "loss": 1.7453, "step": 284 }, { "epoch": 0.09758185319922962, "grad_norm": 0.08498898893594742, "learning_rate": 4.511986301369863e-05, "loss": 1.6548, "step": 285 }, { "epoch": 0.09792424566659534, "grad_norm": 0.09270717948675156, "learning_rate": 4.51027397260274e-05, "loss": 1.7379, "step": 286 }, { "epoch": 0.09826663813396105, "grad_norm": 0.09044499695301056, "learning_rate": 4.5085616438356163e-05, "loss": 1.6376, "step": 287 }, { "epoch": 0.09860903060132677, "grad_norm": 0.09623486548662186, "learning_rate": 4.506849315068493e-05, "loss": 1.6735, "step": 288 }, { "epoch": 0.09895142306869249, "grad_norm": 0.08190672844648361, "learning_rate": 4.5051369863013696e-05, "loss": 1.6155, "step": 289 }, { "epoch": 0.0992938155360582, "grad_norm": 0.09620966762304306, "learning_rate": 4.503424657534247e-05, "loss": 1.724, "step": 290 }, { "epoch": 0.09963620800342392, "grad_norm": 0.09808292984962463, "learning_rate": 4.5017123287671236e-05, "loss": 1.7142, "step": 291 }, { "epoch": 0.09997860047078964, "grad_norm": 0.0936235710978508, "learning_rate": 4.5e-05, "loss": 1.5922, "step": 292 }, { "epoch": 0.10032099293815536, "grad_norm": 0.0987422987818718, "learning_rate": 4.498287671232877e-05, "loss": 1.7061, "step": 293 }, { "epoch": 0.10066338540552108, "grad_norm": 0.08598805963993073, "learning_rate": 4.4965753424657536e-05, "loss": 1.6773, "step": 294 }, { "epoch": 0.1010057778728868, "grad_norm": 0.09331586956977844, "learning_rate": 4.49486301369863e-05, "loss": 1.5688, "step": 295 }, { "epoch": 0.10134817034025251, "grad_norm": 0.09061907231807709, "learning_rate": 4.493150684931507e-05, "loss": 1.7069, "step": 296 }, { "epoch": 0.10169056280761823, "grad_norm": 0.09701483696699142, "learning_rate": 4.4914383561643835e-05, "loss": 1.6153, "step": 297 }, { "epoch": 0.10203295527498395, "grad_norm": 0.09092317521572113, "learning_rate": 4.48972602739726e-05, "loss": 1.7165, "step": 298 }, { "epoch": 0.10237534774234967, "grad_norm": 0.08764811605215073, "learning_rate": 4.4880136986301375e-05, "loss": 1.6857, "step": 299 }, { "epoch": 0.10271774020971539, "grad_norm": 0.08957915008068085, "learning_rate": 4.486301369863014e-05, "loss": 1.6159, "step": 300 }, { "epoch": 0.1030601326770811, "grad_norm": 0.09311167150735855, "learning_rate": 4.484589041095891e-05, "loss": 1.5835, "step": 301 }, { "epoch": 0.10340252514444682, "grad_norm": 0.08929101377725601, "learning_rate": 4.4828767123287675e-05, "loss": 1.6434, "step": 302 }, { "epoch": 0.10374491761181254, "grad_norm": 0.09887774288654327, "learning_rate": 4.481164383561644e-05, "loss": 1.4699, "step": 303 }, { "epoch": 0.10408731007917826, "grad_norm": 0.0969981700181961, "learning_rate": 4.479452054794521e-05, "loss": 1.6878, "step": 304 }, { "epoch": 0.10442970254654398, "grad_norm": 0.09757175296545029, "learning_rate": 4.4777397260273974e-05, "loss": 1.7392, "step": 305 }, { "epoch": 0.10477209501390969, "grad_norm": 0.09291095286607742, "learning_rate": 4.476027397260274e-05, "loss": 1.7251, "step": 306 }, { "epoch": 0.10511448748127541, "grad_norm": 0.10909043252468109, "learning_rate": 4.474315068493151e-05, "loss": 1.628, "step": 307 }, { "epoch": 0.10545687994864113, "grad_norm": 0.11149530112743378, "learning_rate": 4.472602739726028e-05, "loss": 1.6754, "step": 308 }, { "epoch": 0.10579927241600685, "grad_norm": 0.101134292781353, "learning_rate": 4.470890410958905e-05, "loss": 1.6537, "step": 309 }, { "epoch": 0.10614166488337257, "grad_norm": 0.09556742012500763, "learning_rate": 4.4691780821917814e-05, "loss": 1.6843, "step": 310 }, { "epoch": 0.10648405735073828, "grad_norm": 0.10969960689544678, "learning_rate": 4.467465753424658e-05, "loss": 1.5014, "step": 311 }, { "epoch": 0.106826449818104, "grad_norm": 0.14613740146160126, "learning_rate": 4.465753424657535e-05, "loss": 1.4896, "step": 312 }, { "epoch": 0.10716884228546972, "grad_norm": 0.09379447251558304, "learning_rate": 4.464041095890411e-05, "loss": 1.6545, "step": 313 }, { "epoch": 0.10751123475283544, "grad_norm": 0.09495531767606735, "learning_rate": 4.462328767123288e-05, "loss": 1.5814, "step": 314 }, { "epoch": 0.10785362722020116, "grad_norm": 0.09832298010587692, "learning_rate": 4.4606164383561646e-05, "loss": 1.6615, "step": 315 }, { "epoch": 0.10819601968756687, "grad_norm": 0.10543192923069, "learning_rate": 4.458904109589041e-05, "loss": 1.6571, "step": 316 }, { "epoch": 0.1085384121549326, "grad_norm": 0.09127697348594666, "learning_rate": 4.457191780821918e-05, "loss": 1.622, "step": 317 }, { "epoch": 0.10888080462229831, "grad_norm": 0.10988138616085052, "learning_rate": 4.4554794520547946e-05, "loss": 1.7241, "step": 318 }, { "epoch": 0.10922319708966403, "grad_norm": 0.0973123237490654, "learning_rate": 4.453767123287671e-05, "loss": 1.7198, "step": 319 }, { "epoch": 0.10956558955702975, "grad_norm": 0.09995874017477036, "learning_rate": 4.452054794520548e-05, "loss": 1.6833, "step": 320 }, { "epoch": 0.10990798202439546, "grad_norm": 0.10260048508644104, "learning_rate": 4.4503424657534246e-05, "loss": 1.7642, "step": 321 }, { "epoch": 0.11025037449176119, "grad_norm": 0.10674654692411423, "learning_rate": 4.448630136986301e-05, "loss": 1.7657, "step": 322 }, { "epoch": 0.1105927669591269, "grad_norm": 0.09955170005559921, "learning_rate": 4.446917808219178e-05, "loss": 1.661, "step": 323 }, { "epoch": 0.11093515942649261, "grad_norm": 0.10323288291692734, "learning_rate": 4.4452054794520545e-05, "loss": 1.6198, "step": 324 }, { "epoch": 0.11127755189385834, "grad_norm": 0.09374504536390305, "learning_rate": 4.443493150684932e-05, "loss": 1.7037, "step": 325 }, { "epoch": 0.11161994436122405, "grad_norm": 0.10133863240480423, "learning_rate": 4.4417808219178085e-05, "loss": 1.7293, "step": 326 }, { "epoch": 0.11196233682858978, "grad_norm": 0.09285712242126465, "learning_rate": 4.440068493150685e-05, "loss": 1.7528, "step": 327 }, { "epoch": 0.11230472929595549, "grad_norm": 0.09209325164556503, "learning_rate": 4.438356164383562e-05, "loss": 1.6546, "step": 328 }, { "epoch": 0.1126471217633212, "grad_norm": 0.09417708963155746, "learning_rate": 4.4366438356164384e-05, "loss": 1.5238, "step": 329 }, { "epoch": 0.11298951423068693, "grad_norm": 0.09967464953660965, "learning_rate": 4.434931506849315e-05, "loss": 1.6428, "step": 330 }, { "epoch": 0.11333190669805264, "grad_norm": 0.09976007044315338, "learning_rate": 4.433219178082192e-05, "loss": 1.6412, "step": 331 }, { "epoch": 0.11367429916541837, "grad_norm": 0.10043544322252274, "learning_rate": 4.4315068493150684e-05, "loss": 1.7054, "step": 332 }, { "epoch": 0.11401669163278408, "grad_norm": 0.10294787585735321, "learning_rate": 4.429794520547945e-05, "loss": 1.6408, "step": 333 }, { "epoch": 0.11435908410014979, "grad_norm": 0.10531903803348541, "learning_rate": 4.4280821917808224e-05, "loss": 1.7062, "step": 334 }, { "epoch": 0.11470147656751552, "grad_norm": 0.09277693927288055, "learning_rate": 4.426369863013699e-05, "loss": 1.5823, "step": 335 }, { "epoch": 0.11504386903488123, "grad_norm": 0.10990926623344421, "learning_rate": 4.424657534246576e-05, "loss": 1.7311, "step": 336 }, { "epoch": 0.11538626150224696, "grad_norm": 0.10486804693937302, "learning_rate": 4.4229452054794523e-05, "loss": 1.6625, "step": 337 }, { "epoch": 0.11572865396961267, "grad_norm": 0.10076644271612167, "learning_rate": 4.421232876712329e-05, "loss": 1.6944, "step": 338 }, { "epoch": 0.11607104643697838, "grad_norm": 0.11350435763597488, "learning_rate": 4.4195205479452056e-05, "loss": 1.7068, "step": 339 }, { "epoch": 0.11641343890434411, "grad_norm": 0.10047660022974014, "learning_rate": 4.417808219178082e-05, "loss": 1.5568, "step": 340 }, { "epoch": 0.11675583137170982, "grad_norm": 0.0972144603729248, "learning_rate": 4.416095890410959e-05, "loss": 1.7985, "step": 341 }, { "epoch": 0.11709822383907555, "grad_norm": 0.1072525754570961, "learning_rate": 4.4143835616438356e-05, "loss": 1.57, "step": 342 }, { "epoch": 0.11744061630644126, "grad_norm": 0.10910877585411072, "learning_rate": 4.412671232876713e-05, "loss": 1.6297, "step": 343 }, { "epoch": 0.11778300877380697, "grad_norm": 0.10190355032682419, "learning_rate": 4.4109589041095896e-05, "loss": 1.6841, "step": 344 }, { "epoch": 0.1181254012411727, "grad_norm": 0.10104071348905563, "learning_rate": 4.409246575342466e-05, "loss": 1.8417, "step": 345 }, { "epoch": 0.11846779370853841, "grad_norm": 0.10030154138803482, "learning_rate": 4.407534246575343e-05, "loss": 1.7021, "step": 346 }, { "epoch": 0.11881018617590414, "grad_norm": 0.10112757235765457, "learning_rate": 4.4058219178082195e-05, "loss": 1.6925, "step": 347 }, { "epoch": 0.11915257864326985, "grad_norm": 0.11009068042039871, "learning_rate": 4.404109589041096e-05, "loss": 1.6566, "step": 348 }, { "epoch": 0.11949497111063556, "grad_norm": 0.11708416789770126, "learning_rate": 4.402397260273973e-05, "loss": 1.6176, "step": 349 }, { "epoch": 0.11983736357800129, "grad_norm": 0.11151184141635895, "learning_rate": 4.4006849315068495e-05, "loss": 1.7221, "step": 350 }, { "epoch": 0.120179756045367, "grad_norm": 0.10328155010938644, "learning_rate": 4.398972602739726e-05, "loss": 1.639, "step": 351 }, { "epoch": 0.12052214851273271, "grad_norm": 0.10615744441747665, "learning_rate": 4.3972602739726035e-05, "loss": 1.6787, "step": 352 }, { "epoch": 0.12086454098009844, "grad_norm": 0.11191114783287048, "learning_rate": 4.39554794520548e-05, "loss": 1.6544, "step": 353 }, { "epoch": 0.12120693344746415, "grad_norm": 0.09757652878761292, "learning_rate": 4.393835616438357e-05, "loss": 1.621, "step": 354 }, { "epoch": 0.12154932591482988, "grad_norm": 0.10832565277814865, "learning_rate": 4.3921232876712334e-05, "loss": 1.6643, "step": 355 }, { "epoch": 0.12189171838219559, "grad_norm": 0.09824807941913605, "learning_rate": 4.39041095890411e-05, "loss": 1.6863, "step": 356 }, { "epoch": 0.1222341108495613, "grad_norm": 0.10504347831010818, "learning_rate": 4.388698630136987e-05, "loss": 1.6213, "step": 357 }, { "epoch": 0.12257650331692703, "grad_norm": 0.1049027293920517, "learning_rate": 4.3869863013698634e-05, "loss": 1.7828, "step": 358 }, { "epoch": 0.12291889578429274, "grad_norm": 0.09898854047060013, "learning_rate": 4.38527397260274e-05, "loss": 1.7378, "step": 359 }, { "epoch": 0.12326128825165847, "grad_norm": 0.10436498373746872, "learning_rate": 4.383561643835617e-05, "loss": 1.6815, "step": 360 }, { "epoch": 0.12360368071902418, "grad_norm": 0.10302499681711197, "learning_rate": 4.3818493150684933e-05, "loss": 1.7473, "step": 361 }, { "epoch": 0.1239460731863899, "grad_norm": 0.10897395014762878, "learning_rate": 4.38013698630137e-05, "loss": 1.6004, "step": 362 }, { "epoch": 0.12428846565375562, "grad_norm": 0.1049656867980957, "learning_rate": 4.3784246575342467e-05, "loss": 1.5981, "step": 363 }, { "epoch": 0.12463085812112133, "grad_norm": 0.09728703647851944, "learning_rate": 4.376712328767123e-05, "loss": 1.7592, "step": 364 }, { "epoch": 0.12497325058848706, "grad_norm": 0.10468621551990509, "learning_rate": 4.375e-05, "loss": 1.6247, "step": 365 }, { "epoch": 0.12531564305585277, "grad_norm": 0.0922587662935257, "learning_rate": 4.3732876712328766e-05, "loss": 1.5586, "step": 366 }, { "epoch": 0.1256580355232185, "grad_norm": 0.10754860192537308, "learning_rate": 4.371575342465753e-05, "loss": 1.7226, "step": 367 }, { "epoch": 0.1260004279905842, "grad_norm": 0.11063621938228607, "learning_rate": 4.36986301369863e-05, "loss": 1.5676, "step": 368 }, { "epoch": 0.12634282045794992, "grad_norm": 0.10455215722322464, "learning_rate": 4.3681506849315066e-05, "loss": 1.6953, "step": 369 }, { "epoch": 0.12668521292531565, "grad_norm": 0.09708420932292938, "learning_rate": 4.366438356164384e-05, "loss": 1.6613, "step": 370 }, { "epoch": 0.12702760539268135, "grad_norm": 0.10349924117326736, "learning_rate": 4.3647260273972605e-05, "loss": 1.6758, "step": 371 }, { "epoch": 0.12736999786004707, "grad_norm": 0.10797405242919922, "learning_rate": 4.363013698630137e-05, "loss": 1.7036, "step": 372 }, { "epoch": 0.1277123903274128, "grad_norm": 0.10180880129337311, "learning_rate": 4.361301369863014e-05, "loss": 1.632, "step": 373 }, { "epoch": 0.12805478279477853, "grad_norm": 0.10134158283472061, "learning_rate": 4.3595890410958905e-05, "loss": 1.6797, "step": 374 }, { "epoch": 0.12839717526214423, "grad_norm": 0.102899469435215, "learning_rate": 4.357876712328767e-05, "loss": 1.696, "step": 375 }, { "epoch": 0.12873956772950995, "grad_norm": 0.10969360917806625, "learning_rate": 4.356164383561644e-05, "loss": 1.6904, "step": 376 }, { "epoch": 0.12908196019687568, "grad_norm": 0.10944566130638123, "learning_rate": 4.3544520547945205e-05, "loss": 1.5783, "step": 377 }, { "epoch": 0.12942435266424138, "grad_norm": 0.10464665293693542, "learning_rate": 4.352739726027397e-05, "loss": 1.6811, "step": 378 }, { "epoch": 0.1297667451316071, "grad_norm": 0.11274212598800659, "learning_rate": 4.3510273972602744e-05, "loss": 1.6716, "step": 379 }, { "epoch": 0.13010913759897283, "grad_norm": 0.1061227023601532, "learning_rate": 4.349315068493151e-05, "loss": 1.5883, "step": 380 }, { "epoch": 0.13045153006633853, "grad_norm": 0.1062810942530632, "learning_rate": 4.347602739726028e-05, "loss": 1.6663, "step": 381 }, { "epoch": 0.13079392253370425, "grad_norm": 0.09908968955278397, "learning_rate": 4.3458904109589044e-05, "loss": 1.6649, "step": 382 }, { "epoch": 0.13113631500106998, "grad_norm": 0.10394290834665298, "learning_rate": 4.344178082191781e-05, "loss": 1.6229, "step": 383 }, { "epoch": 0.1314787074684357, "grad_norm": 0.10818452388048172, "learning_rate": 4.342465753424658e-05, "loss": 1.6679, "step": 384 }, { "epoch": 0.1318210999358014, "grad_norm": 0.10522853583097458, "learning_rate": 4.3407534246575344e-05, "loss": 1.6792, "step": 385 }, { "epoch": 0.13216349240316713, "grad_norm": 0.10971163213253021, "learning_rate": 4.339041095890411e-05, "loss": 1.5863, "step": 386 }, { "epoch": 0.13250588487053286, "grad_norm": 0.15135875344276428, "learning_rate": 4.3373287671232877e-05, "loss": 1.7192, "step": 387 }, { "epoch": 0.13284827733789856, "grad_norm": 0.12090080976486206, "learning_rate": 4.335616438356165e-05, "loss": 1.6904, "step": 388 }, { "epoch": 0.13319066980526428, "grad_norm": 0.10476568341255188, "learning_rate": 4.3339041095890416e-05, "loss": 1.6107, "step": 389 }, { "epoch": 0.13353306227263, "grad_norm": 0.10649127513170242, "learning_rate": 4.332191780821918e-05, "loss": 1.6727, "step": 390 }, { "epoch": 0.1338754547399957, "grad_norm": 0.11348778009414673, "learning_rate": 4.330479452054795e-05, "loss": 1.5948, "step": 391 }, { "epoch": 0.13421784720736143, "grad_norm": 0.10558129847049713, "learning_rate": 4.3287671232876716e-05, "loss": 1.6016, "step": 392 }, { "epoch": 0.13456023967472716, "grad_norm": 0.10092508047819138, "learning_rate": 4.327054794520548e-05, "loss": 1.5849, "step": 393 }, { "epoch": 0.1349026321420929, "grad_norm": 0.10230904817581177, "learning_rate": 4.325342465753425e-05, "loss": 1.6677, "step": 394 }, { "epoch": 0.1352450246094586, "grad_norm": 0.1074233204126358, "learning_rate": 4.3236301369863016e-05, "loss": 1.705, "step": 395 }, { "epoch": 0.1355874170768243, "grad_norm": 0.10966385155916214, "learning_rate": 4.321917808219178e-05, "loss": 1.5258, "step": 396 }, { "epoch": 0.13592980954419004, "grad_norm": 0.1151796206831932, "learning_rate": 4.3202054794520555e-05, "loss": 1.6916, "step": 397 }, { "epoch": 0.13627220201155574, "grad_norm": 0.12243807315826416, "learning_rate": 4.318493150684932e-05, "loss": 1.6288, "step": 398 }, { "epoch": 0.13661459447892146, "grad_norm": 0.1118185892701149, "learning_rate": 4.316780821917809e-05, "loss": 1.739, "step": 399 }, { "epoch": 0.1369569869462872, "grad_norm": 0.11389683187007904, "learning_rate": 4.3150684931506855e-05, "loss": 1.6698, "step": 400 }, { "epoch": 0.1372993794136529, "grad_norm": 0.10690728574991226, "learning_rate": 4.313356164383562e-05, "loss": 1.6937, "step": 401 }, { "epoch": 0.13764177188101862, "grad_norm": 0.10303880274295807, "learning_rate": 4.311643835616439e-05, "loss": 1.6637, "step": 402 }, { "epoch": 0.13798416434838434, "grad_norm": 0.11129255592823029, "learning_rate": 4.3099315068493154e-05, "loss": 1.6904, "step": 403 }, { "epoch": 0.13832655681575004, "grad_norm": 0.1031140685081482, "learning_rate": 4.308219178082192e-05, "loss": 1.6538, "step": 404 }, { "epoch": 0.13866894928311577, "grad_norm": 0.10953545570373535, "learning_rate": 4.306506849315069e-05, "loss": 1.5768, "step": 405 }, { "epoch": 0.1390113417504815, "grad_norm": 0.10498495399951935, "learning_rate": 4.3047945205479454e-05, "loss": 1.604, "step": 406 }, { "epoch": 0.13935373421784722, "grad_norm": 0.10904544591903687, "learning_rate": 4.303082191780822e-05, "loss": 1.7042, "step": 407 }, { "epoch": 0.13969612668521292, "grad_norm": 0.10780541598796844, "learning_rate": 4.301369863013699e-05, "loss": 1.6117, "step": 408 }, { "epoch": 0.14003851915257864, "grad_norm": 0.13554547727108002, "learning_rate": 4.2996575342465754e-05, "loss": 1.5949, "step": 409 }, { "epoch": 0.14038091161994437, "grad_norm": 0.15092478692531586, "learning_rate": 4.297945205479452e-05, "loss": 1.5362, "step": 410 }, { "epoch": 0.14072330408731007, "grad_norm": 0.1034151166677475, "learning_rate": 4.296232876712329e-05, "loss": 1.5933, "step": 411 }, { "epoch": 0.1410656965546758, "grad_norm": 0.12007016688585281, "learning_rate": 4.294520547945205e-05, "loss": 1.6098, "step": 412 }, { "epoch": 0.14140808902204152, "grad_norm": 0.10357314348220825, "learning_rate": 4.292808219178082e-05, "loss": 1.6359, "step": 413 }, { "epoch": 0.14175048148940722, "grad_norm": 0.11694418638944626, "learning_rate": 4.291095890410959e-05, "loss": 1.6284, "step": 414 }, { "epoch": 0.14209287395677295, "grad_norm": 0.11604868620634079, "learning_rate": 4.289383561643836e-05, "loss": 1.5684, "step": 415 }, { "epoch": 0.14243526642413867, "grad_norm": 0.11260902136564255, "learning_rate": 4.2876712328767126e-05, "loss": 1.6057, "step": 416 }, { "epoch": 0.1427776588915044, "grad_norm": 0.10319408029317856, "learning_rate": 4.285958904109589e-05, "loss": 1.6988, "step": 417 }, { "epoch": 0.1431200513588701, "grad_norm": 0.1226063221693039, "learning_rate": 4.284246575342466e-05, "loss": 1.6787, "step": 418 }, { "epoch": 0.14346244382623582, "grad_norm": 0.12031389772891998, "learning_rate": 4.2825342465753426e-05, "loss": 1.7506, "step": 419 }, { "epoch": 0.14380483629360155, "grad_norm": 0.11423229426145554, "learning_rate": 4.280821917808219e-05, "loss": 1.5643, "step": 420 }, { "epoch": 0.14414722876096725, "grad_norm": 0.11349527537822723, "learning_rate": 4.279109589041096e-05, "loss": 1.7452, "step": 421 }, { "epoch": 0.14448962122833298, "grad_norm": 0.1065075471997261, "learning_rate": 4.2773972602739725e-05, "loss": 1.538, "step": 422 }, { "epoch": 0.1448320136956987, "grad_norm": 0.11257865279912949, "learning_rate": 4.27568493150685e-05, "loss": 1.6, "step": 423 }, { "epoch": 0.1451744061630644, "grad_norm": 0.10548405349254608, "learning_rate": 4.2739726027397265e-05, "loss": 1.6888, "step": 424 }, { "epoch": 0.14551679863043013, "grad_norm": 0.11802598834037781, "learning_rate": 4.272260273972603e-05, "loss": 1.6167, "step": 425 }, { "epoch": 0.14585919109779585, "grad_norm": 0.11242114752531052, "learning_rate": 4.27054794520548e-05, "loss": 1.6771, "step": 426 }, { "epoch": 0.14620158356516158, "grad_norm": 0.11344623565673828, "learning_rate": 4.2688356164383565e-05, "loss": 1.6544, "step": 427 }, { "epoch": 0.14654397603252728, "grad_norm": 0.13153693079948425, "learning_rate": 4.267123287671233e-05, "loss": 1.6525, "step": 428 }, { "epoch": 0.146886368499893, "grad_norm": 0.10986490547657013, "learning_rate": 4.26541095890411e-05, "loss": 1.67, "step": 429 }, { "epoch": 0.14722876096725873, "grad_norm": 0.11086393147706985, "learning_rate": 4.2636986301369864e-05, "loss": 1.6242, "step": 430 }, { "epoch": 0.14757115343462443, "grad_norm": 0.12074118107557297, "learning_rate": 4.261986301369863e-05, "loss": 1.6376, "step": 431 }, { "epoch": 0.14791354590199016, "grad_norm": 0.11654341220855713, "learning_rate": 4.2602739726027404e-05, "loss": 1.5881, "step": 432 }, { "epoch": 0.14825593836935588, "grad_norm": 0.11425840854644775, "learning_rate": 4.258561643835617e-05, "loss": 1.6048, "step": 433 }, { "epoch": 0.14859833083672158, "grad_norm": 0.12574529647827148, "learning_rate": 4.256849315068494e-05, "loss": 1.6659, "step": 434 }, { "epoch": 0.1489407233040873, "grad_norm": 0.11223893612623215, "learning_rate": 4.2551369863013703e-05, "loss": 1.6767, "step": 435 }, { "epoch": 0.14928311577145303, "grad_norm": 0.12389078736305237, "learning_rate": 4.253424657534247e-05, "loss": 1.7583, "step": 436 }, { "epoch": 0.14962550823881873, "grad_norm": 0.11372170597314835, "learning_rate": 4.2517123287671237e-05, "loss": 1.5835, "step": 437 }, { "epoch": 0.14996790070618446, "grad_norm": 0.12239832431077957, "learning_rate": 4.25e-05, "loss": 1.5392, "step": 438 }, { "epoch": 0.15031029317355019, "grad_norm": 0.1267910897731781, "learning_rate": 4.248287671232877e-05, "loss": 1.7566, "step": 439 }, { "epoch": 0.1506526856409159, "grad_norm": 0.12650780379772186, "learning_rate": 4.2465753424657536e-05, "loss": 1.6374, "step": 440 }, { "epoch": 0.1509950781082816, "grad_norm": 0.11330632120370865, "learning_rate": 4.24486301369863e-05, "loss": 1.738, "step": 441 }, { "epoch": 0.15133747057564734, "grad_norm": 0.11088535934686661, "learning_rate": 4.243150684931507e-05, "loss": 1.6668, "step": 442 }, { "epoch": 0.15167986304301306, "grad_norm": 0.12631316483020782, "learning_rate": 4.2414383561643836e-05, "loss": 1.6517, "step": 443 }, { "epoch": 0.15202225551037876, "grad_norm": 0.12414132803678513, "learning_rate": 4.23972602739726e-05, "loss": 1.5884, "step": 444 }, { "epoch": 0.1523646479777445, "grad_norm": 0.1275978833436966, "learning_rate": 4.238013698630137e-05, "loss": 1.7802, "step": 445 }, { "epoch": 0.15270704044511021, "grad_norm": 0.11694102734327316, "learning_rate": 4.2363013698630135e-05, "loss": 1.7453, "step": 446 }, { "epoch": 0.1530494329124759, "grad_norm": 0.10930205136537552, "learning_rate": 4.23458904109589e-05, "loss": 1.6781, "step": 447 }, { "epoch": 0.15339182537984164, "grad_norm": 0.11589685827493668, "learning_rate": 4.232876712328767e-05, "loss": 1.7686, "step": 448 }, { "epoch": 0.15373421784720737, "grad_norm": 0.11823911219835281, "learning_rate": 4.2311643835616435e-05, "loss": 1.7531, "step": 449 }, { "epoch": 0.1540766103145731, "grad_norm": 0.11519394814968109, "learning_rate": 4.229452054794521e-05, "loss": 1.7267, "step": 450 }, { "epoch": 0.1544190027819388, "grad_norm": 0.11968246847391129, "learning_rate": 4.2277397260273975e-05, "loss": 1.6074, "step": 451 }, { "epoch": 0.15476139524930452, "grad_norm": 0.12060266733169556, "learning_rate": 4.226027397260274e-05, "loss": 1.6923, "step": 452 }, { "epoch": 0.15510378771667024, "grad_norm": 0.12134602665901184, "learning_rate": 4.224315068493151e-05, "loss": 1.6274, "step": 453 }, { "epoch": 0.15544618018403594, "grad_norm": 0.11395867168903351, "learning_rate": 4.2226027397260274e-05, "loss": 1.691, "step": 454 }, { "epoch": 0.15578857265140167, "grad_norm": 0.12005753070116043, "learning_rate": 4.220890410958904e-05, "loss": 1.5483, "step": 455 }, { "epoch": 0.1561309651187674, "grad_norm": 0.10603690892457962, "learning_rate": 4.219178082191781e-05, "loss": 1.6424, "step": 456 }, { "epoch": 0.1564733575861331, "grad_norm": 0.13582277297973633, "learning_rate": 4.2174657534246574e-05, "loss": 1.6439, "step": 457 }, { "epoch": 0.15681575005349882, "grad_norm": 0.1357034146785736, "learning_rate": 4.215753424657534e-05, "loss": 1.634, "step": 458 }, { "epoch": 0.15715814252086455, "grad_norm": 0.12186066061258316, "learning_rate": 4.2140410958904114e-05, "loss": 1.5968, "step": 459 }, { "epoch": 0.15750053498823025, "grad_norm": 0.11769383400678635, "learning_rate": 4.212328767123288e-05, "loss": 1.6576, "step": 460 }, { "epoch": 0.15784292745559597, "grad_norm": 0.11436070501804352, "learning_rate": 4.2106164383561647e-05, "loss": 1.595, "step": 461 }, { "epoch": 0.1581853199229617, "grad_norm": 0.1281818449497223, "learning_rate": 4.208904109589041e-05, "loss": 1.6109, "step": 462 }, { "epoch": 0.15852771239032742, "grad_norm": 0.11414868384599686, "learning_rate": 4.207191780821918e-05, "loss": 1.6975, "step": 463 }, { "epoch": 0.15887010485769312, "grad_norm": 0.12027088552713394, "learning_rate": 4.2054794520547946e-05, "loss": 1.6413, "step": 464 }, { "epoch": 0.15921249732505885, "grad_norm": 0.1192057877779007, "learning_rate": 4.203767123287671e-05, "loss": 1.6728, "step": 465 }, { "epoch": 0.15955488979242458, "grad_norm": 0.12998990714550018, "learning_rate": 4.202054794520548e-05, "loss": 1.6857, "step": 466 }, { "epoch": 0.15989728225979027, "grad_norm": 0.1170077845454216, "learning_rate": 4.2003424657534246e-05, "loss": 1.5877, "step": 467 }, { "epoch": 0.160239674727156, "grad_norm": 0.12133894860744476, "learning_rate": 4.198630136986302e-05, "loss": 1.6075, "step": 468 }, { "epoch": 0.16058206719452173, "grad_norm": 0.11731059104204178, "learning_rate": 4.1969178082191786e-05, "loss": 1.5889, "step": 469 }, { "epoch": 0.16092445966188743, "grad_norm": 0.1105027124285698, "learning_rate": 4.195205479452055e-05, "loss": 1.6407, "step": 470 }, { "epoch": 0.16126685212925315, "grad_norm": 0.11220131069421768, "learning_rate": 4.193493150684932e-05, "loss": 1.708, "step": 471 }, { "epoch": 0.16160924459661888, "grad_norm": 0.12284894287586212, "learning_rate": 4.1917808219178085e-05, "loss": 1.6663, "step": 472 }, { "epoch": 0.1619516370639846, "grad_norm": 0.11857478320598602, "learning_rate": 4.190068493150685e-05, "loss": 1.698, "step": 473 }, { "epoch": 0.1622940295313503, "grad_norm": 0.12169017642736435, "learning_rate": 4.188356164383562e-05, "loss": 1.6881, "step": 474 }, { "epoch": 0.16263642199871603, "grad_norm": 0.1189318373799324, "learning_rate": 4.1866438356164385e-05, "loss": 1.6196, "step": 475 }, { "epoch": 0.16297881446608176, "grad_norm": 0.12028927356004715, "learning_rate": 4.184931506849315e-05, "loss": 1.6017, "step": 476 }, { "epoch": 0.16332120693344745, "grad_norm": 0.12154337763786316, "learning_rate": 4.1832191780821924e-05, "loss": 1.6525, "step": 477 }, { "epoch": 0.16366359940081318, "grad_norm": 0.12698642909526825, "learning_rate": 4.181506849315069e-05, "loss": 1.6786, "step": 478 }, { "epoch": 0.1640059918681789, "grad_norm": 0.12965944409370422, "learning_rate": 4.179794520547946e-05, "loss": 1.7023, "step": 479 }, { "epoch": 0.1643483843355446, "grad_norm": 0.12271573394536972, "learning_rate": 4.1780821917808224e-05, "loss": 1.656, "step": 480 }, { "epoch": 0.16469077680291033, "grad_norm": 0.1356169879436493, "learning_rate": 4.176369863013699e-05, "loss": 1.6796, "step": 481 }, { "epoch": 0.16503316927027606, "grad_norm": 0.12161485105752945, "learning_rate": 4.174657534246576e-05, "loss": 1.6372, "step": 482 }, { "epoch": 0.16537556173764179, "grad_norm": 0.11599437147378922, "learning_rate": 4.1729452054794524e-05, "loss": 1.5952, "step": 483 }, { "epoch": 0.16571795420500748, "grad_norm": 0.12786725163459778, "learning_rate": 4.171232876712329e-05, "loss": 1.6796, "step": 484 }, { "epoch": 0.1660603466723732, "grad_norm": 0.12315534800291061, "learning_rate": 4.169520547945206e-05, "loss": 1.6211, "step": 485 }, { "epoch": 0.16640273913973894, "grad_norm": 0.12146754562854767, "learning_rate": 4.167808219178082e-05, "loss": 1.6835, "step": 486 }, { "epoch": 0.16674513160710464, "grad_norm": 0.12494952976703644, "learning_rate": 4.166095890410959e-05, "loss": 1.6258, "step": 487 }, { "epoch": 0.16708752407447036, "grad_norm": 0.12505009770393372, "learning_rate": 4.1643835616438356e-05, "loss": 1.6797, "step": 488 }, { "epoch": 0.1674299165418361, "grad_norm": 0.14558471739292145, "learning_rate": 4.162671232876712e-05, "loss": 1.5939, "step": 489 }, { "epoch": 0.1677723090092018, "grad_norm": 0.1313253492116928, "learning_rate": 4.160958904109589e-05, "loss": 1.7185, "step": 490 }, { "epoch": 0.1681147014765675, "grad_norm": 0.13647672533988953, "learning_rate": 4.1592465753424656e-05, "loss": 1.6914, "step": 491 }, { "epoch": 0.16845709394393324, "grad_norm": 0.11163683980703354, "learning_rate": 4.157534246575342e-05, "loss": 1.6327, "step": 492 }, { "epoch": 0.16879948641129894, "grad_norm": 0.13047580420970917, "learning_rate": 4.155821917808219e-05, "loss": 1.7416, "step": 493 }, { "epoch": 0.16914187887866466, "grad_norm": 0.13600842654705048, "learning_rate": 4.1541095890410955e-05, "loss": 1.5234, "step": 494 }, { "epoch": 0.1694842713460304, "grad_norm": 0.12226990610361099, "learning_rate": 4.152397260273973e-05, "loss": 1.5891, "step": 495 }, { "epoch": 0.16982666381339612, "grad_norm": 0.12530113756656647, "learning_rate": 4.1506849315068495e-05, "loss": 1.7131, "step": 496 }, { "epoch": 0.17016905628076182, "grad_norm": 0.1127689853310585, "learning_rate": 4.148972602739726e-05, "loss": 1.5939, "step": 497 }, { "epoch": 0.17051144874812754, "grad_norm": 0.12565338611602783, "learning_rate": 4.147260273972603e-05, "loss": 1.7274, "step": 498 }, { "epoch": 0.17085384121549327, "grad_norm": 0.12755802273750305, "learning_rate": 4.1455479452054795e-05, "loss": 1.7266, "step": 499 }, { "epoch": 0.17119623368285897, "grad_norm": 0.12197522073984146, "learning_rate": 4.143835616438356e-05, "loss": 1.676, "step": 500 }, { "epoch": 0.1715386261502247, "grad_norm": 0.12871427834033966, "learning_rate": 4.142123287671233e-05, "loss": 1.6593, "step": 501 }, { "epoch": 0.17188101861759042, "grad_norm": 0.11776921898126602, "learning_rate": 4.1404109589041094e-05, "loss": 1.6121, "step": 502 }, { "epoch": 0.17222341108495612, "grad_norm": 0.12235446274280548, "learning_rate": 4.138698630136987e-05, "loss": 1.6445, "step": 503 }, { "epoch": 0.17256580355232184, "grad_norm": 0.11934609711170197, "learning_rate": 4.1369863013698634e-05, "loss": 1.7216, "step": 504 }, { "epoch": 0.17290819601968757, "grad_norm": 0.11325541883707047, "learning_rate": 4.13527397260274e-05, "loss": 1.5689, "step": 505 }, { "epoch": 0.1732505884870533, "grad_norm": 0.1287766546010971, "learning_rate": 4.133561643835617e-05, "loss": 1.6556, "step": 506 }, { "epoch": 0.173592980954419, "grad_norm": 0.1280137002468109, "learning_rate": 4.1318493150684934e-05, "loss": 1.6554, "step": 507 }, { "epoch": 0.17393537342178472, "grad_norm": 0.12991103529930115, "learning_rate": 4.13013698630137e-05, "loss": 1.6195, "step": 508 }, { "epoch": 0.17427776588915045, "grad_norm": 0.11409937590360641, "learning_rate": 4.128424657534247e-05, "loss": 1.5787, "step": 509 }, { "epoch": 0.17462015835651615, "grad_norm": 0.15812428295612335, "learning_rate": 4.126712328767123e-05, "loss": 1.6183, "step": 510 }, { "epoch": 0.17496255082388187, "grad_norm": 0.12198574095964432, "learning_rate": 4.125e-05, "loss": 1.6416, "step": 511 }, { "epoch": 0.1753049432912476, "grad_norm": 0.11933381855487823, "learning_rate": 4.123287671232877e-05, "loss": 1.6627, "step": 512 }, { "epoch": 0.1756473357586133, "grad_norm": 0.1381857693195343, "learning_rate": 4.121575342465754e-05, "loss": 1.6012, "step": 513 }, { "epoch": 0.17598972822597903, "grad_norm": 0.12333427369594574, "learning_rate": 4.1198630136986306e-05, "loss": 1.5806, "step": 514 }, { "epoch": 0.17633212069334475, "grad_norm": 0.12533985078334808, "learning_rate": 4.118150684931507e-05, "loss": 1.6445, "step": 515 }, { "epoch": 0.17667451316071048, "grad_norm": 0.12503953278064728, "learning_rate": 4.116438356164384e-05, "loss": 1.7054, "step": 516 }, { "epoch": 0.17701690562807618, "grad_norm": 0.1376829296350479, "learning_rate": 4.1147260273972606e-05, "loss": 1.6786, "step": 517 }, { "epoch": 0.1773592980954419, "grad_norm": 0.12607446312904358, "learning_rate": 4.113013698630137e-05, "loss": 1.6247, "step": 518 }, { "epoch": 0.17770169056280763, "grad_norm": 0.15627048909664154, "learning_rate": 4.111301369863014e-05, "loss": 1.6399, "step": 519 }, { "epoch": 0.17804408303017333, "grad_norm": 0.13049696385860443, "learning_rate": 4.1095890410958905e-05, "loss": 1.6868, "step": 520 }, { "epoch": 0.17838647549753905, "grad_norm": 0.13925957679748535, "learning_rate": 4.107876712328768e-05, "loss": 1.5989, "step": 521 }, { "epoch": 0.17872886796490478, "grad_norm": 0.1206422969698906, "learning_rate": 4.1061643835616445e-05, "loss": 1.6907, "step": 522 }, { "epoch": 0.17907126043227048, "grad_norm": 0.11545757204294205, "learning_rate": 4.104452054794521e-05, "loss": 1.6121, "step": 523 }, { "epoch": 0.1794136528996362, "grad_norm": 0.1310501992702484, "learning_rate": 4.102739726027398e-05, "loss": 1.7023, "step": 524 }, { "epoch": 0.17975604536700193, "grad_norm": 0.12357061356306076, "learning_rate": 4.1010273972602745e-05, "loss": 1.6156, "step": 525 }, { "epoch": 0.18009843783436763, "grad_norm": 0.12367555499076843, "learning_rate": 4.099315068493151e-05, "loss": 1.6666, "step": 526 }, { "epoch": 0.18044083030173336, "grad_norm": 0.13149291276931763, "learning_rate": 4.097602739726028e-05, "loss": 1.7211, "step": 527 }, { "epoch": 0.18078322276909908, "grad_norm": 0.12431266158819199, "learning_rate": 4.0958904109589044e-05, "loss": 1.6616, "step": 528 }, { "epoch": 0.1811256152364648, "grad_norm": 0.13616812229156494, "learning_rate": 4.094178082191781e-05, "loss": 1.6797, "step": 529 }, { "epoch": 0.1814680077038305, "grad_norm": 0.12090042233467102, "learning_rate": 4.092465753424658e-05, "loss": 1.6092, "step": 530 }, { "epoch": 0.18181040017119623, "grad_norm": 0.1192479208111763, "learning_rate": 4.0907534246575344e-05, "loss": 1.5902, "step": 531 }, { "epoch": 0.18215279263856196, "grad_norm": 0.13946449756622314, "learning_rate": 4.089041095890411e-05, "loss": 1.518, "step": 532 }, { "epoch": 0.18249518510592766, "grad_norm": 0.14199142158031464, "learning_rate": 4.087328767123288e-05, "loss": 1.6591, "step": 533 }, { "epoch": 0.18283757757329339, "grad_norm": 0.13425928354263306, "learning_rate": 4.085616438356164e-05, "loss": 1.634, "step": 534 }, { "epoch": 0.1831799700406591, "grad_norm": 0.11949741095304489, "learning_rate": 4.083904109589041e-05, "loss": 1.5832, "step": 535 }, { "epoch": 0.1835223625080248, "grad_norm": 0.14209719002246857, "learning_rate": 4.0821917808219176e-05, "loss": 1.7324, "step": 536 }, { "epoch": 0.18386475497539054, "grad_norm": 0.13663068413734436, "learning_rate": 4.080479452054794e-05, "loss": 1.7133, "step": 537 }, { "epoch": 0.18420714744275626, "grad_norm": 0.1337069272994995, "learning_rate": 4.078767123287671e-05, "loss": 1.7665, "step": 538 }, { "epoch": 0.184549539910122, "grad_norm": 0.1307343691587448, "learning_rate": 4.077054794520548e-05, "loss": 1.6811, "step": 539 }, { "epoch": 0.1848919323774877, "grad_norm": 0.1344364434480667, "learning_rate": 4.075342465753425e-05, "loss": 1.5844, "step": 540 }, { "epoch": 0.18523432484485342, "grad_norm": 0.12437646836042404, "learning_rate": 4.0736301369863016e-05, "loss": 1.6371, "step": 541 }, { "epoch": 0.18557671731221914, "grad_norm": 0.16046512126922607, "learning_rate": 4.071917808219178e-05, "loss": 1.6998, "step": 542 }, { "epoch": 0.18591910977958484, "grad_norm": 0.1317240446805954, "learning_rate": 4.070205479452055e-05, "loss": 1.5685, "step": 543 }, { "epoch": 0.18626150224695057, "grad_norm": 0.12926693260669708, "learning_rate": 4.0684931506849315e-05, "loss": 1.6655, "step": 544 }, { "epoch": 0.1866038947143163, "grad_norm": 0.14162229001522064, "learning_rate": 4.066780821917808e-05, "loss": 1.5275, "step": 545 }, { "epoch": 0.186946287181682, "grad_norm": 0.13534238934516907, "learning_rate": 4.065068493150685e-05, "loss": 1.7133, "step": 546 }, { "epoch": 0.18728867964904772, "grad_norm": 0.14394383132457733, "learning_rate": 4.0633561643835615e-05, "loss": 1.6163, "step": 547 }, { "epoch": 0.18763107211641344, "grad_norm": 0.12943361699581146, "learning_rate": 4.061643835616439e-05, "loss": 1.6487, "step": 548 }, { "epoch": 0.18797346458377917, "grad_norm": 0.13614319264888763, "learning_rate": 4.0599315068493155e-05, "loss": 1.6525, "step": 549 }, { "epoch": 0.18831585705114487, "grad_norm": 0.15077617764472961, "learning_rate": 4.058219178082192e-05, "loss": 1.7011, "step": 550 }, { "epoch": 0.1886582495185106, "grad_norm": 0.1331607699394226, "learning_rate": 4.056506849315069e-05, "loss": 1.6094, "step": 551 }, { "epoch": 0.18900064198587632, "grad_norm": 0.1333204209804535, "learning_rate": 4.0547945205479454e-05, "loss": 1.6267, "step": 552 }, { "epoch": 0.18934303445324202, "grad_norm": 0.13235726952552795, "learning_rate": 4.053082191780822e-05, "loss": 1.6014, "step": 553 }, { "epoch": 0.18968542692060775, "grad_norm": 0.14035972952842712, "learning_rate": 4.051369863013699e-05, "loss": 1.6747, "step": 554 }, { "epoch": 0.19002781938797347, "grad_norm": 0.1259671151638031, "learning_rate": 4.0496575342465754e-05, "loss": 1.6724, "step": 555 }, { "epoch": 0.19037021185533917, "grad_norm": 0.12770868837833405, "learning_rate": 4.047945205479452e-05, "loss": 1.5782, "step": 556 }, { "epoch": 0.1907126043227049, "grad_norm": 0.14274688065052032, "learning_rate": 4.0462328767123294e-05, "loss": 1.6718, "step": 557 }, { "epoch": 0.19105499679007062, "grad_norm": 0.1180630698800087, "learning_rate": 4.044520547945206e-05, "loss": 1.6194, "step": 558 }, { "epoch": 0.19139738925743632, "grad_norm": 0.15462517738342285, "learning_rate": 4.042808219178083e-05, "loss": 1.6363, "step": 559 }, { "epoch": 0.19173978172480205, "grad_norm": 0.12258777022361755, "learning_rate": 4.041095890410959e-05, "loss": 1.585, "step": 560 }, { "epoch": 0.19208217419216778, "grad_norm": 0.1447216123342514, "learning_rate": 4.039383561643836e-05, "loss": 1.616, "step": 561 }, { "epoch": 0.1924245666595335, "grad_norm": 0.1227114275097847, "learning_rate": 4.0376712328767126e-05, "loss": 1.6191, "step": 562 }, { "epoch": 0.1927669591268992, "grad_norm": 0.13037750124931335, "learning_rate": 4.035958904109589e-05, "loss": 1.6547, "step": 563 }, { "epoch": 0.19310935159426493, "grad_norm": 0.2076314389705658, "learning_rate": 4.034246575342466e-05, "loss": 1.5815, "step": 564 }, { "epoch": 0.19345174406163065, "grad_norm": 0.1525336354970932, "learning_rate": 4.0325342465753426e-05, "loss": 1.6978, "step": 565 }, { "epoch": 0.19379413652899635, "grad_norm": 0.14082792401313782, "learning_rate": 4.03082191780822e-05, "loss": 1.7093, "step": 566 }, { "epoch": 0.19413652899636208, "grad_norm": 0.13535112142562866, "learning_rate": 4.0291095890410966e-05, "loss": 1.5891, "step": 567 }, { "epoch": 0.1944789214637278, "grad_norm": 0.1305035650730133, "learning_rate": 4.027397260273973e-05, "loss": 1.6022, "step": 568 }, { "epoch": 0.1948213139310935, "grad_norm": 0.12984801828861237, "learning_rate": 4.02568493150685e-05, "loss": 1.7125, "step": 569 }, { "epoch": 0.19516370639845923, "grad_norm": 0.1382848173379898, "learning_rate": 4.0239726027397265e-05, "loss": 1.6791, "step": 570 }, { "epoch": 0.19550609886582496, "grad_norm": 0.13096050918102264, "learning_rate": 4.022260273972603e-05, "loss": 1.6253, "step": 571 }, { "epoch": 0.19584849133319068, "grad_norm": 0.13409177958965302, "learning_rate": 4.02054794520548e-05, "loss": 1.5221, "step": 572 }, { "epoch": 0.19619088380055638, "grad_norm": 0.1415422111749649, "learning_rate": 4.0188356164383565e-05, "loss": 1.6154, "step": 573 }, { "epoch": 0.1965332762679221, "grad_norm": 0.1385459005832672, "learning_rate": 4.017123287671233e-05, "loss": 1.7158, "step": 574 }, { "epoch": 0.19687566873528783, "grad_norm": 0.12079416215419769, "learning_rate": 4.01541095890411e-05, "loss": 1.5792, "step": 575 }, { "epoch": 0.19721806120265353, "grad_norm": 0.12709353864192963, "learning_rate": 4.0136986301369864e-05, "loss": 1.6622, "step": 576 }, { "epoch": 0.19756045367001926, "grad_norm": 0.1321648508310318, "learning_rate": 4.011986301369863e-05, "loss": 1.6408, "step": 577 }, { "epoch": 0.19790284613738499, "grad_norm": 0.13979966938495636, "learning_rate": 4.01027397260274e-05, "loss": 1.6165, "step": 578 }, { "epoch": 0.19824523860475068, "grad_norm": 0.1429782509803772, "learning_rate": 4.0085616438356164e-05, "loss": 1.5437, "step": 579 }, { "epoch": 0.1985876310721164, "grad_norm": 0.13867220282554626, "learning_rate": 4.006849315068493e-05, "loss": 1.5603, "step": 580 }, { "epoch": 0.19893002353948214, "grad_norm": 0.14048145711421967, "learning_rate": 4.00513698630137e-05, "loss": 1.6228, "step": 581 }, { "epoch": 0.19927241600684784, "grad_norm": 0.1252722144126892, "learning_rate": 4.0034246575342463e-05, "loss": 1.6142, "step": 582 }, { "epoch": 0.19961480847421356, "grad_norm": 0.1296096295118332, "learning_rate": 4.001712328767123e-05, "loss": 1.5412, "step": 583 }, { "epoch": 0.1999572009415793, "grad_norm": 0.1186581701040268, "learning_rate": 4e-05, "loss": 1.6168, "step": 584 }, { "epoch": 0.20029959340894501, "grad_norm": 0.1619274765253067, "learning_rate": 3.998287671232877e-05, "loss": 1.5272, "step": 585 }, { "epoch": 0.2006419858763107, "grad_norm": 0.14473992586135864, "learning_rate": 3.9965753424657536e-05, "loss": 1.7162, "step": 586 }, { "epoch": 0.20098437834367644, "grad_norm": 0.18789300322532654, "learning_rate": 3.99486301369863e-05, "loss": 1.498, "step": 587 }, { "epoch": 0.20132677081104217, "grad_norm": 0.13118314743041992, "learning_rate": 3.993150684931507e-05, "loss": 1.6164, "step": 588 }, { "epoch": 0.20166916327840786, "grad_norm": 0.12488346546888351, "learning_rate": 3.9914383561643836e-05, "loss": 1.6124, "step": 589 }, { "epoch": 0.2020115557457736, "grad_norm": 0.13723698258399963, "learning_rate": 3.98972602739726e-05, "loss": 1.7289, "step": 590 }, { "epoch": 0.20235394821313932, "grad_norm": 0.13015615940093994, "learning_rate": 3.988013698630137e-05, "loss": 1.6502, "step": 591 }, { "epoch": 0.20269634068050502, "grad_norm": 0.1375039517879486, "learning_rate": 3.9863013698630135e-05, "loss": 1.6967, "step": 592 }, { "epoch": 0.20303873314787074, "grad_norm": 0.13820691406726837, "learning_rate": 3.984589041095891e-05, "loss": 1.6261, "step": 593 }, { "epoch": 0.20338112561523647, "grad_norm": 0.13150554895401, "learning_rate": 3.9828767123287675e-05, "loss": 1.5769, "step": 594 }, { "epoch": 0.2037235180826022, "grad_norm": 0.13776059448719025, "learning_rate": 3.981164383561644e-05, "loss": 1.5709, "step": 595 }, { "epoch": 0.2040659105499679, "grad_norm": 0.13858939707279205, "learning_rate": 3.979452054794521e-05, "loss": 1.6376, "step": 596 }, { "epoch": 0.20440830301733362, "grad_norm": 0.12972529232501984, "learning_rate": 3.9777397260273975e-05, "loss": 1.6563, "step": 597 }, { "epoch": 0.20475069548469935, "grad_norm": 0.14410927891731262, "learning_rate": 3.976027397260274e-05, "loss": 1.648, "step": 598 }, { "epoch": 0.20509308795206505, "grad_norm": 0.12447214871644974, "learning_rate": 3.974315068493151e-05, "loss": 1.5406, "step": 599 }, { "epoch": 0.20543548041943077, "grad_norm": 0.15367960929870605, "learning_rate": 3.9726027397260274e-05, "loss": 1.7379, "step": 600 }, { "epoch": 0.2057778728867965, "grad_norm": 0.22347375750541687, "learning_rate": 3.970890410958905e-05, "loss": 1.5542, "step": 601 }, { "epoch": 0.2061202653541622, "grad_norm": 0.13131766021251678, "learning_rate": 3.9691780821917814e-05, "loss": 1.6012, "step": 602 }, { "epoch": 0.20646265782152792, "grad_norm": 0.1466413140296936, "learning_rate": 3.967465753424658e-05, "loss": 1.6099, "step": 603 }, { "epoch": 0.20680505028889365, "grad_norm": 0.13556718826293945, "learning_rate": 3.965753424657535e-05, "loss": 1.543, "step": 604 }, { "epoch": 0.20714744275625938, "grad_norm": 0.1420261263847351, "learning_rate": 3.9640410958904114e-05, "loss": 1.6926, "step": 605 }, { "epoch": 0.20748983522362507, "grad_norm": 0.1490192860364914, "learning_rate": 3.962328767123288e-05, "loss": 1.7365, "step": 606 }, { "epoch": 0.2078322276909908, "grad_norm": 0.13910427689552307, "learning_rate": 3.960616438356165e-05, "loss": 1.5896, "step": 607 }, { "epoch": 0.20817462015835653, "grad_norm": 0.16693435609340668, "learning_rate": 3.958904109589041e-05, "loss": 1.6975, "step": 608 }, { "epoch": 0.20851701262572223, "grad_norm": 0.1282617300748825, "learning_rate": 3.957191780821918e-05, "loss": 1.6155, "step": 609 }, { "epoch": 0.20885940509308795, "grad_norm": 0.155488058924675, "learning_rate": 3.9554794520547946e-05, "loss": 1.7388, "step": 610 }, { "epoch": 0.20920179756045368, "grad_norm": 0.1439526379108429, "learning_rate": 3.953767123287671e-05, "loss": 1.5999, "step": 611 }, { "epoch": 0.20954419002781938, "grad_norm": 0.1459081768989563, "learning_rate": 3.952054794520548e-05, "loss": 1.6998, "step": 612 }, { "epoch": 0.2098865824951851, "grad_norm": 0.13301920890808105, "learning_rate": 3.9503424657534246e-05, "loss": 1.6468, "step": 613 }, { "epoch": 0.21022897496255083, "grad_norm": 0.14514105021953583, "learning_rate": 3.948630136986301e-05, "loss": 1.7329, "step": 614 }, { "epoch": 0.21057136742991653, "grad_norm": 0.1392965018749237, "learning_rate": 3.946917808219178e-05, "loss": 1.6817, "step": 615 }, { "epoch": 0.21091375989728225, "grad_norm": 0.14206695556640625, "learning_rate": 3.9452054794520546e-05, "loss": 1.6095, "step": 616 }, { "epoch": 0.21125615236464798, "grad_norm": 0.13799318671226501, "learning_rate": 3.943493150684931e-05, "loss": 1.6309, "step": 617 }, { "epoch": 0.2115985448320137, "grad_norm": 0.14629817008972168, "learning_rate": 3.941780821917808e-05, "loss": 1.72, "step": 618 }, { "epoch": 0.2119409372993794, "grad_norm": 0.1475141942501068, "learning_rate": 3.940068493150685e-05, "loss": 1.6685, "step": 619 }, { "epoch": 0.21228332976674513, "grad_norm": 0.14562582969665527, "learning_rate": 3.938356164383562e-05, "loss": 1.6719, "step": 620 }, { "epoch": 0.21262572223411086, "grad_norm": 0.13261473178863525, "learning_rate": 3.9366438356164385e-05, "loss": 1.6561, "step": 621 }, { "epoch": 0.21296811470147656, "grad_norm": 0.13725650310516357, "learning_rate": 3.934931506849315e-05, "loss": 1.6783, "step": 622 }, { "epoch": 0.21331050716884228, "grad_norm": 0.14170540869235992, "learning_rate": 3.933219178082192e-05, "loss": 1.6898, "step": 623 }, { "epoch": 0.213652899636208, "grad_norm": 0.14215661585330963, "learning_rate": 3.9315068493150684e-05, "loss": 1.6479, "step": 624 }, { "epoch": 0.2139952921035737, "grad_norm": 0.1352236419916153, "learning_rate": 3.929794520547945e-05, "loss": 1.6335, "step": 625 }, { "epoch": 0.21433768457093944, "grad_norm": 0.14223045110702515, "learning_rate": 3.928082191780822e-05, "loss": 1.6561, "step": 626 }, { "epoch": 0.21468007703830516, "grad_norm": 0.2001626342535019, "learning_rate": 3.9263698630136984e-05, "loss": 1.6065, "step": 627 }, { "epoch": 0.2150224695056709, "grad_norm": 0.14493198692798615, "learning_rate": 3.924657534246576e-05, "loss": 1.7505, "step": 628 }, { "epoch": 0.2153648619730366, "grad_norm": 0.1474265605211258, "learning_rate": 3.9229452054794524e-05, "loss": 1.6188, "step": 629 }, { "epoch": 0.2157072544404023, "grad_norm": 0.15716838836669922, "learning_rate": 3.921232876712329e-05, "loss": 1.6628, "step": 630 }, { "epoch": 0.21604964690776804, "grad_norm": 0.13108830153942108, "learning_rate": 3.919520547945206e-05, "loss": 1.6461, "step": 631 }, { "epoch": 0.21639203937513374, "grad_norm": 0.13984955847263336, "learning_rate": 3.9178082191780823e-05, "loss": 1.5721, "step": 632 }, { "epoch": 0.21673443184249946, "grad_norm": 0.134047269821167, "learning_rate": 3.916095890410959e-05, "loss": 1.6454, "step": 633 }, { "epoch": 0.2170768243098652, "grad_norm": 0.14849083125591278, "learning_rate": 3.9143835616438356e-05, "loss": 1.6118, "step": 634 }, { "epoch": 0.2174192167772309, "grad_norm": 0.15157556533813477, "learning_rate": 3.912671232876712e-05, "loss": 1.6041, "step": 635 }, { "epoch": 0.21776160924459662, "grad_norm": 0.13329258561134338, "learning_rate": 3.910958904109589e-05, "loss": 1.6316, "step": 636 }, { "epoch": 0.21810400171196234, "grad_norm": 0.13909433782100677, "learning_rate": 3.909246575342466e-05, "loss": 1.5768, "step": 637 }, { "epoch": 0.21844639417932807, "grad_norm": 0.1404629796743393, "learning_rate": 3.907534246575343e-05, "loss": 1.6416, "step": 638 }, { "epoch": 0.21878878664669377, "grad_norm": 0.1444345861673355, "learning_rate": 3.9058219178082196e-05, "loss": 1.7654, "step": 639 }, { "epoch": 0.2191311791140595, "grad_norm": 0.13926610350608826, "learning_rate": 3.904109589041096e-05, "loss": 1.6494, "step": 640 }, { "epoch": 0.21947357158142522, "grad_norm": 0.14382143318653107, "learning_rate": 3.902397260273973e-05, "loss": 1.6959, "step": 641 }, { "epoch": 0.21981596404879092, "grad_norm": 0.13799475133419037, "learning_rate": 3.9006849315068495e-05, "loss": 1.5769, "step": 642 }, { "epoch": 0.22015835651615664, "grad_norm": 0.12995094060897827, "learning_rate": 3.898972602739726e-05, "loss": 1.7172, "step": 643 }, { "epoch": 0.22050074898352237, "grad_norm": 0.13802285492420197, "learning_rate": 3.897260273972603e-05, "loss": 1.6938, "step": 644 }, { "epoch": 0.22084314145088807, "grad_norm": 0.14463645219802856, "learning_rate": 3.8955479452054795e-05, "loss": 1.7506, "step": 645 }, { "epoch": 0.2211855339182538, "grad_norm": 0.13552460074424744, "learning_rate": 3.893835616438357e-05, "loss": 1.6821, "step": 646 }, { "epoch": 0.22152792638561952, "grad_norm": 0.14981040358543396, "learning_rate": 3.8921232876712335e-05, "loss": 1.5814, "step": 647 }, { "epoch": 0.22187031885298522, "grad_norm": 0.15010082721710205, "learning_rate": 3.89041095890411e-05, "loss": 1.6438, "step": 648 }, { "epoch": 0.22221271132035095, "grad_norm": 0.14407560229301453, "learning_rate": 3.888698630136987e-05, "loss": 1.5631, "step": 649 }, { "epoch": 0.22255510378771667, "grad_norm": 0.13496525585651398, "learning_rate": 3.8869863013698634e-05, "loss": 1.7092, "step": 650 }, { "epoch": 0.2228974962550824, "grad_norm": 0.16363634169101715, "learning_rate": 3.88527397260274e-05, "loss": 1.682, "step": 651 }, { "epoch": 0.2232398887224481, "grad_norm": 0.1451285183429718, "learning_rate": 3.883561643835617e-05, "loss": 1.5996, "step": 652 }, { "epoch": 0.22358228118981383, "grad_norm": 0.13554857671260834, "learning_rate": 3.8818493150684934e-05, "loss": 1.6393, "step": 653 }, { "epoch": 0.22392467365717955, "grad_norm": 0.13573099672794342, "learning_rate": 3.88013698630137e-05, "loss": 1.7126, "step": 654 }, { "epoch": 0.22426706612454525, "grad_norm": 0.1543813943862915, "learning_rate": 3.878424657534247e-05, "loss": 1.6471, "step": 655 }, { "epoch": 0.22460945859191098, "grad_norm": 0.14468485116958618, "learning_rate": 3.8767123287671233e-05, "loss": 1.6285, "step": 656 }, { "epoch": 0.2249518510592767, "grad_norm": 0.14663173258304596, "learning_rate": 3.875e-05, "loss": 1.6215, "step": 657 }, { "epoch": 0.2252942435266424, "grad_norm": 0.1912306249141693, "learning_rate": 3.8732876712328767e-05, "loss": 1.7885, "step": 658 }, { "epoch": 0.22563663599400813, "grad_norm": 0.1563362181186676, "learning_rate": 3.871575342465753e-05, "loss": 1.7372, "step": 659 }, { "epoch": 0.22597902846137385, "grad_norm": 0.1565442532300949, "learning_rate": 3.86986301369863e-05, "loss": 1.5508, "step": 660 }, { "epoch": 0.22632142092873958, "grad_norm": 0.13743026554584503, "learning_rate": 3.8681506849315066e-05, "loss": 1.5849, "step": 661 }, { "epoch": 0.22666381339610528, "grad_norm": 0.13612733781337738, "learning_rate": 3.866438356164383e-05, "loss": 1.5983, "step": 662 }, { "epoch": 0.227006205863471, "grad_norm": 0.14606739580631256, "learning_rate": 3.86472602739726e-05, "loss": 1.6067, "step": 663 }, { "epoch": 0.22734859833083673, "grad_norm": 0.14170069992542267, "learning_rate": 3.863013698630137e-05, "loss": 1.692, "step": 664 }, { "epoch": 0.22769099079820243, "grad_norm": 0.13603298366069794, "learning_rate": 3.861301369863014e-05, "loss": 1.6799, "step": 665 }, { "epoch": 0.22803338326556816, "grad_norm": 0.15212753415107727, "learning_rate": 3.8595890410958905e-05, "loss": 1.6594, "step": 666 }, { "epoch": 0.22837577573293388, "grad_norm": 0.15311108529567719, "learning_rate": 3.857876712328767e-05, "loss": 1.5701, "step": 667 }, { "epoch": 0.22871816820029958, "grad_norm": 0.14659561216831207, "learning_rate": 3.856164383561644e-05, "loss": 1.4964, "step": 668 }, { "epoch": 0.2290605606676653, "grad_norm": 0.14378193020820618, "learning_rate": 3.8544520547945205e-05, "loss": 1.593, "step": 669 }, { "epoch": 0.22940295313503103, "grad_norm": 0.16992813348770142, "learning_rate": 3.852739726027397e-05, "loss": 1.6686, "step": 670 }, { "epoch": 0.22974534560239676, "grad_norm": 0.14795708656311035, "learning_rate": 3.851027397260274e-05, "loss": 1.6124, "step": 671 }, { "epoch": 0.23008773806976246, "grad_norm": 0.14915013313293457, "learning_rate": 3.8493150684931505e-05, "loss": 1.6028, "step": 672 }, { "epoch": 0.23043013053712819, "grad_norm": 0.14211779832839966, "learning_rate": 3.847602739726028e-05, "loss": 1.6201, "step": 673 }, { "epoch": 0.2307725230044939, "grad_norm": 0.14301882684230804, "learning_rate": 3.8458904109589044e-05, "loss": 1.6391, "step": 674 }, { "epoch": 0.2311149154718596, "grad_norm": 0.14953966438770294, "learning_rate": 3.844178082191781e-05, "loss": 1.7409, "step": 675 }, { "epoch": 0.23145730793922534, "grad_norm": 0.15473328530788422, "learning_rate": 3.842465753424658e-05, "loss": 1.6271, "step": 676 }, { "epoch": 0.23179970040659106, "grad_norm": 0.15373215079307556, "learning_rate": 3.8407534246575344e-05, "loss": 1.5807, "step": 677 }, { "epoch": 0.23214209287395676, "grad_norm": 0.15602236986160278, "learning_rate": 3.839041095890411e-05, "loss": 1.699, "step": 678 }, { "epoch": 0.2324844853413225, "grad_norm": 0.14412091672420502, "learning_rate": 3.837328767123288e-05, "loss": 1.6315, "step": 679 }, { "epoch": 0.23282687780868822, "grad_norm": 0.1419534683227539, "learning_rate": 3.8356164383561644e-05, "loss": 1.6918, "step": 680 }, { "epoch": 0.2331692702760539, "grad_norm": 0.14349767565727234, "learning_rate": 3.833904109589041e-05, "loss": 1.5371, "step": 681 }, { "epoch": 0.23351166274341964, "grad_norm": 0.1385231465101242, "learning_rate": 3.832191780821918e-05, "loss": 1.5593, "step": 682 }, { "epoch": 0.23385405521078537, "grad_norm": 0.16282771527767181, "learning_rate": 3.830479452054795e-05, "loss": 1.6065, "step": 683 }, { "epoch": 0.2341964476781511, "grad_norm": 0.1438712477684021, "learning_rate": 3.8287671232876716e-05, "loss": 1.509, "step": 684 }, { "epoch": 0.2345388401455168, "grad_norm": 0.14997415244579315, "learning_rate": 3.827054794520548e-05, "loss": 1.6455, "step": 685 }, { "epoch": 0.23488123261288252, "grad_norm": 0.1628510057926178, "learning_rate": 3.825342465753425e-05, "loss": 1.7361, "step": 686 }, { "epoch": 0.23522362508024824, "grad_norm": 0.1519729644060135, "learning_rate": 3.8236301369863016e-05, "loss": 1.5683, "step": 687 }, { "epoch": 0.23556601754761394, "grad_norm": 0.14045792818069458, "learning_rate": 3.821917808219178e-05, "loss": 1.6577, "step": 688 }, { "epoch": 0.23590841001497967, "grad_norm": 0.17129874229431152, "learning_rate": 3.820205479452055e-05, "loss": 1.5963, "step": 689 }, { "epoch": 0.2362508024823454, "grad_norm": 0.14787055552005768, "learning_rate": 3.818493150684932e-05, "loss": 1.7469, "step": 690 }, { "epoch": 0.2365931949497111, "grad_norm": 0.15533322095870972, "learning_rate": 3.816780821917809e-05, "loss": 1.6276, "step": 691 }, { "epoch": 0.23693558741707682, "grad_norm": 0.13992665708065033, "learning_rate": 3.8150684931506855e-05, "loss": 1.5987, "step": 692 }, { "epoch": 0.23727797988444255, "grad_norm": 0.1597602814435959, "learning_rate": 3.813356164383562e-05, "loss": 1.6602, "step": 693 }, { "epoch": 0.23762037235180827, "grad_norm": 0.16671034693717957, "learning_rate": 3.811643835616439e-05, "loss": 1.6219, "step": 694 }, { "epoch": 0.23796276481917397, "grad_norm": 0.18046355247497559, "learning_rate": 3.8099315068493155e-05, "loss": 1.6808, "step": 695 }, { "epoch": 0.2383051572865397, "grad_norm": 0.14173193275928497, "learning_rate": 3.808219178082192e-05, "loss": 1.7172, "step": 696 }, { "epoch": 0.23864754975390542, "grad_norm": 0.14578492939472198, "learning_rate": 3.806506849315069e-05, "loss": 1.6258, "step": 697 }, { "epoch": 0.23898994222127112, "grad_norm": 0.1423153579235077, "learning_rate": 3.8047945205479454e-05, "loss": 1.6232, "step": 698 }, { "epoch": 0.23933233468863685, "grad_norm": 0.1410105973482132, "learning_rate": 3.803082191780822e-05, "loss": 1.5587, "step": 699 }, { "epoch": 0.23967472715600258, "grad_norm": 0.146913081407547, "learning_rate": 3.801369863013699e-05, "loss": 1.7267, "step": 700 }, { "epoch": 0.24001711962336827, "grad_norm": 0.16355788707733154, "learning_rate": 3.7996575342465754e-05, "loss": 1.6365, "step": 701 }, { "epoch": 0.240359512090734, "grad_norm": 0.15799741446971893, "learning_rate": 3.797945205479452e-05, "loss": 1.6562, "step": 702 }, { "epoch": 0.24070190455809973, "grad_norm": 0.14790627360343933, "learning_rate": 3.796232876712329e-05, "loss": 1.5573, "step": 703 }, { "epoch": 0.24104429702546543, "grad_norm": 0.145141139626503, "learning_rate": 3.7945205479452054e-05, "loss": 1.533, "step": 704 }, { "epoch": 0.24138668949283115, "grad_norm": 0.16019876301288605, "learning_rate": 3.792808219178082e-05, "loss": 1.645, "step": 705 }, { "epoch": 0.24172908196019688, "grad_norm": 0.15267132222652435, "learning_rate": 3.791095890410959e-05, "loss": 1.5862, "step": 706 }, { "epoch": 0.2420714744275626, "grad_norm": 0.14816991984844208, "learning_rate": 3.789383561643835e-05, "loss": 1.6415, "step": 707 }, { "epoch": 0.2424138668949283, "grad_norm": 0.1378215253353119, "learning_rate": 3.7876712328767126e-05, "loss": 1.5368, "step": 708 }, { "epoch": 0.24275625936229403, "grad_norm": 0.1557566523551941, "learning_rate": 3.785958904109589e-05, "loss": 1.6652, "step": 709 }, { "epoch": 0.24309865182965976, "grad_norm": 0.16050609946250916, "learning_rate": 3.784246575342466e-05, "loss": 1.6451, "step": 710 }, { "epoch": 0.24344104429702546, "grad_norm": 0.15321049094200134, "learning_rate": 3.7825342465753426e-05, "loss": 1.466, "step": 711 }, { "epoch": 0.24378343676439118, "grad_norm": 0.14668038487434387, "learning_rate": 3.780821917808219e-05, "loss": 1.6474, "step": 712 }, { "epoch": 0.2441258292317569, "grad_norm": 0.1431702971458435, "learning_rate": 3.779109589041096e-05, "loss": 1.6004, "step": 713 }, { "epoch": 0.2444682216991226, "grad_norm": 0.17158308625221252, "learning_rate": 3.7773972602739726e-05, "loss": 1.6922, "step": 714 }, { "epoch": 0.24481061416648833, "grad_norm": 0.16257373988628387, "learning_rate": 3.775684931506849e-05, "loss": 1.731, "step": 715 }, { "epoch": 0.24515300663385406, "grad_norm": 0.16495251655578613, "learning_rate": 3.773972602739726e-05, "loss": 1.7119, "step": 716 }, { "epoch": 0.24549539910121979, "grad_norm": 0.14999692142009735, "learning_rate": 3.772260273972603e-05, "loss": 1.5363, "step": 717 }, { "epoch": 0.24583779156858548, "grad_norm": 0.1477193683385849, "learning_rate": 3.77054794520548e-05, "loss": 1.6404, "step": 718 }, { "epoch": 0.2461801840359512, "grad_norm": 0.16510744392871857, "learning_rate": 3.7688356164383565e-05, "loss": 1.6169, "step": 719 }, { "epoch": 0.24652257650331694, "grad_norm": 0.1509714126586914, "learning_rate": 3.767123287671233e-05, "loss": 1.6505, "step": 720 }, { "epoch": 0.24686496897068264, "grad_norm": 0.16706955432891846, "learning_rate": 3.76541095890411e-05, "loss": 1.6201, "step": 721 }, { "epoch": 0.24720736143804836, "grad_norm": 0.1485087126493454, "learning_rate": 3.7636986301369865e-05, "loss": 1.5435, "step": 722 }, { "epoch": 0.2475497539054141, "grad_norm": 0.1420665830373764, "learning_rate": 3.761986301369863e-05, "loss": 1.6949, "step": 723 }, { "epoch": 0.2478921463727798, "grad_norm": 0.15542322397232056, "learning_rate": 3.76027397260274e-05, "loss": 1.6369, "step": 724 }, { "epoch": 0.2482345388401455, "grad_norm": 0.16503387689590454, "learning_rate": 3.7585616438356164e-05, "loss": 1.6571, "step": 725 }, { "epoch": 0.24857693130751124, "grad_norm": 0.1691586971282959, "learning_rate": 3.756849315068494e-05, "loss": 1.7335, "step": 726 }, { "epoch": 0.24891932377487697, "grad_norm": 0.16520081460475922, "learning_rate": 3.7551369863013704e-05, "loss": 1.5644, "step": 727 }, { "epoch": 0.24926171624224266, "grad_norm": 0.13477402925491333, "learning_rate": 3.753424657534247e-05, "loss": 1.5839, "step": 728 }, { "epoch": 0.2496041087096084, "grad_norm": 0.16453130543231964, "learning_rate": 3.751712328767124e-05, "loss": 1.7474, "step": 729 }, { "epoch": 0.24994650117697412, "grad_norm": 0.21137817203998566, "learning_rate": 3.7500000000000003e-05, "loss": 1.5475, "step": 730 }, { "epoch": 0.2502888936443398, "grad_norm": 0.14837858080863953, "learning_rate": 3.748287671232877e-05, "loss": 1.6171, "step": 731 }, { "epoch": 0.25063128611170554, "grad_norm": 0.16185030341148376, "learning_rate": 3.7465753424657537e-05, "loss": 1.5949, "step": 732 }, { "epoch": 0.25097367857907127, "grad_norm": 0.1483171433210373, "learning_rate": 3.74486301369863e-05, "loss": 1.6638, "step": 733 }, { "epoch": 0.251316071046437, "grad_norm": 0.1455860435962677, "learning_rate": 3.743150684931507e-05, "loss": 1.6051, "step": 734 }, { "epoch": 0.2516584635138027, "grad_norm": 0.15174081921577454, "learning_rate": 3.741438356164384e-05, "loss": 1.6667, "step": 735 }, { "epoch": 0.2520008559811684, "grad_norm": 0.1445140242576599, "learning_rate": 3.739726027397261e-05, "loss": 1.555, "step": 736 }, { "epoch": 0.2523432484485341, "grad_norm": 0.17341762781143188, "learning_rate": 3.7380136986301376e-05, "loss": 1.6623, "step": 737 }, { "epoch": 0.25268564091589985, "grad_norm": 0.145102396607399, "learning_rate": 3.736301369863014e-05, "loss": 1.6106, "step": 738 }, { "epoch": 0.25302803338326557, "grad_norm": 0.15110574662685394, "learning_rate": 3.734589041095891e-05, "loss": 1.7235, "step": 739 }, { "epoch": 0.2533704258506313, "grad_norm": 0.15728889405727386, "learning_rate": 3.7328767123287675e-05, "loss": 1.6595, "step": 740 }, { "epoch": 0.253712818317997, "grad_norm": 0.17314192652702332, "learning_rate": 3.731164383561644e-05, "loss": 1.6003, "step": 741 }, { "epoch": 0.2540552107853627, "grad_norm": 0.1419299840927124, "learning_rate": 3.729452054794521e-05, "loss": 1.5822, "step": 742 }, { "epoch": 0.2543976032527284, "grad_norm": 0.1432449072599411, "learning_rate": 3.7277397260273975e-05, "loss": 1.5905, "step": 743 }, { "epoch": 0.25473999572009415, "grad_norm": 0.15796010196208954, "learning_rate": 3.726027397260274e-05, "loss": 1.6432, "step": 744 }, { "epoch": 0.2550823881874599, "grad_norm": 0.17300905287265778, "learning_rate": 3.724315068493151e-05, "loss": 1.7253, "step": 745 }, { "epoch": 0.2554247806548256, "grad_norm": 0.16232259571552277, "learning_rate": 3.7226027397260275e-05, "loss": 1.5824, "step": 746 }, { "epoch": 0.2557671731221913, "grad_norm": 0.1420726329088211, "learning_rate": 3.720890410958904e-05, "loss": 1.568, "step": 747 }, { "epoch": 0.25610956558955705, "grad_norm": 0.14220117032527924, "learning_rate": 3.719178082191781e-05, "loss": 1.6031, "step": 748 }, { "epoch": 0.2564519580569227, "grad_norm": 0.16422894597053528, "learning_rate": 3.7174657534246574e-05, "loss": 1.6206, "step": 749 }, { "epoch": 0.25679435052428845, "grad_norm": 0.15555799007415771, "learning_rate": 3.715753424657534e-05, "loss": 1.6378, "step": 750 }, { "epoch": 0.2571367429916542, "grad_norm": 0.14382526278495789, "learning_rate": 3.714041095890411e-05, "loss": 1.557, "step": 751 }, { "epoch": 0.2574791354590199, "grad_norm": 0.15674228966236115, "learning_rate": 3.7123287671232874e-05, "loss": 1.6877, "step": 752 }, { "epoch": 0.25782152792638563, "grad_norm": 0.1501520425081253, "learning_rate": 3.710616438356165e-05, "loss": 1.6146, "step": 753 }, { "epoch": 0.25816392039375136, "grad_norm": 0.17455165088176727, "learning_rate": 3.7089041095890414e-05, "loss": 1.6339, "step": 754 }, { "epoch": 0.2585063128611171, "grad_norm": 0.15864112973213196, "learning_rate": 3.707191780821918e-05, "loss": 1.5009, "step": 755 }, { "epoch": 0.25884870532848275, "grad_norm": 0.15661512315273285, "learning_rate": 3.7054794520547947e-05, "loss": 1.66, "step": 756 }, { "epoch": 0.2591910977958485, "grad_norm": 0.1677456498146057, "learning_rate": 3.703767123287671e-05, "loss": 1.6812, "step": 757 }, { "epoch": 0.2595334902632142, "grad_norm": 0.15673485398292542, "learning_rate": 3.702054794520548e-05, "loss": 1.5004, "step": 758 }, { "epoch": 0.25987588273057993, "grad_norm": 0.1495063155889511, "learning_rate": 3.7003424657534246e-05, "loss": 1.6824, "step": 759 }, { "epoch": 0.26021827519794566, "grad_norm": 0.17794835567474365, "learning_rate": 3.698630136986301e-05, "loss": 1.6953, "step": 760 }, { "epoch": 0.2605606676653114, "grad_norm": 0.16137944161891937, "learning_rate": 3.696917808219178e-05, "loss": 1.7007, "step": 761 }, { "epoch": 0.26090306013267706, "grad_norm": 0.15399391949176788, "learning_rate": 3.695205479452055e-05, "loss": 1.6491, "step": 762 }, { "epoch": 0.2612454526000428, "grad_norm": 0.18218344449996948, "learning_rate": 3.693493150684932e-05, "loss": 1.5993, "step": 763 }, { "epoch": 0.2615878450674085, "grad_norm": 0.15110883116722107, "learning_rate": 3.6917808219178086e-05, "loss": 1.658, "step": 764 }, { "epoch": 0.26193023753477424, "grad_norm": 0.16530990600585938, "learning_rate": 3.690068493150685e-05, "loss": 1.5591, "step": 765 }, { "epoch": 0.26227263000213996, "grad_norm": 0.16281379759311676, "learning_rate": 3.688356164383562e-05, "loss": 1.617, "step": 766 }, { "epoch": 0.2626150224695057, "grad_norm": 0.163553848862648, "learning_rate": 3.6866438356164385e-05, "loss": 1.6893, "step": 767 }, { "epoch": 0.2629574149368714, "grad_norm": 0.16212999820709229, "learning_rate": 3.684931506849315e-05, "loss": 1.5985, "step": 768 }, { "epoch": 0.2632998074042371, "grad_norm": 0.14513254165649414, "learning_rate": 3.683219178082192e-05, "loss": 1.6678, "step": 769 }, { "epoch": 0.2636421998716028, "grad_norm": 0.15693022310733795, "learning_rate": 3.6815068493150685e-05, "loss": 1.6254, "step": 770 }, { "epoch": 0.26398459233896854, "grad_norm": 0.15066470205783844, "learning_rate": 3.679794520547946e-05, "loss": 1.4757, "step": 771 }, { "epoch": 0.26432698480633426, "grad_norm": 0.1564892679452896, "learning_rate": 3.6780821917808224e-05, "loss": 1.6755, "step": 772 }, { "epoch": 0.2646693772737, "grad_norm": 0.21559260785579681, "learning_rate": 3.676369863013699e-05, "loss": 1.6298, "step": 773 }, { "epoch": 0.2650117697410657, "grad_norm": 0.1554337441921234, "learning_rate": 3.674657534246576e-05, "loss": 1.6346, "step": 774 }, { "epoch": 0.2653541622084314, "grad_norm": 0.15218661725521088, "learning_rate": 3.6729452054794524e-05, "loss": 1.7261, "step": 775 }, { "epoch": 0.2656965546757971, "grad_norm": 0.1601582020521164, "learning_rate": 3.671232876712329e-05, "loss": 1.6592, "step": 776 }, { "epoch": 0.26603894714316284, "grad_norm": 0.1528974026441574, "learning_rate": 3.669520547945206e-05, "loss": 1.6033, "step": 777 }, { "epoch": 0.26638133961052857, "grad_norm": 0.14803509414196014, "learning_rate": 3.6678082191780824e-05, "loss": 1.7176, "step": 778 }, { "epoch": 0.2667237320778943, "grad_norm": 0.16880981624126434, "learning_rate": 3.66609589041096e-05, "loss": 1.5434, "step": 779 }, { "epoch": 0.26706612454526, "grad_norm": 0.15242914855480194, "learning_rate": 3.664383561643836e-05, "loss": 1.699, "step": 780 }, { "epoch": 0.26740851701262575, "grad_norm": 0.16456468403339386, "learning_rate": 3.662671232876712e-05, "loss": 1.6805, "step": 781 }, { "epoch": 0.2677509094799914, "grad_norm": 0.19142799079418182, "learning_rate": 3.660958904109589e-05, "loss": 1.6435, "step": 782 }, { "epoch": 0.26809330194735714, "grad_norm": 0.16038298606872559, "learning_rate": 3.6592465753424656e-05, "loss": 1.6827, "step": 783 }, { "epoch": 0.26843569441472287, "grad_norm": 0.1598176658153534, "learning_rate": 3.657534246575342e-05, "loss": 1.7358, "step": 784 }, { "epoch": 0.2687780868820886, "grad_norm": 0.16843055188655853, "learning_rate": 3.655821917808219e-05, "loss": 1.6581, "step": 785 }, { "epoch": 0.2691204793494543, "grad_norm": 0.17677222192287445, "learning_rate": 3.6541095890410956e-05, "loss": 1.6672, "step": 786 }, { "epoch": 0.26946287181682005, "grad_norm": 0.16428817808628082, "learning_rate": 3.652397260273972e-05, "loss": 1.6041, "step": 787 }, { "epoch": 0.2698052642841858, "grad_norm": 0.16790367662906647, "learning_rate": 3.6506849315068496e-05, "loss": 1.6609, "step": 788 }, { "epoch": 0.27014765675155145, "grad_norm": 0.15226012468338013, "learning_rate": 3.648972602739726e-05, "loss": 1.6705, "step": 789 }, { "epoch": 0.2704900492189172, "grad_norm": 0.1617858111858368, "learning_rate": 3.647260273972603e-05, "loss": 1.6802, "step": 790 }, { "epoch": 0.2708324416862829, "grad_norm": 0.17529425024986267, "learning_rate": 3.6455479452054795e-05, "loss": 1.5855, "step": 791 }, { "epoch": 0.2711748341536486, "grad_norm": 0.14928236603736877, "learning_rate": 3.643835616438356e-05, "loss": 1.6189, "step": 792 }, { "epoch": 0.27151722662101435, "grad_norm": 0.15558721125125885, "learning_rate": 3.642123287671233e-05, "loss": 1.6418, "step": 793 }, { "epoch": 0.2718596190883801, "grad_norm": 0.16190998256206512, "learning_rate": 3.6404109589041095e-05, "loss": 1.5969, "step": 794 }, { "epoch": 0.27220201155574575, "grad_norm": 0.16861742734909058, "learning_rate": 3.638698630136986e-05, "loss": 1.667, "step": 795 }, { "epoch": 0.2725444040231115, "grad_norm": 0.1444847732782364, "learning_rate": 3.636986301369863e-05, "loss": 1.599, "step": 796 }, { "epoch": 0.2728867964904772, "grad_norm": 0.17324218153953552, "learning_rate": 3.63527397260274e-05, "loss": 1.5957, "step": 797 }, { "epoch": 0.27322918895784293, "grad_norm": 0.16928315162658691, "learning_rate": 3.633561643835617e-05, "loss": 1.6549, "step": 798 }, { "epoch": 0.27357158142520865, "grad_norm": 0.16372160613536835, "learning_rate": 3.6318493150684934e-05, "loss": 1.7021, "step": 799 }, { "epoch": 0.2739139738925744, "grad_norm": 0.16876105964183807, "learning_rate": 3.63013698630137e-05, "loss": 1.6666, "step": 800 }, { "epoch": 0.2742563663599401, "grad_norm": 0.17495477199554443, "learning_rate": 3.628424657534247e-05, "loss": 1.6217, "step": 801 }, { "epoch": 0.2745987588273058, "grad_norm": 0.15437735617160797, "learning_rate": 3.6267123287671234e-05, "loss": 1.6052, "step": 802 }, { "epoch": 0.2749411512946715, "grad_norm": 0.15686242282390594, "learning_rate": 3.625e-05, "loss": 1.6343, "step": 803 }, { "epoch": 0.27528354376203723, "grad_norm": 0.16484378278255463, "learning_rate": 3.623287671232877e-05, "loss": 1.6474, "step": 804 }, { "epoch": 0.27562593622940296, "grad_norm": 0.16816413402557373, "learning_rate": 3.621575342465753e-05, "loss": 1.7258, "step": 805 }, { "epoch": 0.2759683286967687, "grad_norm": 0.15450787544250488, "learning_rate": 3.6198630136986307e-05, "loss": 1.66, "step": 806 }, { "epoch": 0.2763107211641344, "grad_norm": 0.1443817913532257, "learning_rate": 3.618150684931507e-05, "loss": 1.6004, "step": 807 }, { "epoch": 0.2766531136315001, "grad_norm": 0.15754206478595734, "learning_rate": 3.616438356164384e-05, "loss": 1.663, "step": 808 }, { "epoch": 0.2769955060988658, "grad_norm": 0.15453815460205078, "learning_rate": 3.6147260273972606e-05, "loss": 1.6691, "step": 809 }, { "epoch": 0.27733789856623153, "grad_norm": 0.17054972052574158, "learning_rate": 3.613013698630137e-05, "loss": 1.643, "step": 810 }, { "epoch": 0.27768029103359726, "grad_norm": 0.1717730015516281, "learning_rate": 3.611301369863014e-05, "loss": 1.6355, "step": 811 }, { "epoch": 0.278022683500963, "grad_norm": 0.17568092048168182, "learning_rate": 3.6095890410958906e-05, "loss": 1.6784, "step": 812 }, { "epoch": 0.2783650759683287, "grad_norm": 0.16582152247428894, "learning_rate": 3.607876712328767e-05, "loss": 1.6237, "step": 813 }, { "epoch": 0.27870746843569444, "grad_norm": 0.1489517241716385, "learning_rate": 3.606164383561644e-05, "loss": 1.736, "step": 814 }, { "epoch": 0.2790498609030601, "grad_norm": 0.1609610766172409, "learning_rate": 3.604452054794521e-05, "loss": 1.6663, "step": 815 }, { "epoch": 0.27939225337042584, "grad_norm": 0.15740936994552612, "learning_rate": 3.602739726027398e-05, "loss": 1.6791, "step": 816 }, { "epoch": 0.27973464583779156, "grad_norm": 0.16012388467788696, "learning_rate": 3.6010273972602745e-05, "loss": 1.6766, "step": 817 }, { "epoch": 0.2800770383051573, "grad_norm": 0.16326577961444855, "learning_rate": 3.599315068493151e-05, "loss": 1.6725, "step": 818 }, { "epoch": 0.280419430772523, "grad_norm": 0.18713147938251495, "learning_rate": 3.597602739726028e-05, "loss": 1.6743, "step": 819 }, { "epoch": 0.28076182323988874, "grad_norm": 0.1549314707517624, "learning_rate": 3.5958904109589045e-05, "loss": 1.6794, "step": 820 }, { "epoch": 0.28110421570725447, "grad_norm": 0.18216916918754578, "learning_rate": 3.594178082191781e-05, "loss": 1.6843, "step": 821 }, { "epoch": 0.28144660817462014, "grad_norm": 0.15582188963890076, "learning_rate": 3.592465753424658e-05, "loss": 1.6452, "step": 822 }, { "epoch": 0.28178900064198587, "grad_norm": 0.14773592352867126, "learning_rate": 3.5907534246575344e-05, "loss": 1.6265, "step": 823 }, { "epoch": 0.2821313931093516, "grad_norm": 0.16186419129371643, "learning_rate": 3.589041095890411e-05, "loss": 1.6347, "step": 824 }, { "epoch": 0.2824737855767173, "grad_norm": 0.1717757284641266, "learning_rate": 3.587328767123288e-05, "loss": 1.6958, "step": 825 }, { "epoch": 0.28281617804408304, "grad_norm": 0.15420295298099518, "learning_rate": 3.5856164383561644e-05, "loss": 1.6429, "step": 826 }, { "epoch": 0.28315857051144877, "grad_norm": 0.16095183789730072, "learning_rate": 3.583904109589041e-05, "loss": 1.6228, "step": 827 }, { "epoch": 0.28350096297881444, "grad_norm": 0.18052704632282257, "learning_rate": 3.582191780821918e-05, "loss": 1.6428, "step": 828 }, { "epoch": 0.28384335544618017, "grad_norm": 0.15483815968036652, "learning_rate": 3.580479452054794e-05, "loss": 1.634, "step": 829 }, { "epoch": 0.2841857479135459, "grad_norm": 0.17324478924274445, "learning_rate": 3.578767123287671e-05, "loss": 1.672, "step": 830 }, { "epoch": 0.2845281403809116, "grad_norm": 0.15134327113628387, "learning_rate": 3.5770547945205476e-05, "loss": 1.6508, "step": 831 }, { "epoch": 0.28487053284827735, "grad_norm": 0.16642558574676514, "learning_rate": 3.575342465753424e-05, "loss": 1.759, "step": 832 }, { "epoch": 0.2852129253156431, "grad_norm": 0.15847145020961761, "learning_rate": 3.5736301369863016e-05, "loss": 1.4854, "step": 833 }, { "epoch": 0.2855553177830088, "grad_norm": 0.1611274927854538, "learning_rate": 3.571917808219178e-05, "loss": 1.6169, "step": 834 }, { "epoch": 0.28589771025037447, "grad_norm": 0.17530043423175812, "learning_rate": 3.570205479452055e-05, "loss": 1.6159, "step": 835 }, { "epoch": 0.2862401027177402, "grad_norm": 0.1566336303949356, "learning_rate": 3.5684931506849316e-05, "loss": 1.5582, "step": 836 }, { "epoch": 0.2865824951851059, "grad_norm": 0.16092990338802338, "learning_rate": 3.566780821917808e-05, "loss": 1.6591, "step": 837 }, { "epoch": 0.28692488765247165, "grad_norm": 0.16046638786792755, "learning_rate": 3.565068493150685e-05, "loss": 1.5795, "step": 838 }, { "epoch": 0.2872672801198374, "grad_norm": 0.1690056324005127, "learning_rate": 3.5633561643835615e-05, "loss": 1.6783, "step": 839 }, { "epoch": 0.2876096725872031, "grad_norm": 0.18108516931533813, "learning_rate": 3.561643835616438e-05, "loss": 1.6212, "step": 840 }, { "epoch": 0.2879520650545688, "grad_norm": 0.16346172988414764, "learning_rate": 3.559931506849315e-05, "loss": 1.5678, "step": 841 }, { "epoch": 0.2882944575219345, "grad_norm": 0.1752222776412964, "learning_rate": 3.558219178082192e-05, "loss": 1.6415, "step": 842 }, { "epoch": 0.2886368499893002, "grad_norm": 0.15950724482536316, "learning_rate": 3.556506849315069e-05, "loss": 1.6472, "step": 843 }, { "epoch": 0.28897924245666595, "grad_norm": 0.16215822100639343, "learning_rate": 3.5547945205479455e-05, "loss": 1.6559, "step": 844 }, { "epoch": 0.2893216349240317, "grad_norm": 0.17085696756839752, "learning_rate": 3.553082191780822e-05, "loss": 1.7336, "step": 845 }, { "epoch": 0.2896640273913974, "grad_norm": 0.1822553128004074, "learning_rate": 3.551369863013699e-05, "loss": 1.6371, "step": 846 }, { "epoch": 0.29000641985876313, "grad_norm": 0.16732332110404968, "learning_rate": 3.5496575342465754e-05, "loss": 1.6186, "step": 847 }, { "epoch": 0.2903488123261288, "grad_norm": 0.16203734278678894, "learning_rate": 3.547945205479452e-05, "loss": 1.6518, "step": 848 }, { "epoch": 0.29069120479349453, "grad_norm": 0.16522228717803955, "learning_rate": 3.546232876712329e-05, "loss": 1.5956, "step": 849 }, { "epoch": 0.29103359726086026, "grad_norm": 0.16643472015857697, "learning_rate": 3.5445205479452054e-05, "loss": 1.5829, "step": 850 }, { "epoch": 0.291375989728226, "grad_norm": 0.15818536281585693, "learning_rate": 3.542808219178083e-05, "loss": 1.6363, "step": 851 }, { "epoch": 0.2917183821955917, "grad_norm": 0.17062436044216156, "learning_rate": 3.5410958904109594e-05, "loss": 1.6528, "step": 852 }, { "epoch": 0.29206077466295743, "grad_norm": 0.17131488025188446, "learning_rate": 3.539383561643836e-05, "loss": 1.5265, "step": 853 }, { "epoch": 0.29240316713032316, "grad_norm": 0.16368134319782257, "learning_rate": 3.537671232876713e-05, "loss": 1.6914, "step": 854 }, { "epoch": 0.29274555959768883, "grad_norm": 0.16427187621593475, "learning_rate": 3.535958904109589e-05, "loss": 1.6741, "step": 855 }, { "epoch": 0.29308795206505456, "grad_norm": 0.1650792509317398, "learning_rate": 3.534246575342466e-05, "loss": 1.5757, "step": 856 }, { "epoch": 0.2934303445324203, "grad_norm": 0.17051884531974792, "learning_rate": 3.5325342465753426e-05, "loss": 1.6512, "step": 857 }, { "epoch": 0.293772736999786, "grad_norm": 0.16706939041614532, "learning_rate": 3.530821917808219e-05, "loss": 1.7028, "step": 858 }, { "epoch": 0.29411512946715174, "grad_norm": 0.16190500557422638, "learning_rate": 3.529109589041096e-05, "loss": 1.5868, "step": 859 }, { "epoch": 0.29445752193451746, "grad_norm": 0.15885214507579803, "learning_rate": 3.527397260273973e-05, "loss": 1.7112, "step": 860 }, { "epoch": 0.29479991440188313, "grad_norm": 0.16328854858875275, "learning_rate": 3.52568493150685e-05, "loss": 1.6313, "step": 861 }, { "epoch": 0.29514230686924886, "grad_norm": 0.164979487657547, "learning_rate": 3.5239726027397266e-05, "loss": 1.6409, "step": 862 }, { "epoch": 0.2954846993366146, "grad_norm": 0.17458245158195496, "learning_rate": 3.522260273972603e-05, "loss": 1.6854, "step": 863 }, { "epoch": 0.2958270918039803, "grad_norm": 0.21510948240756989, "learning_rate": 3.52054794520548e-05, "loss": 1.6308, "step": 864 }, { "epoch": 0.29616948427134604, "grad_norm": 0.16104109585285187, "learning_rate": 3.5188356164383565e-05, "loss": 1.5827, "step": 865 }, { "epoch": 0.29651187673871177, "grad_norm": 0.1910904347896576, "learning_rate": 3.517123287671233e-05, "loss": 1.601, "step": 866 }, { "epoch": 0.2968542692060775, "grad_norm": 0.16963434219360352, "learning_rate": 3.51541095890411e-05, "loss": 1.6268, "step": 867 }, { "epoch": 0.29719666167344316, "grad_norm": 0.17644238471984863, "learning_rate": 3.5136986301369865e-05, "loss": 1.6594, "step": 868 }, { "epoch": 0.2975390541408089, "grad_norm": 0.15850773453712463, "learning_rate": 3.511986301369863e-05, "loss": 1.6441, "step": 869 }, { "epoch": 0.2978814466081746, "grad_norm": 0.16368292272090912, "learning_rate": 3.51027397260274e-05, "loss": 1.7522, "step": 870 }, { "epoch": 0.29822383907554034, "grad_norm": 0.18427345156669617, "learning_rate": 3.5085616438356164e-05, "loss": 1.7304, "step": 871 }, { "epoch": 0.29856623154290607, "grad_norm": 0.17519167065620422, "learning_rate": 3.506849315068493e-05, "loss": 1.627, "step": 872 }, { "epoch": 0.2989086240102718, "grad_norm": 0.17118513584136963, "learning_rate": 3.50513698630137e-05, "loss": 1.606, "step": 873 }, { "epoch": 0.29925101647763747, "grad_norm": 0.17029987275600433, "learning_rate": 3.5034246575342464e-05, "loss": 1.6869, "step": 874 }, { "epoch": 0.2995934089450032, "grad_norm": 0.15605244040489197, "learning_rate": 3.501712328767123e-05, "loss": 1.5985, "step": 875 }, { "epoch": 0.2999358014123689, "grad_norm": 0.185293048620224, "learning_rate": 3.5e-05, "loss": 1.5509, "step": 876 }, { "epoch": 0.30027819387973465, "grad_norm": 0.16742050647735596, "learning_rate": 3.498287671232877e-05, "loss": 1.7014, "step": 877 }, { "epoch": 0.30062058634710037, "grad_norm": 0.17309749126434326, "learning_rate": 3.496575342465754e-05, "loss": 1.6502, "step": 878 }, { "epoch": 0.3009629788144661, "grad_norm": 0.17015303671360016, "learning_rate": 3.49486301369863e-05, "loss": 1.6898, "step": 879 }, { "epoch": 0.3013053712818318, "grad_norm": 0.15660953521728516, "learning_rate": 3.493150684931507e-05, "loss": 1.6447, "step": 880 }, { "epoch": 0.3016477637491975, "grad_norm": 0.18243244290351868, "learning_rate": 3.4914383561643836e-05, "loss": 1.6197, "step": 881 }, { "epoch": 0.3019901562165632, "grad_norm": 0.16429270803928375, "learning_rate": 3.48972602739726e-05, "loss": 1.5624, "step": 882 }, { "epoch": 0.30233254868392895, "grad_norm": 0.15919050574302673, "learning_rate": 3.488013698630137e-05, "loss": 1.6693, "step": 883 }, { "epoch": 0.3026749411512947, "grad_norm": 0.16490940749645233, "learning_rate": 3.4863013698630136e-05, "loss": 1.6629, "step": 884 }, { "epoch": 0.3030173336186604, "grad_norm": 0.166195347905159, "learning_rate": 3.48458904109589e-05, "loss": 1.6113, "step": 885 }, { "epoch": 0.3033597260860261, "grad_norm": 0.18497268855571747, "learning_rate": 3.4828767123287676e-05, "loss": 1.6698, "step": 886 }, { "epoch": 0.3037021185533918, "grad_norm": 0.15267355740070343, "learning_rate": 3.481164383561644e-05, "loss": 1.6908, "step": 887 }, { "epoch": 0.3040445110207575, "grad_norm": 0.22043322026729584, "learning_rate": 3.479452054794521e-05, "loss": 1.5224, "step": 888 }, { "epoch": 0.30438690348812325, "grad_norm": 0.17637395858764648, "learning_rate": 3.4777397260273975e-05, "loss": 1.5672, "step": 889 }, { "epoch": 0.304729295955489, "grad_norm": 0.17742246389389038, "learning_rate": 3.476027397260274e-05, "loss": 1.6122, "step": 890 }, { "epoch": 0.3050716884228547, "grad_norm": 0.16961202025413513, "learning_rate": 3.474315068493151e-05, "loss": 1.6908, "step": 891 }, { "epoch": 0.30541408089022043, "grad_norm": 0.19535768032073975, "learning_rate": 3.4726027397260275e-05, "loss": 1.7841, "step": 892 }, { "epoch": 0.30575647335758616, "grad_norm": 0.4545200765132904, "learning_rate": 3.470890410958904e-05, "loss": 1.7079, "step": 893 }, { "epoch": 0.3060988658249518, "grad_norm": 0.16562189161777496, "learning_rate": 3.469178082191781e-05, "loss": 1.4545, "step": 894 }, { "epoch": 0.30644125829231755, "grad_norm": 0.1766722947359085, "learning_rate": 3.467465753424658e-05, "loss": 1.6122, "step": 895 }, { "epoch": 0.3067836507596833, "grad_norm": 0.18913449347019196, "learning_rate": 3.465753424657535e-05, "loss": 1.5727, "step": 896 }, { "epoch": 0.307126043227049, "grad_norm": 0.16980838775634766, "learning_rate": 3.4640410958904114e-05, "loss": 1.6259, "step": 897 }, { "epoch": 0.30746843569441473, "grad_norm": 0.1733735203742981, "learning_rate": 3.462328767123288e-05, "loss": 1.5581, "step": 898 }, { "epoch": 0.30781082816178046, "grad_norm": 0.17580045759677887, "learning_rate": 3.460616438356165e-05, "loss": 1.6937, "step": 899 }, { "epoch": 0.3081532206291462, "grad_norm": 0.1636032909154892, "learning_rate": 3.4589041095890414e-05, "loss": 1.6718, "step": 900 }, { "epoch": 0.30849561309651186, "grad_norm": 0.16598966717720032, "learning_rate": 3.457191780821918e-05, "loss": 1.5685, "step": 901 }, { "epoch": 0.3088380055638776, "grad_norm": 0.18820227682590485, "learning_rate": 3.455479452054795e-05, "loss": 1.6241, "step": 902 }, { "epoch": 0.3091803980312433, "grad_norm": 0.18153543770313263, "learning_rate": 3.453767123287671e-05, "loss": 1.7585, "step": 903 }, { "epoch": 0.30952279049860904, "grad_norm": 0.16583392024040222, "learning_rate": 3.452054794520549e-05, "loss": 1.6138, "step": 904 }, { "epoch": 0.30986518296597476, "grad_norm": 0.17588374018669128, "learning_rate": 3.450342465753425e-05, "loss": 1.6014, "step": 905 }, { "epoch": 0.3102075754333405, "grad_norm": 0.15407659113407135, "learning_rate": 3.448630136986302e-05, "loss": 1.5715, "step": 906 }, { "epoch": 0.31054996790070616, "grad_norm": 0.1713164895772934, "learning_rate": 3.4469178082191786e-05, "loss": 1.6921, "step": 907 }, { "epoch": 0.3108923603680719, "grad_norm": 0.17164406180381775, "learning_rate": 3.445205479452055e-05, "loss": 1.5943, "step": 908 }, { "epoch": 0.3112347528354376, "grad_norm": 0.1643984615802765, "learning_rate": 3.443493150684932e-05, "loss": 1.6432, "step": 909 }, { "epoch": 0.31157714530280334, "grad_norm": 0.17826248705387115, "learning_rate": 3.4417808219178086e-05, "loss": 1.644, "step": 910 }, { "epoch": 0.31191953777016906, "grad_norm": 0.1765701174736023, "learning_rate": 3.440068493150685e-05, "loss": 1.5619, "step": 911 }, { "epoch": 0.3122619302375348, "grad_norm": 0.17043675482273102, "learning_rate": 3.438356164383562e-05, "loss": 1.6581, "step": 912 }, { "epoch": 0.3126043227049005, "grad_norm": 0.16664226353168488, "learning_rate": 3.4366438356164385e-05, "loss": 1.5752, "step": 913 }, { "epoch": 0.3129467151722662, "grad_norm": 0.16119621694087982, "learning_rate": 3.434931506849315e-05, "loss": 1.6952, "step": 914 }, { "epoch": 0.3132891076396319, "grad_norm": 0.166438490152359, "learning_rate": 3.433219178082192e-05, "loss": 1.5508, "step": 915 }, { "epoch": 0.31363150010699764, "grad_norm": 0.15950678288936615, "learning_rate": 3.4315068493150685e-05, "loss": 1.6031, "step": 916 }, { "epoch": 0.31397389257436337, "grad_norm": 0.16010546684265137, "learning_rate": 3.429794520547945e-05, "loss": 1.6059, "step": 917 }, { "epoch": 0.3143162850417291, "grad_norm": 0.15263736248016357, "learning_rate": 3.428082191780822e-05, "loss": 1.6068, "step": 918 }, { "epoch": 0.3146586775090948, "grad_norm": 0.1636127531528473, "learning_rate": 3.4263698630136984e-05, "loss": 1.5813, "step": 919 }, { "epoch": 0.3150010699764605, "grad_norm": 0.17480973899364471, "learning_rate": 3.424657534246575e-05, "loss": 1.6271, "step": 920 }, { "epoch": 0.3153434624438262, "grad_norm": 0.1749420017004013, "learning_rate": 3.422945205479452e-05, "loss": 1.6368, "step": 921 }, { "epoch": 0.31568585491119194, "grad_norm": 0.17961269617080688, "learning_rate": 3.421232876712329e-05, "loss": 1.647, "step": 922 }, { "epoch": 0.31602824737855767, "grad_norm": 0.195336252450943, "learning_rate": 3.419520547945206e-05, "loss": 1.6588, "step": 923 }, { "epoch": 0.3163706398459234, "grad_norm": 0.16071833670139313, "learning_rate": 3.4178082191780824e-05, "loss": 1.5992, "step": 924 }, { "epoch": 0.3167130323132891, "grad_norm": 0.1687297374010086, "learning_rate": 3.416095890410959e-05, "loss": 1.5977, "step": 925 }, { "epoch": 0.31705542478065485, "grad_norm": 0.17823044955730438, "learning_rate": 3.414383561643836e-05, "loss": 1.604, "step": 926 }, { "epoch": 0.3173978172480205, "grad_norm": 0.1938398778438568, "learning_rate": 3.4126712328767123e-05, "loss": 1.6476, "step": 927 }, { "epoch": 0.31774020971538625, "grad_norm": 0.17029941082000732, "learning_rate": 3.410958904109589e-05, "loss": 1.6591, "step": 928 }, { "epoch": 0.318082602182752, "grad_norm": 0.18597517907619476, "learning_rate": 3.4092465753424656e-05, "loss": 1.5488, "step": 929 }, { "epoch": 0.3184249946501177, "grad_norm": 0.18103069067001343, "learning_rate": 3.407534246575342e-05, "loss": 1.6785, "step": 930 }, { "epoch": 0.3187673871174834, "grad_norm": 0.16627401113510132, "learning_rate": 3.4058219178082196e-05, "loss": 1.6454, "step": 931 }, { "epoch": 0.31910977958484915, "grad_norm": 0.18410666286945343, "learning_rate": 3.404109589041096e-05, "loss": 1.5841, "step": 932 }, { "epoch": 0.3194521720522149, "grad_norm": 0.1871345490217209, "learning_rate": 3.402397260273973e-05, "loss": 1.6676, "step": 933 }, { "epoch": 0.31979456451958055, "grad_norm": 0.171336367726326, "learning_rate": 3.4006849315068496e-05, "loss": 1.5697, "step": 934 }, { "epoch": 0.3201369569869463, "grad_norm": 0.17904919385910034, "learning_rate": 3.398972602739726e-05, "loss": 1.5471, "step": 935 }, { "epoch": 0.320479349454312, "grad_norm": 0.19725586473941803, "learning_rate": 3.397260273972603e-05, "loss": 1.6482, "step": 936 }, { "epoch": 0.3208217419216777, "grad_norm": 0.1848667562007904, "learning_rate": 3.3955479452054795e-05, "loss": 1.6163, "step": 937 }, { "epoch": 0.32116413438904345, "grad_norm": 0.17741495370864868, "learning_rate": 3.393835616438356e-05, "loss": 1.6684, "step": 938 }, { "epoch": 0.3215065268564092, "grad_norm": 0.15760120749473572, "learning_rate": 3.392123287671233e-05, "loss": 1.6255, "step": 939 }, { "epoch": 0.32184891932377485, "grad_norm": 0.16017502546310425, "learning_rate": 3.39041095890411e-05, "loss": 1.5483, "step": 940 }, { "epoch": 0.3221913117911406, "grad_norm": 0.17628340423107147, "learning_rate": 3.388698630136987e-05, "loss": 1.7526, "step": 941 }, { "epoch": 0.3225337042585063, "grad_norm": 0.1638764888048172, "learning_rate": 3.3869863013698635e-05, "loss": 1.577, "step": 942 }, { "epoch": 0.32287609672587203, "grad_norm": 0.1960722804069519, "learning_rate": 3.38527397260274e-05, "loss": 1.7142, "step": 943 }, { "epoch": 0.32321848919323776, "grad_norm": 0.1729043871164322, "learning_rate": 3.383561643835617e-05, "loss": 1.7376, "step": 944 }, { "epoch": 0.3235608816606035, "grad_norm": 0.17143891751766205, "learning_rate": 3.3818493150684934e-05, "loss": 1.6186, "step": 945 }, { "epoch": 0.3239032741279692, "grad_norm": 0.1813240945339203, "learning_rate": 3.38013698630137e-05, "loss": 1.6501, "step": 946 }, { "epoch": 0.3242456665953349, "grad_norm": 0.17511692643165588, "learning_rate": 3.378424657534247e-05, "loss": 1.5789, "step": 947 }, { "epoch": 0.3245880590627006, "grad_norm": 0.19227832555770874, "learning_rate": 3.3767123287671234e-05, "loss": 1.658, "step": 948 }, { "epoch": 0.32493045153006633, "grad_norm": 0.17649643123149872, "learning_rate": 3.375000000000001e-05, "loss": 1.6257, "step": 949 }, { "epoch": 0.32527284399743206, "grad_norm": 0.1958049237728119, "learning_rate": 3.373287671232877e-05, "loss": 1.6928, "step": 950 }, { "epoch": 0.3256152364647978, "grad_norm": 0.17958573997020721, "learning_rate": 3.3715753424657533e-05, "loss": 1.7384, "step": 951 }, { "epoch": 0.3259576289321635, "grad_norm": 0.1755129098892212, "learning_rate": 3.36986301369863e-05, "loss": 1.5691, "step": 952 }, { "epoch": 0.3263000213995292, "grad_norm": 0.17765860259532928, "learning_rate": 3.3681506849315067e-05, "loss": 1.6033, "step": 953 }, { "epoch": 0.3266424138668949, "grad_norm": 0.176289364695549, "learning_rate": 3.366438356164383e-05, "loss": 1.5646, "step": 954 }, { "epoch": 0.32698480633426064, "grad_norm": 0.17729108035564423, "learning_rate": 3.36472602739726e-05, "loss": 1.581, "step": 955 }, { "epoch": 0.32732719880162636, "grad_norm": 0.16288909316062927, "learning_rate": 3.3630136986301366e-05, "loss": 1.6407, "step": 956 }, { "epoch": 0.3276695912689921, "grad_norm": 0.20803111791610718, "learning_rate": 3.361301369863013e-05, "loss": 1.6333, "step": 957 }, { "epoch": 0.3280119837363578, "grad_norm": 0.17950700223445892, "learning_rate": 3.3595890410958906e-05, "loss": 1.5612, "step": 958 }, { "epoch": 0.32835437620372354, "grad_norm": 0.1893889605998993, "learning_rate": 3.357876712328767e-05, "loss": 1.6719, "step": 959 }, { "epoch": 0.3286967686710892, "grad_norm": 0.1749144345521927, "learning_rate": 3.356164383561644e-05, "loss": 1.6305, "step": 960 }, { "epoch": 0.32903916113845494, "grad_norm": 0.1672433763742447, "learning_rate": 3.3544520547945205e-05, "loss": 1.5187, "step": 961 }, { "epoch": 0.32938155360582066, "grad_norm": 0.17233562469482422, "learning_rate": 3.352739726027397e-05, "loss": 1.6141, "step": 962 }, { "epoch": 0.3297239460731864, "grad_norm": 0.17582817375659943, "learning_rate": 3.351027397260274e-05, "loss": 1.6457, "step": 963 }, { "epoch": 0.3300663385405521, "grad_norm": 0.17763985693454742, "learning_rate": 3.3493150684931505e-05, "loss": 1.5629, "step": 964 }, { "epoch": 0.33040873100791784, "grad_norm": 0.17102843523025513, "learning_rate": 3.347602739726027e-05, "loss": 1.6601, "step": 965 }, { "epoch": 0.33075112347528357, "grad_norm": 0.1864311844110489, "learning_rate": 3.3458904109589045e-05, "loss": 1.6802, "step": 966 }, { "epoch": 0.33109351594264924, "grad_norm": 0.1765185445547104, "learning_rate": 3.344178082191781e-05, "loss": 1.6723, "step": 967 }, { "epoch": 0.33143590841001497, "grad_norm": 0.18400153517723083, "learning_rate": 3.342465753424658e-05, "loss": 1.7019, "step": 968 }, { "epoch": 0.3317783008773807, "grad_norm": 0.1878415197134018, "learning_rate": 3.3407534246575344e-05, "loss": 1.6257, "step": 969 }, { "epoch": 0.3321206933447464, "grad_norm": 0.18473023176193237, "learning_rate": 3.339041095890411e-05, "loss": 1.6455, "step": 970 }, { "epoch": 0.33246308581211215, "grad_norm": 0.17929363250732422, "learning_rate": 3.337328767123288e-05, "loss": 1.6268, "step": 971 }, { "epoch": 0.3328054782794779, "grad_norm": 0.1841028779745102, "learning_rate": 3.3356164383561644e-05, "loss": 1.6492, "step": 972 }, { "epoch": 0.33314787074684354, "grad_norm": 0.18151289224624634, "learning_rate": 3.333904109589041e-05, "loss": 1.6191, "step": 973 }, { "epoch": 0.33349026321420927, "grad_norm": 0.19639885425567627, "learning_rate": 3.332191780821918e-05, "loss": 1.5356, "step": 974 }, { "epoch": 0.333832655681575, "grad_norm": 0.17961947619915009, "learning_rate": 3.330479452054795e-05, "loss": 1.5693, "step": 975 }, { "epoch": 0.3341750481489407, "grad_norm": 0.16966471076011658, "learning_rate": 3.328767123287672e-05, "loss": 1.5991, "step": 976 }, { "epoch": 0.33451744061630645, "grad_norm": 0.20656125247478485, "learning_rate": 3.327054794520548e-05, "loss": 1.6618, "step": 977 }, { "epoch": 0.3348598330836722, "grad_norm": 0.22313959896564484, "learning_rate": 3.325342465753425e-05, "loss": 1.6393, "step": 978 }, { "epoch": 0.3352022255510379, "grad_norm": 0.18152087926864624, "learning_rate": 3.3236301369863016e-05, "loss": 1.5673, "step": 979 }, { "epoch": 0.3355446180184036, "grad_norm": 0.17333132028579712, "learning_rate": 3.321917808219178e-05, "loss": 1.5818, "step": 980 }, { "epoch": 0.3358870104857693, "grad_norm": 0.19985678791999817, "learning_rate": 3.320205479452055e-05, "loss": 1.6588, "step": 981 }, { "epoch": 0.336229402953135, "grad_norm": 0.18973921239376068, "learning_rate": 3.3184931506849316e-05, "loss": 1.6705, "step": 982 }, { "epoch": 0.33657179542050075, "grad_norm": 0.1923156976699829, "learning_rate": 3.316780821917808e-05, "loss": 1.6799, "step": 983 }, { "epoch": 0.3369141878878665, "grad_norm": 0.17295393347740173, "learning_rate": 3.3150684931506856e-05, "loss": 1.5427, "step": 984 }, { "epoch": 0.3372565803552322, "grad_norm": 0.18471597135066986, "learning_rate": 3.313356164383562e-05, "loss": 1.6698, "step": 985 }, { "epoch": 0.3375989728225979, "grad_norm": 0.1872127503156662, "learning_rate": 3.311643835616439e-05, "loss": 1.6712, "step": 986 }, { "epoch": 0.3379413652899636, "grad_norm": 0.19074855744838715, "learning_rate": 3.3099315068493155e-05, "loss": 1.5375, "step": 987 }, { "epoch": 0.33828375775732933, "grad_norm": 0.19393405318260193, "learning_rate": 3.308219178082192e-05, "loss": 1.723, "step": 988 }, { "epoch": 0.33862615022469505, "grad_norm": 0.15910933911800385, "learning_rate": 3.306506849315069e-05, "loss": 1.5818, "step": 989 }, { "epoch": 0.3389685426920608, "grad_norm": 0.18083134293556213, "learning_rate": 3.3047945205479455e-05, "loss": 1.666, "step": 990 }, { "epoch": 0.3393109351594265, "grad_norm": 0.17913112044334412, "learning_rate": 3.303082191780822e-05, "loss": 1.5627, "step": 991 }, { "epoch": 0.33965332762679223, "grad_norm": 0.2048608511686325, "learning_rate": 3.301369863013699e-05, "loss": 1.6692, "step": 992 }, { "epoch": 0.3399957200941579, "grad_norm": 0.19195561110973358, "learning_rate": 3.2996575342465754e-05, "loss": 1.6032, "step": 993 }, { "epoch": 0.34033811256152363, "grad_norm": 0.1805218756198883, "learning_rate": 3.297945205479452e-05, "loss": 1.6804, "step": 994 }, { "epoch": 0.34068050502888936, "grad_norm": 0.1885395646095276, "learning_rate": 3.296232876712329e-05, "loss": 1.7198, "step": 995 }, { "epoch": 0.3410228974962551, "grad_norm": 0.17789386212825775, "learning_rate": 3.2945205479452054e-05, "loss": 1.6576, "step": 996 }, { "epoch": 0.3413652899636208, "grad_norm": 0.17843040823936462, "learning_rate": 3.292808219178082e-05, "loss": 1.6418, "step": 997 }, { "epoch": 0.34170768243098654, "grad_norm": 0.17257437109947205, "learning_rate": 3.291095890410959e-05, "loss": 1.6192, "step": 998 }, { "epoch": 0.34205007489835226, "grad_norm": 0.20164641737937927, "learning_rate": 3.2893835616438354e-05, "loss": 1.6241, "step": 999 }, { "epoch": 0.34239246736571793, "grad_norm": 0.2016402930021286, "learning_rate": 3.287671232876712e-05, "loss": 1.6303, "step": 1000 }, { "epoch": 0.34273485983308366, "grad_norm": 0.18253904581069946, "learning_rate": 3.285958904109589e-05, "loss": 1.5431, "step": 1001 }, { "epoch": 0.3430772523004494, "grad_norm": 0.19140085577964783, "learning_rate": 3.284246575342466e-05, "loss": 1.6362, "step": 1002 }, { "epoch": 0.3434196447678151, "grad_norm": 0.18327048420906067, "learning_rate": 3.2825342465753426e-05, "loss": 1.59, "step": 1003 }, { "epoch": 0.34376203723518084, "grad_norm": 0.18790295720100403, "learning_rate": 3.280821917808219e-05, "loss": 1.6488, "step": 1004 }, { "epoch": 0.34410442970254657, "grad_norm": 0.2139243334531784, "learning_rate": 3.279109589041096e-05, "loss": 1.712, "step": 1005 }, { "epoch": 0.34444682216991224, "grad_norm": 0.17952506244182587, "learning_rate": 3.2773972602739726e-05, "loss": 1.6839, "step": 1006 }, { "epoch": 0.34478921463727796, "grad_norm": 0.1799176186323166, "learning_rate": 3.275684931506849e-05, "loss": 1.577, "step": 1007 }, { "epoch": 0.3451316071046437, "grad_norm": 0.18421360850334167, "learning_rate": 3.273972602739726e-05, "loss": 1.5576, "step": 1008 }, { "epoch": 0.3454739995720094, "grad_norm": 0.18500995635986328, "learning_rate": 3.2722602739726026e-05, "loss": 1.6503, "step": 1009 }, { "epoch": 0.34581639203937514, "grad_norm": 0.20635172724723816, "learning_rate": 3.270547945205479e-05, "loss": 1.603, "step": 1010 }, { "epoch": 0.34615878450674087, "grad_norm": 0.18448132276535034, "learning_rate": 3.2688356164383565e-05, "loss": 1.6423, "step": 1011 }, { "epoch": 0.3465011769741066, "grad_norm": 0.1843699961900711, "learning_rate": 3.267123287671233e-05, "loss": 1.6883, "step": 1012 }, { "epoch": 0.34684356944147227, "grad_norm": 0.20068658888339996, "learning_rate": 3.26541095890411e-05, "loss": 1.6445, "step": 1013 }, { "epoch": 0.347185961908838, "grad_norm": 0.1761769950389862, "learning_rate": 3.2636986301369865e-05, "loss": 1.654, "step": 1014 }, { "epoch": 0.3475283543762037, "grad_norm": 0.27954667806625366, "learning_rate": 3.261986301369863e-05, "loss": 1.5994, "step": 1015 }, { "epoch": 0.34787074684356944, "grad_norm": 0.1864766776561737, "learning_rate": 3.26027397260274e-05, "loss": 1.6855, "step": 1016 }, { "epoch": 0.34821313931093517, "grad_norm": 0.1818721443414688, "learning_rate": 3.2585616438356165e-05, "loss": 1.5951, "step": 1017 }, { "epoch": 0.3485555317783009, "grad_norm": 0.2109932154417038, "learning_rate": 3.256849315068493e-05, "loss": 1.7142, "step": 1018 }, { "epoch": 0.34889792424566657, "grad_norm": 0.1821468472480774, "learning_rate": 3.25513698630137e-05, "loss": 1.6201, "step": 1019 }, { "epoch": 0.3492403167130323, "grad_norm": 0.17200210690498352, "learning_rate": 3.253424657534247e-05, "loss": 1.6526, "step": 1020 }, { "epoch": 0.349582709180398, "grad_norm": 0.18340782821178436, "learning_rate": 3.251712328767124e-05, "loss": 1.5548, "step": 1021 }, { "epoch": 0.34992510164776375, "grad_norm": 0.1810046285390854, "learning_rate": 3.2500000000000004e-05, "loss": 1.5305, "step": 1022 }, { "epoch": 0.3502674941151295, "grad_norm": 0.1688162088394165, "learning_rate": 3.248287671232877e-05, "loss": 1.5816, "step": 1023 }, { "epoch": 0.3506098865824952, "grad_norm": 0.1735886186361313, "learning_rate": 3.246575342465754e-05, "loss": 1.5962, "step": 1024 }, { "epoch": 0.3509522790498609, "grad_norm": 0.18536169826984406, "learning_rate": 3.2448630136986303e-05, "loss": 1.6776, "step": 1025 }, { "epoch": 0.3512946715172266, "grad_norm": 0.19295240938663483, "learning_rate": 3.243150684931507e-05, "loss": 1.6122, "step": 1026 }, { "epoch": 0.3516370639845923, "grad_norm": 0.1920599341392517, "learning_rate": 3.2414383561643837e-05, "loss": 1.6262, "step": 1027 }, { "epoch": 0.35197945645195805, "grad_norm": 0.17934128642082214, "learning_rate": 3.23972602739726e-05, "loss": 1.6347, "step": 1028 }, { "epoch": 0.3523218489193238, "grad_norm": 0.17396144568920135, "learning_rate": 3.2380136986301376e-05, "loss": 1.6676, "step": 1029 }, { "epoch": 0.3526642413866895, "grad_norm": 0.1750507652759552, "learning_rate": 3.236301369863014e-05, "loss": 1.5825, "step": 1030 }, { "epoch": 0.35300663385405523, "grad_norm": 0.19876091182231903, "learning_rate": 3.234589041095891e-05, "loss": 1.6014, "step": 1031 }, { "epoch": 0.35334902632142096, "grad_norm": 0.18725259602069855, "learning_rate": 3.2328767123287676e-05, "loss": 1.6815, "step": 1032 }, { "epoch": 0.3536914187887866, "grad_norm": 0.18709374964237213, "learning_rate": 3.231164383561644e-05, "loss": 1.6898, "step": 1033 }, { "epoch": 0.35403381125615235, "grad_norm": 0.18304820358753204, "learning_rate": 3.229452054794521e-05, "loss": 1.5685, "step": 1034 }, { "epoch": 0.3543762037235181, "grad_norm": 0.186636820435524, "learning_rate": 3.2277397260273975e-05, "loss": 1.699, "step": 1035 }, { "epoch": 0.3547185961908838, "grad_norm": 0.17933103442192078, "learning_rate": 3.226027397260274e-05, "loss": 1.6093, "step": 1036 }, { "epoch": 0.35506098865824953, "grad_norm": 0.18076711893081665, "learning_rate": 3.224315068493151e-05, "loss": 1.6928, "step": 1037 }, { "epoch": 0.35540338112561526, "grad_norm": 0.18083003163337708, "learning_rate": 3.2226027397260275e-05, "loss": 1.6655, "step": 1038 }, { "epoch": 0.35574577359298093, "grad_norm": 0.21529877185821533, "learning_rate": 3.220890410958904e-05, "loss": 1.5586, "step": 1039 }, { "epoch": 0.35608816606034666, "grad_norm": 0.17650671303272247, "learning_rate": 3.219178082191781e-05, "loss": 1.5948, "step": 1040 }, { "epoch": 0.3564305585277124, "grad_norm": 0.170661062002182, "learning_rate": 3.2174657534246575e-05, "loss": 1.6147, "step": 1041 }, { "epoch": 0.3567729509950781, "grad_norm": 0.19031096994876862, "learning_rate": 3.215753424657534e-05, "loss": 1.6156, "step": 1042 }, { "epoch": 0.35711534346244383, "grad_norm": 0.1843324601650238, "learning_rate": 3.214041095890411e-05, "loss": 1.6474, "step": 1043 }, { "epoch": 0.35745773592980956, "grad_norm": 0.19299174845218658, "learning_rate": 3.2123287671232874e-05, "loss": 1.6692, "step": 1044 }, { "epoch": 0.3578001283971753, "grad_norm": 0.18297499418258667, "learning_rate": 3.210616438356164e-05, "loss": 1.6732, "step": 1045 }, { "epoch": 0.35814252086454096, "grad_norm": 0.18311627209186554, "learning_rate": 3.208904109589041e-05, "loss": 1.6295, "step": 1046 }, { "epoch": 0.3584849133319067, "grad_norm": 0.1834397315979004, "learning_rate": 3.207191780821918e-05, "loss": 1.5278, "step": 1047 }, { "epoch": 0.3588273057992724, "grad_norm": 0.1803441345691681, "learning_rate": 3.205479452054795e-05, "loss": 1.6537, "step": 1048 }, { "epoch": 0.35916969826663814, "grad_norm": 0.19654768705368042, "learning_rate": 3.2037671232876714e-05, "loss": 1.6192, "step": 1049 }, { "epoch": 0.35951209073400386, "grad_norm": 0.19090498983860016, "learning_rate": 3.202054794520548e-05, "loss": 1.5593, "step": 1050 }, { "epoch": 0.3598544832013696, "grad_norm": 0.19946853816509247, "learning_rate": 3.2003424657534247e-05, "loss": 1.6443, "step": 1051 }, { "epoch": 0.36019687566873526, "grad_norm": 0.1901293247938156, "learning_rate": 3.198630136986301e-05, "loss": 1.6306, "step": 1052 }, { "epoch": 0.360539268136101, "grad_norm": 0.17871630191802979, "learning_rate": 3.196917808219178e-05, "loss": 1.5884, "step": 1053 }, { "epoch": 0.3608816606034667, "grad_norm": 0.18092705309391022, "learning_rate": 3.1952054794520546e-05, "loss": 1.6805, "step": 1054 }, { "epoch": 0.36122405307083244, "grad_norm": 0.1813168078660965, "learning_rate": 3.193493150684932e-05, "loss": 1.6573, "step": 1055 }, { "epoch": 0.36156644553819817, "grad_norm": 0.17984431982040405, "learning_rate": 3.1917808219178086e-05, "loss": 1.6241, "step": 1056 }, { "epoch": 0.3619088380055639, "grad_norm": 0.25782960653305054, "learning_rate": 3.190068493150685e-05, "loss": 1.4726, "step": 1057 }, { "epoch": 0.3622512304729296, "grad_norm": 0.19268107414245605, "learning_rate": 3.188356164383562e-05, "loss": 1.571, "step": 1058 }, { "epoch": 0.3625936229402953, "grad_norm": 0.18220491707324982, "learning_rate": 3.1866438356164386e-05, "loss": 1.6121, "step": 1059 }, { "epoch": 0.362936015407661, "grad_norm": 0.18143941462039948, "learning_rate": 3.184931506849315e-05, "loss": 1.6104, "step": 1060 }, { "epoch": 0.36327840787502674, "grad_norm": 0.23851390182971954, "learning_rate": 3.183219178082192e-05, "loss": 1.5893, "step": 1061 }, { "epoch": 0.36362080034239247, "grad_norm": 0.18399028480052948, "learning_rate": 3.1815068493150685e-05, "loss": 1.6988, "step": 1062 }, { "epoch": 0.3639631928097582, "grad_norm": 0.18836073577404022, "learning_rate": 3.179794520547945e-05, "loss": 1.658, "step": 1063 }, { "epoch": 0.3643055852771239, "grad_norm": 0.17907658219337463, "learning_rate": 3.1780821917808225e-05, "loss": 1.5348, "step": 1064 }, { "epoch": 0.36464797774448965, "grad_norm": 0.19295437633991241, "learning_rate": 3.176369863013699e-05, "loss": 1.6377, "step": 1065 }, { "epoch": 0.3649903702118553, "grad_norm": 0.19009754061698914, "learning_rate": 3.174657534246576e-05, "loss": 1.606, "step": 1066 }, { "epoch": 0.36533276267922105, "grad_norm": 0.2041563093662262, "learning_rate": 3.1729452054794524e-05, "loss": 1.6393, "step": 1067 }, { "epoch": 0.36567515514658677, "grad_norm": 0.18799561262130737, "learning_rate": 3.171232876712329e-05, "loss": 1.6491, "step": 1068 }, { "epoch": 0.3660175476139525, "grad_norm": 0.20276644825935364, "learning_rate": 3.169520547945206e-05, "loss": 1.5538, "step": 1069 }, { "epoch": 0.3663599400813182, "grad_norm": 0.20255041122436523, "learning_rate": 3.1678082191780824e-05, "loss": 1.6593, "step": 1070 }, { "epoch": 0.36670233254868395, "grad_norm": 0.19274236261844635, "learning_rate": 3.166095890410959e-05, "loss": 1.677, "step": 1071 }, { "epoch": 0.3670447250160496, "grad_norm": 0.18202699720859528, "learning_rate": 3.164383561643836e-05, "loss": 1.5755, "step": 1072 }, { "epoch": 0.36738711748341535, "grad_norm": 0.19245032966136932, "learning_rate": 3.162671232876713e-05, "loss": 1.698, "step": 1073 }, { "epoch": 0.3677295099507811, "grad_norm": 0.19466449320316315, "learning_rate": 3.16095890410959e-05, "loss": 1.6998, "step": 1074 }, { "epoch": 0.3680719024181468, "grad_norm": 0.18528608977794647, "learning_rate": 3.1592465753424663e-05, "loss": 1.6303, "step": 1075 }, { "epoch": 0.3684142948855125, "grad_norm": 0.19063135981559753, "learning_rate": 3.157534246575343e-05, "loss": 1.4672, "step": 1076 }, { "epoch": 0.36875668735287825, "grad_norm": 0.1875833421945572, "learning_rate": 3.1558219178082196e-05, "loss": 1.6827, "step": 1077 }, { "epoch": 0.369099079820244, "grad_norm": 0.20154210925102234, "learning_rate": 3.154109589041096e-05, "loss": 1.5953, "step": 1078 }, { "epoch": 0.36944147228760965, "grad_norm": 0.1844032108783722, "learning_rate": 3.152397260273973e-05, "loss": 1.6036, "step": 1079 }, { "epoch": 0.3697838647549754, "grad_norm": 0.19859765470027924, "learning_rate": 3.1506849315068496e-05, "loss": 1.5341, "step": 1080 }, { "epoch": 0.3701262572223411, "grad_norm": 0.16925029456615448, "learning_rate": 3.148972602739726e-05, "loss": 1.5131, "step": 1081 }, { "epoch": 0.37046864968970683, "grad_norm": 0.19726942479610443, "learning_rate": 3.147260273972603e-05, "loss": 1.7153, "step": 1082 }, { "epoch": 0.37081104215707256, "grad_norm": 0.18619368970394135, "learning_rate": 3.1455479452054796e-05, "loss": 1.6423, "step": 1083 }, { "epoch": 0.3711534346244383, "grad_norm": 0.21111862361431122, "learning_rate": 3.143835616438356e-05, "loss": 1.6267, "step": 1084 }, { "epoch": 0.37149582709180395, "grad_norm": 0.20075955986976624, "learning_rate": 3.142123287671233e-05, "loss": 1.5509, "step": 1085 }, { "epoch": 0.3718382195591697, "grad_norm": 0.20781105756759644, "learning_rate": 3.1404109589041095e-05, "loss": 1.6052, "step": 1086 }, { "epoch": 0.3721806120265354, "grad_norm": 0.19642221927642822, "learning_rate": 3.138698630136986e-05, "loss": 1.5965, "step": 1087 }, { "epoch": 0.37252300449390113, "grad_norm": 0.19483277201652527, "learning_rate": 3.136986301369863e-05, "loss": 1.5923, "step": 1088 }, { "epoch": 0.37286539696126686, "grad_norm": 0.20478741824626923, "learning_rate": 3.1352739726027395e-05, "loss": 1.6218, "step": 1089 }, { "epoch": 0.3732077894286326, "grad_norm": 0.18183819949626923, "learning_rate": 3.133561643835616e-05, "loss": 1.5618, "step": 1090 }, { "epoch": 0.3735501818959983, "grad_norm": 0.17204350233078003, "learning_rate": 3.1318493150684935e-05, "loss": 1.6502, "step": 1091 }, { "epoch": 0.373892574363364, "grad_norm": 0.1908375769853592, "learning_rate": 3.13013698630137e-05, "loss": 1.7973, "step": 1092 }, { "epoch": 0.3742349668307297, "grad_norm": 0.18833976984024048, "learning_rate": 3.128424657534247e-05, "loss": 1.6183, "step": 1093 }, { "epoch": 0.37457735929809544, "grad_norm": 0.17543967068195343, "learning_rate": 3.1267123287671234e-05, "loss": 1.5735, "step": 1094 }, { "epoch": 0.37491975176546116, "grad_norm": 0.18235619366168976, "learning_rate": 3.125e-05, "loss": 1.7562, "step": 1095 }, { "epoch": 0.3752621442328269, "grad_norm": 0.1871318370103836, "learning_rate": 3.123287671232877e-05, "loss": 1.5954, "step": 1096 }, { "epoch": 0.3756045367001926, "grad_norm": 0.18809878826141357, "learning_rate": 3.1215753424657534e-05, "loss": 1.6308, "step": 1097 }, { "epoch": 0.37594692916755834, "grad_norm": 0.20508448779582977, "learning_rate": 3.11986301369863e-05, "loss": 1.6051, "step": 1098 }, { "epoch": 0.376289321634924, "grad_norm": 0.18957453966140747, "learning_rate": 3.118150684931507e-05, "loss": 1.5481, "step": 1099 }, { "epoch": 0.37663171410228974, "grad_norm": 0.20959049463272095, "learning_rate": 3.116438356164384e-05, "loss": 1.6624, "step": 1100 }, { "epoch": 0.37697410656965546, "grad_norm": 0.2008907049894333, "learning_rate": 3.1147260273972607e-05, "loss": 1.7189, "step": 1101 }, { "epoch": 0.3773164990370212, "grad_norm": 0.18902423977851868, "learning_rate": 3.113013698630137e-05, "loss": 1.6435, "step": 1102 }, { "epoch": 0.3776588915043869, "grad_norm": 0.19650417566299438, "learning_rate": 3.111301369863014e-05, "loss": 1.6979, "step": 1103 }, { "epoch": 0.37800128397175264, "grad_norm": 0.1878024786710739, "learning_rate": 3.1095890410958906e-05, "loss": 1.5169, "step": 1104 }, { "epoch": 0.3783436764391183, "grad_norm": 0.2638244330883026, "learning_rate": 3.107876712328767e-05, "loss": 1.4956, "step": 1105 }, { "epoch": 0.37868606890648404, "grad_norm": 0.2106718271970749, "learning_rate": 3.106164383561644e-05, "loss": 1.5367, "step": 1106 }, { "epoch": 0.37902846137384977, "grad_norm": 0.18417419493198395, "learning_rate": 3.1044520547945206e-05, "loss": 1.5806, "step": 1107 }, { "epoch": 0.3793708538412155, "grad_norm": 0.22065980732440948, "learning_rate": 3.102739726027397e-05, "loss": 1.6429, "step": 1108 }, { "epoch": 0.3797132463085812, "grad_norm": 0.19665010273456573, "learning_rate": 3.1010273972602746e-05, "loss": 1.6125, "step": 1109 }, { "epoch": 0.38005563877594695, "grad_norm": 0.186662957072258, "learning_rate": 3.099315068493151e-05, "loss": 1.6083, "step": 1110 }, { "epoch": 0.3803980312433127, "grad_norm": 0.1930997371673584, "learning_rate": 3.097602739726028e-05, "loss": 1.622, "step": 1111 }, { "epoch": 0.38074042371067834, "grad_norm": 0.19967059791088104, "learning_rate": 3.0958904109589045e-05, "loss": 1.6968, "step": 1112 }, { "epoch": 0.38108281617804407, "grad_norm": 0.20374490320682526, "learning_rate": 3.094178082191781e-05, "loss": 1.5956, "step": 1113 }, { "epoch": 0.3814252086454098, "grad_norm": 0.1924429088830948, "learning_rate": 3.092465753424658e-05, "loss": 1.6226, "step": 1114 }, { "epoch": 0.3817676011127755, "grad_norm": 0.1863481104373932, "learning_rate": 3.0907534246575345e-05, "loss": 1.5918, "step": 1115 }, { "epoch": 0.38210999358014125, "grad_norm": 0.1707417219877243, "learning_rate": 3.089041095890411e-05, "loss": 1.6965, "step": 1116 }, { "epoch": 0.382452386047507, "grad_norm": 0.19339831173419952, "learning_rate": 3.087328767123288e-05, "loss": 1.6087, "step": 1117 }, { "epoch": 0.38279477851487265, "grad_norm": 0.19181503355503082, "learning_rate": 3.085616438356165e-05, "loss": 1.614, "step": 1118 }, { "epoch": 0.3831371709822384, "grad_norm": 0.20983295142650604, "learning_rate": 3.083904109589042e-05, "loss": 1.5989, "step": 1119 }, { "epoch": 0.3834795634496041, "grad_norm": 0.18718980252742767, "learning_rate": 3.082191780821918e-05, "loss": 1.688, "step": 1120 }, { "epoch": 0.3838219559169698, "grad_norm": 0.18707607686519623, "learning_rate": 3.0804794520547944e-05, "loss": 1.6441, "step": 1121 }, { "epoch": 0.38416434838433555, "grad_norm": 0.19475042819976807, "learning_rate": 3.078767123287671e-05, "loss": 1.7437, "step": 1122 }, { "epoch": 0.3845067408517013, "grad_norm": 0.18442565202713013, "learning_rate": 3.077054794520548e-05, "loss": 1.6291, "step": 1123 }, { "epoch": 0.384849133319067, "grad_norm": 0.19626346230506897, "learning_rate": 3.075342465753424e-05, "loss": 1.6308, "step": 1124 }, { "epoch": 0.3851915257864327, "grad_norm": 0.17365996539592743, "learning_rate": 3.073630136986301e-05, "loss": 1.5347, "step": 1125 }, { "epoch": 0.3855339182537984, "grad_norm": 0.19015009701251984, "learning_rate": 3.0719178082191776e-05, "loss": 1.6134, "step": 1126 }, { "epoch": 0.38587631072116413, "grad_norm": 0.3407045602798462, "learning_rate": 3.070205479452055e-05, "loss": 1.6189, "step": 1127 }, { "epoch": 0.38621870318852985, "grad_norm": 0.2030169516801834, "learning_rate": 3.0684931506849316e-05, "loss": 1.5804, "step": 1128 }, { "epoch": 0.3865610956558956, "grad_norm": 0.21365243196487427, "learning_rate": 3.066780821917808e-05, "loss": 1.6086, "step": 1129 }, { "epoch": 0.3869034881232613, "grad_norm": 0.20101509988307953, "learning_rate": 3.065068493150685e-05, "loss": 1.6666, "step": 1130 }, { "epoch": 0.387245880590627, "grad_norm": 0.18813222646713257, "learning_rate": 3.0633561643835616e-05, "loss": 1.6311, "step": 1131 }, { "epoch": 0.3875882730579927, "grad_norm": 0.19953742623329163, "learning_rate": 3.061643835616438e-05, "loss": 1.5415, "step": 1132 }, { "epoch": 0.38793066552535843, "grad_norm": 0.21112066507339478, "learning_rate": 3.059931506849315e-05, "loss": 1.6401, "step": 1133 }, { "epoch": 0.38827305799272416, "grad_norm": 0.18418999016284943, "learning_rate": 3.0582191780821915e-05, "loss": 1.5794, "step": 1134 }, { "epoch": 0.3886154504600899, "grad_norm": 0.23663243651390076, "learning_rate": 3.056506849315068e-05, "loss": 1.5451, "step": 1135 }, { "epoch": 0.3889578429274556, "grad_norm": 0.19836454093456268, "learning_rate": 3.0547945205479455e-05, "loss": 1.5687, "step": 1136 }, { "epoch": 0.38930023539482134, "grad_norm": 0.19420097768306732, "learning_rate": 3.053082191780822e-05, "loss": 1.5756, "step": 1137 }, { "epoch": 0.389642627862187, "grad_norm": 0.18941549956798553, "learning_rate": 3.0513698630136988e-05, "loss": 1.6254, "step": 1138 }, { "epoch": 0.38998502032955273, "grad_norm": 0.20403435826301575, "learning_rate": 3.0496575342465755e-05, "loss": 1.6196, "step": 1139 }, { "epoch": 0.39032741279691846, "grad_norm": 0.19868136942386627, "learning_rate": 3.047945205479452e-05, "loss": 1.7546, "step": 1140 }, { "epoch": 0.3906698052642842, "grad_norm": 0.19127783179283142, "learning_rate": 3.0462328767123288e-05, "loss": 1.6221, "step": 1141 }, { "epoch": 0.3910121977316499, "grad_norm": 0.18487536907196045, "learning_rate": 3.0445205479452054e-05, "loss": 1.6229, "step": 1142 }, { "epoch": 0.39135459019901564, "grad_norm": 0.2067660689353943, "learning_rate": 3.042808219178082e-05, "loss": 1.5792, "step": 1143 }, { "epoch": 0.39169698266638137, "grad_norm": 0.19866935908794403, "learning_rate": 3.0410958904109594e-05, "loss": 1.5984, "step": 1144 }, { "epoch": 0.39203937513374704, "grad_norm": 0.1991511583328247, "learning_rate": 3.039383561643836e-05, "loss": 1.5903, "step": 1145 }, { "epoch": 0.39238176760111276, "grad_norm": 0.2296113818883896, "learning_rate": 3.0376712328767127e-05, "loss": 1.609, "step": 1146 }, { "epoch": 0.3927241600684785, "grad_norm": 0.2328857034444809, "learning_rate": 3.0359589041095894e-05, "loss": 1.6551, "step": 1147 }, { "epoch": 0.3930665525358442, "grad_norm": 0.19710813462734222, "learning_rate": 3.034246575342466e-05, "loss": 1.5197, "step": 1148 }, { "epoch": 0.39340894500320994, "grad_norm": 0.22281409800052643, "learning_rate": 3.0325342465753427e-05, "loss": 1.4986, "step": 1149 }, { "epoch": 0.39375133747057567, "grad_norm": 0.20117011666297913, "learning_rate": 3.0308219178082193e-05, "loss": 1.4741, "step": 1150 }, { "epoch": 0.39409372993794134, "grad_norm": 0.2047719806432724, "learning_rate": 3.029109589041096e-05, "loss": 1.7159, "step": 1151 }, { "epoch": 0.39443612240530707, "grad_norm": 0.1928446739912033, "learning_rate": 3.0273972602739726e-05, "loss": 1.5493, "step": 1152 }, { "epoch": 0.3947785148726728, "grad_norm": 0.22128382325172424, "learning_rate": 3.0256849315068496e-05, "loss": 1.5613, "step": 1153 }, { "epoch": 0.3951209073400385, "grad_norm": 0.20296743512153625, "learning_rate": 3.0239726027397263e-05, "loss": 1.5346, "step": 1154 }, { "epoch": 0.39546329980740424, "grad_norm": 0.33349767327308655, "learning_rate": 3.022260273972603e-05, "loss": 1.5463, "step": 1155 }, { "epoch": 0.39580569227476997, "grad_norm": 0.18981929123401642, "learning_rate": 3.0205479452054796e-05, "loss": 1.5849, "step": 1156 }, { "epoch": 0.3961480847421357, "grad_norm": 0.1948363184928894, "learning_rate": 3.0188356164383562e-05, "loss": 1.6441, "step": 1157 }, { "epoch": 0.39649047720950137, "grad_norm": 0.17991802096366882, "learning_rate": 3.017123287671233e-05, "loss": 1.6123, "step": 1158 }, { "epoch": 0.3968328696768671, "grad_norm": 0.1889996975660324, "learning_rate": 3.0154109589041095e-05, "loss": 1.6949, "step": 1159 }, { "epoch": 0.3971752621442328, "grad_norm": 0.20630910992622375, "learning_rate": 3.0136986301369862e-05, "loss": 1.6172, "step": 1160 }, { "epoch": 0.39751765461159855, "grad_norm": 0.21217727661132812, "learning_rate": 3.011986301369863e-05, "loss": 1.6122, "step": 1161 }, { "epoch": 0.3978600470789643, "grad_norm": 0.1880672723054886, "learning_rate": 3.01027397260274e-05, "loss": 1.6477, "step": 1162 }, { "epoch": 0.39820243954633, "grad_norm": 0.20123882591724396, "learning_rate": 3.0085616438356168e-05, "loss": 1.6769, "step": 1163 }, { "epoch": 0.39854483201369567, "grad_norm": 0.19149883091449738, "learning_rate": 3.0068493150684935e-05, "loss": 1.6584, "step": 1164 }, { "epoch": 0.3988872244810614, "grad_norm": 0.19258956611156464, "learning_rate": 3.00513698630137e-05, "loss": 1.6294, "step": 1165 }, { "epoch": 0.3992296169484271, "grad_norm": 0.19530992209911346, "learning_rate": 3.0034246575342468e-05, "loss": 1.633, "step": 1166 }, { "epoch": 0.39957200941579285, "grad_norm": 0.19439341127872467, "learning_rate": 3.0017123287671234e-05, "loss": 1.7151, "step": 1167 }, { "epoch": 0.3999144018831586, "grad_norm": 0.18661381304264069, "learning_rate": 3e-05, "loss": 1.6459, "step": 1168 }, { "epoch": 0.4002567943505243, "grad_norm": 0.21022963523864746, "learning_rate": 2.9982876712328767e-05, "loss": 1.5486, "step": 1169 }, { "epoch": 0.40059918681789003, "grad_norm": 0.1958705335855484, "learning_rate": 2.9965753424657534e-05, "loss": 1.5925, "step": 1170 }, { "epoch": 0.4009415792852557, "grad_norm": 0.20628204941749573, "learning_rate": 2.9948630136986304e-05, "loss": 1.5819, "step": 1171 }, { "epoch": 0.4012839717526214, "grad_norm": 0.2144192010164261, "learning_rate": 2.993150684931507e-05, "loss": 1.6292, "step": 1172 }, { "epoch": 0.40162636421998715, "grad_norm": 0.21711930632591248, "learning_rate": 2.9914383561643837e-05, "loss": 1.6172, "step": 1173 }, { "epoch": 0.4019687566873529, "grad_norm": 0.1997271180152893, "learning_rate": 2.9897260273972603e-05, "loss": 1.6054, "step": 1174 }, { "epoch": 0.4023111491547186, "grad_norm": 0.23747354745864868, "learning_rate": 2.988013698630137e-05, "loss": 1.5894, "step": 1175 }, { "epoch": 0.40265354162208433, "grad_norm": 0.1878305971622467, "learning_rate": 2.9863013698630136e-05, "loss": 1.5381, "step": 1176 }, { "epoch": 0.40299593408945006, "grad_norm": 0.24648581445217133, "learning_rate": 2.9845890410958903e-05, "loss": 1.5133, "step": 1177 }, { "epoch": 0.40333832655681573, "grad_norm": 0.21232253313064575, "learning_rate": 2.982876712328767e-05, "loss": 1.6457, "step": 1178 }, { "epoch": 0.40368071902418146, "grad_norm": 0.2056000530719757, "learning_rate": 2.9811643835616436e-05, "loss": 1.6468, "step": 1179 }, { "epoch": 0.4040231114915472, "grad_norm": 0.19832269847393036, "learning_rate": 2.979452054794521e-05, "loss": 1.683, "step": 1180 }, { "epoch": 0.4043655039589129, "grad_norm": 0.18867991864681244, "learning_rate": 2.9777397260273976e-05, "loss": 1.6066, "step": 1181 }, { "epoch": 0.40470789642627863, "grad_norm": 0.18770624697208405, "learning_rate": 2.9760273972602742e-05, "loss": 1.5425, "step": 1182 }, { "epoch": 0.40505028889364436, "grad_norm": 0.20698951184749603, "learning_rate": 2.974315068493151e-05, "loss": 1.6312, "step": 1183 }, { "epoch": 0.40539268136101003, "grad_norm": 0.1922239363193512, "learning_rate": 2.9726027397260275e-05, "loss": 1.5781, "step": 1184 }, { "epoch": 0.40573507382837576, "grad_norm": 0.24619324505329132, "learning_rate": 2.9708904109589042e-05, "loss": 1.4741, "step": 1185 }, { "epoch": 0.4060774662957415, "grad_norm": 0.1945994347333908, "learning_rate": 2.969178082191781e-05, "loss": 1.6258, "step": 1186 }, { "epoch": 0.4064198587631072, "grad_norm": 0.22887203097343445, "learning_rate": 2.9674657534246575e-05, "loss": 1.4703, "step": 1187 }, { "epoch": 0.40676225123047294, "grad_norm": 0.19976423680782318, "learning_rate": 2.965753424657534e-05, "loss": 1.5213, "step": 1188 }, { "epoch": 0.40710464369783866, "grad_norm": 0.20264874398708344, "learning_rate": 2.9640410958904115e-05, "loss": 1.6918, "step": 1189 }, { "epoch": 0.4074470361652044, "grad_norm": 0.19586379826068878, "learning_rate": 2.962328767123288e-05, "loss": 1.6202, "step": 1190 }, { "epoch": 0.40778942863257006, "grad_norm": 0.20619496703147888, "learning_rate": 2.9606164383561648e-05, "loss": 1.6371, "step": 1191 }, { "epoch": 0.4081318210999358, "grad_norm": 0.19365999102592468, "learning_rate": 2.9589041095890414e-05, "loss": 1.5495, "step": 1192 }, { "epoch": 0.4084742135673015, "grad_norm": 0.21438881754875183, "learning_rate": 2.957191780821918e-05, "loss": 1.6266, "step": 1193 }, { "epoch": 0.40881660603466724, "grad_norm": 0.2016000598669052, "learning_rate": 2.9554794520547947e-05, "loss": 1.6854, "step": 1194 }, { "epoch": 0.40915899850203297, "grad_norm": 0.21889112889766693, "learning_rate": 2.9537671232876714e-05, "loss": 1.6176, "step": 1195 }, { "epoch": 0.4095013909693987, "grad_norm": 0.1996287703514099, "learning_rate": 2.952054794520548e-05, "loss": 1.7375, "step": 1196 }, { "epoch": 0.40984378343676436, "grad_norm": 0.20426683127880096, "learning_rate": 2.9503424657534247e-05, "loss": 1.61, "step": 1197 }, { "epoch": 0.4101861759041301, "grad_norm": 0.19195371866226196, "learning_rate": 2.9486301369863017e-05, "loss": 1.5675, "step": 1198 }, { "epoch": 0.4105285683714958, "grad_norm": 0.18590030074119568, "learning_rate": 2.9469178082191783e-05, "loss": 1.6981, "step": 1199 }, { "epoch": 0.41087096083886154, "grad_norm": 0.1899462342262268, "learning_rate": 2.945205479452055e-05, "loss": 1.6254, "step": 1200 }, { "epoch": 0.41121335330622727, "grad_norm": 0.19517403841018677, "learning_rate": 2.9434931506849316e-05, "loss": 1.543, "step": 1201 }, { "epoch": 0.411555745773593, "grad_norm": 0.21083654463291168, "learning_rate": 2.9417808219178083e-05, "loss": 1.7008, "step": 1202 }, { "epoch": 0.4118981382409587, "grad_norm": 0.18863758444786072, "learning_rate": 2.940068493150685e-05, "loss": 1.6798, "step": 1203 }, { "epoch": 0.4122405307083244, "grad_norm": 0.18859133124351501, "learning_rate": 2.9383561643835616e-05, "loss": 1.591, "step": 1204 }, { "epoch": 0.4125829231756901, "grad_norm": 0.21327324211597443, "learning_rate": 2.9366438356164382e-05, "loss": 1.6546, "step": 1205 }, { "epoch": 0.41292531564305585, "grad_norm": 0.19365008175373077, "learning_rate": 2.934931506849315e-05, "loss": 1.6675, "step": 1206 }, { "epoch": 0.41326770811042157, "grad_norm": 0.18357425928115845, "learning_rate": 2.9332191780821922e-05, "loss": 1.6258, "step": 1207 }, { "epoch": 0.4136101005777873, "grad_norm": 0.18261410295963287, "learning_rate": 2.931506849315069e-05, "loss": 1.5909, "step": 1208 }, { "epoch": 0.413952493045153, "grad_norm": 0.18606147170066833, "learning_rate": 2.9297945205479455e-05, "loss": 1.5931, "step": 1209 }, { "epoch": 0.41429488551251875, "grad_norm": 0.24596050381660461, "learning_rate": 2.9280821917808222e-05, "loss": 1.5936, "step": 1210 }, { "epoch": 0.4146372779798844, "grad_norm": 0.1821003258228302, "learning_rate": 2.9263698630136988e-05, "loss": 1.6097, "step": 1211 }, { "epoch": 0.41497967044725015, "grad_norm": 0.20710545778274536, "learning_rate": 2.9246575342465755e-05, "loss": 1.5249, "step": 1212 }, { "epoch": 0.4153220629146159, "grad_norm": 0.20131874084472656, "learning_rate": 2.922945205479452e-05, "loss": 1.667, "step": 1213 }, { "epoch": 0.4156644553819816, "grad_norm": 0.19862274825572968, "learning_rate": 2.9212328767123288e-05, "loss": 1.6414, "step": 1214 }, { "epoch": 0.4160068478493473, "grad_norm": 0.19424869120121002, "learning_rate": 2.9195205479452054e-05, "loss": 1.464, "step": 1215 }, { "epoch": 0.41634924031671305, "grad_norm": 0.2117459923028946, "learning_rate": 2.9178082191780824e-05, "loss": 1.6906, "step": 1216 }, { "epoch": 0.4166916327840787, "grad_norm": 0.22193311154842377, "learning_rate": 2.916095890410959e-05, "loss": 1.6408, "step": 1217 }, { "epoch": 0.41703402525144445, "grad_norm": 0.1936057060956955, "learning_rate": 2.9143835616438357e-05, "loss": 1.5219, "step": 1218 }, { "epoch": 0.4173764177188102, "grad_norm": 0.21189062297344208, "learning_rate": 2.9126712328767124e-05, "loss": 1.7595, "step": 1219 }, { "epoch": 0.4177188101861759, "grad_norm": 0.20430253446102142, "learning_rate": 2.910958904109589e-05, "loss": 1.5661, "step": 1220 }, { "epoch": 0.41806120265354163, "grad_norm": 0.24200312793254852, "learning_rate": 2.9092465753424657e-05, "loss": 1.638, "step": 1221 }, { "epoch": 0.41840359512090736, "grad_norm": 0.19700270891189575, "learning_rate": 2.9075342465753423e-05, "loss": 1.5988, "step": 1222 }, { "epoch": 0.4187459875882731, "grad_norm": 0.1806430071592331, "learning_rate": 2.905821917808219e-05, "loss": 1.5989, "step": 1223 }, { "epoch": 0.41908838005563875, "grad_norm": 0.205569788813591, "learning_rate": 2.9041095890410956e-05, "loss": 1.5392, "step": 1224 }, { "epoch": 0.4194307725230045, "grad_norm": 0.218040332198143, "learning_rate": 2.902397260273973e-05, "loss": 1.6328, "step": 1225 }, { "epoch": 0.4197731649903702, "grad_norm": 0.218037948012352, "learning_rate": 2.9006849315068496e-05, "loss": 1.7661, "step": 1226 }, { "epoch": 0.42011555745773593, "grad_norm": 0.2008579522371292, "learning_rate": 2.8989726027397263e-05, "loss": 1.5466, "step": 1227 }, { "epoch": 0.42045794992510166, "grad_norm": 0.22164225578308105, "learning_rate": 2.897260273972603e-05, "loss": 1.6901, "step": 1228 }, { "epoch": 0.4208003423924674, "grad_norm": 0.2010180950164795, "learning_rate": 2.8955479452054796e-05, "loss": 1.5809, "step": 1229 }, { "epoch": 0.42114273485983306, "grad_norm": 0.1935199648141861, "learning_rate": 2.8938356164383562e-05, "loss": 1.5926, "step": 1230 }, { "epoch": 0.4214851273271988, "grad_norm": 0.2031090408563614, "learning_rate": 2.892123287671233e-05, "loss": 1.6638, "step": 1231 }, { "epoch": 0.4218275197945645, "grad_norm": 0.19778242707252502, "learning_rate": 2.8904109589041095e-05, "loss": 1.5721, "step": 1232 }, { "epoch": 0.42216991226193024, "grad_norm": 0.21371868252754211, "learning_rate": 2.888698630136987e-05, "loss": 1.7066, "step": 1233 }, { "epoch": 0.42251230472929596, "grad_norm": 0.1981639266014099, "learning_rate": 2.8869863013698635e-05, "loss": 1.6451, "step": 1234 }, { "epoch": 0.4228546971966617, "grad_norm": 0.21592877805233002, "learning_rate": 2.8852739726027402e-05, "loss": 1.6224, "step": 1235 }, { "epoch": 0.4231970896640274, "grad_norm": 0.21491016447544098, "learning_rate": 2.8835616438356168e-05, "loss": 1.727, "step": 1236 }, { "epoch": 0.4235394821313931, "grad_norm": 0.20304915308952332, "learning_rate": 2.8818493150684935e-05, "loss": 1.7166, "step": 1237 }, { "epoch": 0.4238818745987588, "grad_norm": 0.2080349624156952, "learning_rate": 2.88013698630137e-05, "loss": 1.6177, "step": 1238 }, { "epoch": 0.42422426706612454, "grad_norm": 0.21475818753242493, "learning_rate": 2.8784246575342468e-05, "loss": 1.6319, "step": 1239 }, { "epoch": 0.42456665953349026, "grad_norm": 0.21294182538986206, "learning_rate": 2.8767123287671234e-05, "loss": 1.5259, "step": 1240 }, { "epoch": 0.424909052000856, "grad_norm": 0.20510578155517578, "learning_rate": 2.8749999999999997e-05, "loss": 1.5822, "step": 1241 }, { "epoch": 0.4252514444682217, "grad_norm": 0.2012259066104889, "learning_rate": 2.873287671232877e-05, "loss": 1.5818, "step": 1242 }, { "epoch": 0.42559383693558744, "grad_norm": 0.21601875126361847, "learning_rate": 2.8715753424657537e-05, "loss": 1.6363, "step": 1243 }, { "epoch": 0.4259362294029531, "grad_norm": 0.25579193234443665, "learning_rate": 2.8698630136986304e-05, "loss": 1.4899, "step": 1244 }, { "epoch": 0.42627862187031884, "grad_norm": 0.18454208970069885, "learning_rate": 2.868150684931507e-05, "loss": 1.6805, "step": 1245 }, { "epoch": 0.42662101433768457, "grad_norm": 0.2155836820602417, "learning_rate": 2.8664383561643837e-05, "loss": 1.6799, "step": 1246 }, { "epoch": 0.4269634068050503, "grad_norm": 0.20795008540153503, "learning_rate": 2.8647260273972603e-05, "loss": 1.6269, "step": 1247 }, { "epoch": 0.427305799272416, "grad_norm": 0.1875452697277069, "learning_rate": 2.863013698630137e-05, "loss": 1.5845, "step": 1248 }, { "epoch": 0.42764819173978175, "grad_norm": 0.2264728546142578, "learning_rate": 2.8613013698630136e-05, "loss": 1.5834, "step": 1249 }, { "epoch": 0.4279905842071474, "grad_norm": 0.19062699377536774, "learning_rate": 2.8595890410958903e-05, "loss": 1.5565, "step": 1250 }, { "epoch": 0.42833297667451314, "grad_norm": 0.21915385127067566, "learning_rate": 2.8578767123287676e-05, "loss": 1.5627, "step": 1251 }, { "epoch": 0.42867536914187887, "grad_norm": 0.20307733118534088, "learning_rate": 2.8561643835616443e-05, "loss": 1.6375, "step": 1252 }, { "epoch": 0.4290177616092446, "grad_norm": 0.20299731194972992, "learning_rate": 2.854452054794521e-05, "loss": 1.6236, "step": 1253 }, { "epoch": 0.4293601540766103, "grad_norm": 0.20549346506595612, "learning_rate": 2.8527397260273976e-05, "loss": 1.6356, "step": 1254 }, { "epoch": 0.42970254654397605, "grad_norm": 0.19700296223163605, "learning_rate": 2.8510273972602742e-05, "loss": 1.554, "step": 1255 }, { "epoch": 0.4300449390113418, "grad_norm": 0.19100826978683472, "learning_rate": 2.849315068493151e-05, "loss": 1.5504, "step": 1256 }, { "epoch": 0.43038733147870745, "grad_norm": 0.2009735107421875, "learning_rate": 2.8476027397260275e-05, "loss": 1.5976, "step": 1257 }, { "epoch": 0.4307297239460732, "grad_norm": 0.19709287583827972, "learning_rate": 2.8458904109589042e-05, "loss": 1.6391, "step": 1258 }, { "epoch": 0.4310721164134389, "grad_norm": 0.2270474135875702, "learning_rate": 2.844178082191781e-05, "loss": 1.7286, "step": 1259 }, { "epoch": 0.4314145088808046, "grad_norm": 0.19109757244586945, "learning_rate": 2.842465753424658e-05, "loss": 1.6105, "step": 1260 }, { "epoch": 0.43175690134817035, "grad_norm": 0.19890645146369934, "learning_rate": 2.8407534246575345e-05, "loss": 1.6361, "step": 1261 }, { "epoch": 0.4320992938155361, "grad_norm": 0.20450876653194427, "learning_rate": 2.839041095890411e-05, "loss": 1.533, "step": 1262 }, { "epoch": 0.43244168628290175, "grad_norm": 0.20255088806152344, "learning_rate": 2.8373287671232878e-05, "loss": 1.6128, "step": 1263 }, { "epoch": 0.4327840787502675, "grad_norm": 0.2054801881313324, "learning_rate": 2.8356164383561644e-05, "loss": 1.5831, "step": 1264 }, { "epoch": 0.4331264712176332, "grad_norm": 0.2056148499250412, "learning_rate": 2.833904109589041e-05, "loss": 1.5512, "step": 1265 }, { "epoch": 0.43346886368499893, "grad_norm": 0.20309239625930786, "learning_rate": 2.8321917808219177e-05, "loss": 1.6616, "step": 1266 }, { "epoch": 0.43381125615236465, "grad_norm": 0.1966654509305954, "learning_rate": 2.8304794520547944e-05, "loss": 1.6604, "step": 1267 }, { "epoch": 0.4341536486197304, "grad_norm": 0.20637613534927368, "learning_rate": 2.828767123287671e-05, "loss": 1.7119, "step": 1268 }, { "epoch": 0.4344960410870961, "grad_norm": 0.19382789731025696, "learning_rate": 2.8270547945205484e-05, "loss": 1.6033, "step": 1269 }, { "epoch": 0.4348384335544618, "grad_norm": 0.19100964069366455, "learning_rate": 2.825342465753425e-05, "loss": 1.6516, "step": 1270 }, { "epoch": 0.4351808260218275, "grad_norm": 0.2201157510280609, "learning_rate": 2.8236301369863017e-05, "loss": 1.5771, "step": 1271 }, { "epoch": 0.43552321848919323, "grad_norm": 0.2077137529850006, "learning_rate": 2.8219178082191783e-05, "loss": 1.6371, "step": 1272 }, { "epoch": 0.43586561095655896, "grad_norm": 0.19701369106769562, "learning_rate": 2.820205479452055e-05, "loss": 1.6288, "step": 1273 }, { "epoch": 0.4362080034239247, "grad_norm": 0.22322522103786469, "learning_rate": 2.8184931506849316e-05, "loss": 1.5982, "step": 1274 }, { "epoch": 0.4365503958912904, "grad_norm": 0.1959504783153534, "learning_rate": 2.8167808219178083e-05, "loss": 1.6294, "step": 1275 }, { "epoch": 0.43689278835865614, "grad_norm": 0.19646722078323364, "learning_rate": 2.815068493150685e-05, "loss": 1.5479, "step": 1276 }, { "epoch": 0.4372351808260218, "grad_norm": 0.22708742320537567, "learning_rate": 2.8133561643835616e-05, "loss": 1.5201, "step": 1277 }, { "epoch": 0.43757757329338753, "grad_norm": 0.1933313012123108, "learning_rate": 2.8116438356164386e-05, "loss": 1.598, "step": 1278 }, { "epoch": 0.43791996576075326, "grad_norm": 0.2116461992263794, "learning_rate": 2.8099315068493152e-05, "loss": 1.6077, "step": 1279 }, { "epoch": 0.438262358228119, "grad_norm": 0.21244221925735474, "learning_rate": 2.808219178082192e-05, "loss": 1.563, "step": 1280 }, { "epoch": 0.4386047506954847, "grad_norm": 0.20757241547107697, "learning_rate": 2.8065068493150685e-05, "loss": 1.5431, "step": 1281 }, { "epoch": 0.43894714316285044, "grad_norm": 0.2152397781610489, "learning_rate": 2.8047945205479452e-05, "loss": 1.6359, "step": 1282 }, { "epoch": 0.4392895356302161, "grad_norm": 0.19094443321228027, "learning_rate": 2.803082191780822e-05, "loss": 1.5026, "step": 1283 }, { "epoch": 0.43963192809758184, "grad_norm": 0.26308658719062805, "learning_rate": 2.8013698630136985e-05, "loss": 1.5503, "step": 1284 }, { "epoch": 0.43997432056494756, "grad_norm": 0.1914006620645523, "learning_rate": 2.799657534246575e-05, "loss": 1.6205, "step": 1285 }, { "epoch": 0.4403167130323133, "grad_norm": 0.2079574316740036, "learning_rate": 2.7979452054794518e-05, "loss": 1.414, "step": 1286 }, { "epoch": 0.440659105499679, "grad_norm": 0.24444571137428284, "learning_rate": 2.796232876712329e-05, "loss": 1.6544, "step": 1287 }, { "epoch": 0.44100149796704474, "grad_norm": 0.20670345425605774, "learning_rate": 2.7945205479452058e-05, "loss": 1.588, "step": 1288 }, { "epoch": 0.44134389043441047, "grad_norm": 0.1899344027042389, "learning_rate": 2.7928082191780824e-05, "loss": 1.4741, "step": 1289 }, { "epoch": 0.44168628290177614, "grad_norm": 0.2112971991300583, "learning_rate": 2.791095890410959e-05, "loss": 1.6707, "step": 1290 }, { "epoch": 0.44202867536914187, "grad_norm": 0.2099982053041458, "learning_rate": 2.7893835616438357e-05, "loss": 1.6776, "step": 1291 }, { "epoch": 0.4423710678365076, "grad_norm": 0.2167626917362213, "learning_rate": 2.7876712328767124e-05, "loss": 1.6762, "step": 1292 }, { "epoch": 0.4427134603038733, "grad_norm": 0.2255111187696457, "learning_rate": 2.785958904109589e-05, "loss": 1.6216, "step": 1293 }, { "epoch": 0.44305585277123904, "grad_norm": 0.2023368775844574, "learning_rate": 2.7842465753424657e-05, "loss": 1.6619, "step": 1294 }, { "epoch": 0.44339824523860477, "grad_norm": 0.21878039836883545, "learning_rate": 2.7825342465753424e-05, "loss": 1.5969, "step": 1295 }, { "epoch": 0.44374063770597044, "grad_norm": 0.21872608363628387, "learning_rate": 2.7808219178082197e-05, "loss": 1.6216, "step": 1296 }, { "epoch": 0.44408303017333617, "grad_norm": 0.18493634462356567, "learning_rate": 2.7791095890410963e-05, "loss": 1.5588, "step": 1297 }, { "epoch": 0.4444254226407019, "grad_norm": 0.20624420046806335, "learning_rate": 2.777397260273973e-05, "loss": 1.6428, "step": 1298 }, { "epoch": 0.4447678151080676, "grad_norm": 0.19005556404590607, "learning_rate": 2.7756849315068496e-05, "loss": 1.5538, "step": 1299 }, { "epoch": 0.44511020757543335, "grad_norm": 0.1953173130750656, "learning_rate": 2.7739726027397263e-05, "loss": 1.5859, "step": 1300 }, { "epoch": 0.4454526000427991, "grad_norm": 0.19703419506549835, "learning_rate": 2.772260273972603e-05, "loss": 1.4955, "step": 1301 }, { "epoch": 0.4457949925101648, "grad_norm": 0.21384094655513763, "learning_rate": 2.7705479452054796e-05, "loss": 1.63, "step": 1302 }, { "epoch": 0.44613738497753047, "grad_norm": 0.21279188990592957, "learning_rate": 2.7688356164383562e-05, "loss": 1.611, "step": 1303 }, { "epoch": 0.4464797774448962, "grad_norm": 0.19739080965518951, "learning_rate": 2.767123287671233e-05, "loss": 1.6044, "step": 1304 }, { "epoch": 0.4468221699122619, "grad_norm": 0.20460259914398193, "learning_rate": 2.76541095890411e-05, "loss": 1.6389, "step": 1305 }, { "epoch": 0.44716456237962765, "grad_norm": 0.20864294469356537, "learning_rate": 2.7636986301369865e-05, "loss": 1.5579, "step": 1306 }, { "epoch": 0.4475069548469934, "grad_norm": 0.20070050656795502, "learning_rate": 2.7619863013698632e-05, "loss": 1.7116, "step": 1307 }, { "epoch": 0.4478493473143591, "grad_norm": 0.22353129088878632, "learning_rate": 2.76027397260274e-05, "loss": 1.5728, "step": 1308 }, { "epoch": 0.44819173978172483, "grad_norm": 0.18899302184581757, "learning_rate": 2.7585616438356165e-05, "loss": 1.6323, "step": 1309 }, { "epoch": 0.4485341322490905, "grad_norm": 0.2200808972120285, "learning_rate": 2.756849315068493e-05, "loss": 1.6239, "step": 1310 }, { "epoch": 0.4488765247164562, "grad_norm": 0.22361575067043304, "learning_rate": 2.7551369863013698e-05, "loss": 1.6851, "step": 1311 }, { "epoch": 0.44921891718382195, "grad_norm": 0.19749444723129272, "learning_rate": 2.7534246575342465e-05, "loss": 1.588, "step": 1312 }, { "epoch": 0.4495613096511877, "grad_norm": 0.19690024852752686, "learning_rate": 2.751712328767123e-05, "loss": 1.6481, "step": 1313 }, { "epoch": 0.4499037021185534, "grad_norm": 0.21350185573101044, "learning_rate": 2.7500000000000004e-05, "loss": 1.5585, "step": 1314 }, { "epoch": 0.45024609458591913, "grad_norm": 0.20340736210346222, "learning_rate": 2.748287671232877e-05, "loss": 1.5896, "step": 1315 }, { "epoch": 0.4505884870532848, "grad_norm": 0.24820800125598907, "learning_rate": 2.7465753424657537e-05, "loss": 1.6225, "step": 1316 }, { "epoch": 0.45093087952065053, "grad_norm": 0.20881463587284088, "learning_rate": 2.7448630136986304e-05, "loss": 1.6113, "step": 1317 }, { "epoch": 0.45127327198801626, "grad_norm": 0.20027029514312744, "learning_rate": 2.743150684931507e-05, "loss": 1.5877, "step": 1318 }, { "epoch": 0.451615664455382, "grad_norm": 0.22245590388774872, "learning_rate": 2.7414383561643837e-05, "loss": 1.7299, "step": 1319 }, { "epoch": 0.4519580569227477, "grad_norm": 0.20093341171741486, "learning_rate": 2.7397260273972603e-05, "loss": 1.6144, "step": 1320 }, { "epoch": 0.45230044939011343, "grad_norm": 0.19276201725006104, "learning_rate": 2.738013698630137e-05, "loss": 1.5727, "step": 1321 }, { "epoch": 0.45264284185747916, "grad_norm": 0.20201914012432098, "learning_rate": 2.7363013698630137e-05, "loss": 1.6086, "step": 1322 }, { "epoch": 0.45298523432484483, "grad_norm": 0.2176027148962021, "learning_rate": 2.7345890410958906e-05, "loss": 1.6025, "step": 1323 }, { "epoch": 0.45332762679221056, "grad_norm": 0.2095157951116562, "learning_rate": 2.7328767123287673e-05, "loss": 1.6018, "step": 1324 }, { "epoch": 0.4536700192595763, "grad_norm": 0.20671874284744263, "learning_rate": 2.731164383561644e-05, "loss": 1.6039, "step": 1325 }, { "epoch": 0.454012411726942, "grad_norm": 0.19558106362819672, "learning_rate": 2.7294520547945206e-05, "loss": 1.6189, "step": 1326 }, { "epoch": 0.45435480419430774, "grad_norm": 0.21937154233455658, "learning_rate": 2.7277397260273973e-05, "loss": 1.6728, "step": 1327 }, { "epoch": 0.45469719666167346, "grad_norm": 0.1876019686460495, "learning_rate": 2.726027397260274e-05, "loss": 1.545, "step": 1328 }, { "epoch": 0.45503958912903913, "grad_norm": 0.19967332482337952, "learning_rate": 2.7243150684931506e-05, "loss": 1.6772, "step": 1329 }, { "epoch": 0.45538198159640486, "grad_norm": 0.20308998227119446, "learning_rate": 2.7226027397260272e-05, "loss": 1.6168, "step": 1330 }, { "epoch": 0.4557243740637706, "grad_norm": 0.20742812752723694, "learning_rate": 2.7208904109589045e-05, "loss": 1.5256, "step": 1331 }, { "epoch": 0.4560667665311363, "grad_norm": 0.22549614310264587, "learning_rate": 2.7191780821917812e-05, "loss": 1.6025, "step": 1332 }, { "epoch": 0.45640915899850204, "grad_norm": 0.20058013498783112, "learning_rate": 2.717465753424658e-05, "loss": 1.5834, "step": 1333 }, { "epoch": 0.45675155146586777, "grad_norm": 0.20315252244472504, "learning_rate": 2.7157534246575345e-05, "loss": 1.6495, "step": 1334 }, { "epoch": 0.4570939439332335, "grad_norm": 0.20194421708583832, "learning_rate": 2.714041095890411e-05, "loss": 1.7435, "step": 1335 }, { "epoch": 0.45743633640059916, "grad_norm": 0.20864975452423096, "learning_rate": 2.7123287671232878e-05, "loss": 1.5242, "step": 1336 }, { "epoch": 0.4577787288679649, "grad_norm": 0.2014782726764679, "learning_rate": 2.7106164383561645e-05, "loss": 1.6425, "step": 1337 }, { "epoch": 0.4581211213353306, "grad_norm": 0.21706129610538483, "learning_rate": 2.708904109589041e-05, "loss": 1.5854, "step": 1338 }, { "epoch": 0.45846351380269634, "grad_norm": 0.2211795449256897, "learning_rate": 2.7071917808219178e-05, "loss": 1.6645, "step": 1339 }, { "epoch": 0.45880590627006207, "grad_norm": 0.2053402215242386, "learning_rate": 2.7054794520547947e-05, "loss": 1.535, "step": 1340 }, { "epoch": 0.4591482987374278, "grad_norm": 0.2466185986995697, "learning_rate": 2.7037671232876714e-05, "loss": 1.5616, "step": 1341 }, { "epoch": 0.4594906912047935, "grad_norm": 0.21305550634860992, "learning_rate": 2.702054794520548e-05, "loss": 1.5819, "step": 1342 }, { "epoch": 0.4598330836721592, "grad_norm": 0.2041912078857422, "learning_rate": 2.7003424657534247e-05, "loss": 1.5408, "step": 1343 }, { "epoch": 0.4601754761395249, "grad_norm": 0.21606750786304474, "learning_rate": 2.6986301369863014e-05, "loss": 1.6042, "step": 1344 }, { "epoch": 0.46051786860689065, "grad_norm": 0.22272594273090363, "learning_rate": 2.696917808219178e-05, "loss": 1.5928, "step": 1345 }, { "epoch": 0.46086026107425637, "grad_norm": 0.2871193587779999, "learning_rate": 2.6952054794520547e-05, "loss": 1.5159, "step": 1346 }, { "epoch": 0.4612026535416221, "grad_norm": 0.23473390936851501, "learning_rate": 2.6934931506849313e-05, "loss": 1.6631, "step": 1347 }, { "epoch": 0.4615450460089878, "grad_norm": 0.2140572965145111, "learning_rate": 2.691780821917808e-05, "loss": 1.5886, "step": 1348 }, { "epoch": 0.4618874384763535, "grad_norm": 0.19794413447380066, "learning_rate": 2.6900684931506853e-05, "loss": 1.5668, "step": 1349 }, { "epoch": 0.4622298309437192, "grad_norm": 0.208235502243042, "learning_rate": 2.688356164383562e-05, "loss": 1.7186, "step": 1350 }, { "epoch": 0.46257222341108495, "grad_norm": 0.22605518996715546, "learning_rate": 2.6866438356164386e-05, "loss": 1.7194, "step": 1351 }, { "epoch": 0.4629146158784507, "grad_norm": 0.22044365108013153, "learning_rate": 2.6849315068493153e-05, "loss": 1.5917, "step": 1352 }, { "epoch": 0.4632570083458164, "grad_norm": 0.20217624306678772, "learning_rate": 2.683219178082192e-05, "loss": 1.6612, "step": 1353 }, { "epoch": 0.4635994008131821, "grad_norm": 0.21885894238948822, "learning_rate": 2.6815068493150686e-05, "loss": 1.6068, "step": 1354 }, { "epoch": 0.46394179328054785, "grad_norm": 0.21821601688861847, "learning_rate": 2.6797945205479452e-05, "loss": 1.7196, "step": 1355 }, { "epoch": 0.4642841857479135, "grad_norm": 0.210972398519516, "learning_rate": 2.678082191780822e-05, "loss": 1.6111, "step": 1356 }, { "epoch": 0.46462657821527925, "grad_norm": 0.22368817031383514, "learning_rate": 2.6763698630136985e-05, "loss": 1.5739, "step": 1357 }, { "epoch": 0.464968970682645, "grad_norm": 0.2223745435476303, "learning_rate": 2.674657534246576e-05, "loss": 1.4581, "step": 1358 }, { "epoch": 0.4653113631500107, "grad_norm": 0.21598182618618011, "learning_rate": 2.6729452054794525e-05, "loss": 1.5658, "step": 1359 }, { "epoch": 0.46565375561737643, "grad_norm": 0.206614151597023, "learning_rate": 2.671232876712329e-05, "loss": 1.6214, "step": 1360 }, { "epoch": 0.46599614808474216, "grad_norm": 0.23197421431541443, "learning_rate": 2.6695205479452058e-05, "loss": 1.7039, "step": 1361 }, { "epoch": 0.4663385405521078, "grad_norm": 0.20117156207561493, "learning_rate": 2.6678082191780825e-05, "loss": 1.595, "step": 1362 }, { "epoch": 0.46668093301947355, "grad_norm": 0.2037356048822403, "learning_rate": 2.666095890410959e-05, "loss": 1.6352, "step": 1363 }, { "epoch": 0.4670233254868393, "grad_norm": 0.21980658173561096, "learning_rate": 2.6643835616438358e-05, "loss": 1.7779, "step": 1364 }, { "epoch": 0.467365717954205, "grad_norm": 0.22298753261566162, "learning_rate": 2.6626712328767124e-05, "loss": 1.7564, "step": 1365 }, { "epoch": 0.46770811042157073, "grad_norm": 0.21547992527484894, "learning_rate": 2.660958904109589e-05, "loss": 1.5624, "step": 1366 }, { "epoch": 0.46805050288893646, "grad_norm": 0.21060241758823395, "learning_rate": 2.659246575342466e-05, "loss": 1.6909, "step": 1367 }, { "epoch": 0.4683928953563022, "grad_norm": 0.20602135360240936, "learning_rate": 2.6575342465753427e-05, "loss": 1.5395, "step": 1368 }, { "epoch": 0.46873528782366786, "grad_norm": 0.2040865272283554, "learning_rate": 2.6558219178082194e-05, "loss": 1.6856, "step": 1369 }, { "epoch": 0.4690776802910336, "grad_norm": 0.19803740084171295, "learning_rate": 2.654109589041096e-05, "loss": 1.5303, "step": 1370 }, { "epoch": 0.4694200727583993, "grad_norm": 0.21603178977966309, "learning_rate": 2.6523972602739727e-05, "loss": 1.6497, "step": 1371 }, { "epoch": 0.46976246522576504, "grad_norm": 0.21988803148269653, "learning_rate": 2.6506849315068493e-05, "loss": 1.5718, "step": 1372 }, { "epoch": 0.47010485769313076, "grad_norm": 0.2292342185974121, "learning_rate": 2.648972602739726e-05, "loss": 1.6297, "step": 1373 }, { "epoch": 0.4704472501604965, "grad_norm": 0.2069769948720932, "learning_rate": 2.6472602739726026e-05, "loss": 1.693, "step": 1374 }, { "epoch": 0.47078964262786216, "grad_norm": 0.21185703575611115, "learning_rate": 2.6455479452054793e-05, "loss": 1.5414, "step": 1375 }, { "epoch": 0.4711320350952279, "grad_norm": 0.23838891088962555, "learning_rate": 2.6438356164383566e-05, "loss": 1.5319, "step": 1376 }, { "epoch": 0.4714744275625936, "grad_norm": 0.2091304212808609, "learning_rate": 2.6421232876712332e-05, "loss": 1.7099, "step": 1377 }, { "epoch": 0.47181682002995934, "grad_norm": 0.22799402475357056, "learning_rate": 2.64041095890411e-05, "loss": 1.6739, "step": 1378 }, { "epoch": 0.47215921249732506, "grad_norm": 0.2290544956922531, "learning_rate": 2.6386986301369866e-05, "loss": 1.585, "step": 1379 }, { "epoch": 0.4725016049646908, "grad_norm": 0.24033713340759277, "learning_rate": 2.6369863013698632e-05, "loss": 1.5915, "step": 1380 }, { "epoch": 0.4728439974320565, "grad_norm": 0.21024556457996368, "learning_rate": 2.63527397260274e-05, "loss": 1.6065, "step": 1381 }, { "epoch": 0.4731863898994222, "grad_norm": 0.21986503899097443, "learning_rate": 2.6335616438356165e-05, "loss": 1.6736, "step": 1382 }, { "epoch": 0.4735287823667879, "grad_norm": 0.25219112634658813, "learning_rate": 2.631849315068493e-05, "loss": 1.6389, "step": 1383 }, { "epoch": 0.47387117483415364, "grad_norm": 0.21284586191177368, "learning_rate": 2.6301369863013698e-05, "loss": 1.5864, "step": 1384 }, { "epoch": 0.47421356730151937, "grad_norm": 0.222655788064003, "learning_rate": 2.6284246575342468e-05, "loss": 1.7357, "step": 1385 }, { "epoch": 0.4745559597688851, "grad_norm": 0.21348018944263458, "learning_rate": 2.6267123287671235e-05, "loss": 1.5516, "step": 1386 }, { "epoch": 0.4748983522362508, "grad_norm": 0.24178721010684967, "learning_rate": 2.625e-05, "loss": 1.3957, "step": 1387 }, { "epoch": 0.47524074470361655, "grad_norm": 0.21366576850414276, "learning_rate": 2.6232876712328768e-05, "loss": 1.658, "step": 1388 }, { "epoch": 0.4755831371709822, "grad_norm": 0.2072124183177948, "learning_rate": 2.6215753424657534e-05, "loss": 1.6279, "step": 1389 }, { "epoch": 0.47592552963834794, "grad_norm": 0.21668630838394165, "learning_rate": 2.61986301369863e-05, "loss": 1.6003, "step": 1390 }, { "epoch": 0.47626792210571367, "grad_norm": 0.22862207889556885, "learning_rate": 2.6181506849315067e-05, "loss": 1.5395, "step": 1391 }, { "epoch": 0.4766103145730794, "grad_norm": 0.21089622378349304, "learning_rate": 2.6164383561643834e-05, "loss": 1.6915, "step": 1392 }, { "epoch": 0.4769527070404451, "grad_norm": 0.21379773318767548, "learning_rate": 2.61472602739726e-05, "loss": 1.5678, "step": 1393 }, { "epoch": 0.47729509950781085, "grad_norm": 0.22531121969223022, "learning_rate": 2.6130136986301374e-05, "loss": 1.6689, "step": 1394 }, { "epoch": 0.4776374919751765, "grad_norm": 0.19806623458862305, "learning_rate": 2.611301369863014e-05, "loss": 1.5663, "step": 1395 }, { "epoch": 0.47797988444254225, "grad_norm": 0.22509703040122986, "learning_rate": 2.6095890410958907e-05, "loss": 1.704, "step": 1396 }, { "epoch": 0.478322276909908, "grad_norm": 0.219950333237648, "learning_rate": 2.6078767123287673e-05, "loss": 1.6441, "step": 1397 }, { "epoch": 0.4786646693772737, "grad_norm": 0.20803961157798767, "learning_rate": 2.606164383561644e-05, "loss": 1.6294, "step": 1398 }, { "epoch": 0.4790070618446394, "grad_norm": 0.22049099206924438, "learning_rate": 2.6044520547945206e-05, "loss": 1.6555, "step": 1399 }, { "epoch": 0.47934945431200515, "grad_norm": 0.2052542269229889, "learning_rate": 2.6027397260273973e-05, "loss": 1.5431, "step": 1400 }, { "epoch": 0.4796918467793709, "grad_norm": 0.20496866106987, "learning_rate": 2.601027397260274e-05, "loss": 1.5238, "step": 1401 }, { "epoch": 0.48003423924673655, "grad_norm": 0.2119813710451126, "learning_rate": 2.5993150684931506e-05, "loss": 1.599, "step": 1402 }, { "epoch": 0.4803766317141023, "grad_norm": 0.2276984453201294, "learning_rate": 2.597602739726028e-05, "loss": 1.5697, "step": 1403 }, { "epoch": 0.480719024181468, "grad_norm": 0.2304016798734665, "learning_rate": 2.5958904109589046e-05, "loss": 1.5967, "step": 1404 }, { "epoch": 0.48106141664883373, "grad_norm": 0.23187491297721863, "learning_rate": 2.5941780821917812e-05, "loss": 1.6841, "step": 1405 }, { "epoch": 0.48140380911619945, "grad_norm": 0.20764100551605225, "learning_rate": 2.592465753424658e-05, "loss": 1.494, "step": 1406 }, { "epoch": 0.4817462015835652, "grad_norm": 0.22735796868801117, "learning_rate": 2.5907534246575345e-05, "loss": 1.5593, "step": 1407 }, { "epoch": 0.48208859405093085, "grad_norm": 0.21746474504470825, "learning_rate": 2.589041095890411e-05, "loss": 1.5733, "step": 1408 }, { "epoch": 0.4824309865182966, "grad_norm": 0.2104204297065735, "learning_rate": 2.5873287671232878e-05, "loss": 1.6064, "step": 1409 }, { "epoch": 0.4827733789856623, "grad_norm": 0.22824501991271973, "learning_rate": 2.5856164383561645e-05, "loss": 1.6345, "step": 1410 }, { "epoch": 0.48311577145302803, "grad_norm": 0.21339678764343262, "learning_rate": 2.5839041095890408e-05, "loss": 1.6535, "step": 1411 }, { "epoch": 0.48345816392039376, "grad_norm": 0.20829306542873383, "learning_rate": 2.582191780821918e-05, "loss": 1.6572, "step": 1412 }, { "epoch": 0.4838005563877595, "grad_norm": 0.24765554070472717, "learning_rate": 2.5804794520547948e-05, "loss": 1.6574, "step": 1413 }, { "epoch": 0.4841429488551252, "grad_norm": 0.23106792569160461, "learning_rate": 2.5787671232876714e-05, "loss": 1.6172, "step": 1414 }, { "epoch": 0.4844853413224909, "grad_norm": 0.21208971738815308, "learning_rate": 2.577054794520548e-05, "loss": 1.5387, "step": 1415 }, { "epoch": 0.4848277337898566, "grad_norm": 0.2001924067735672, "learning_rate": 2.5753424657534247e-05, "loss": 1.5821, "step": 1416 }, { "epoch": 0.48517012625722233, "grad_norm": 0.20335550606250763, "learning_rate": 2.5736301369863014e-05, "loss": 1.5617, "step": 1417 }, { "epoch": 0.48551251872458806, "grad_norm": 0.20830662548542023, "learning_rate": 2.571917808219178e-05, "loss": 1.5098, "step": 1418 }, { "epoch": 0.4858549111919538, "grad_norm": 0.2104499191045761, "learning_rate": 2.5702054794520547e-05, "loss": 1.5727, "step": 1419 }, { "epoch": 0.4861973036593195, "grad_norm": 0.22581154108047485, "learning_rate": 2.568493150684932e-05, "loss": 1.5459, "step": 1420 }, { "epoch": 0.48653969612668524, "grad_norm": 0.23391151428222656, "learning_rate": 2.5667808219178087e-05, "loss": 1.5958, "step": 1421 }, { "epoch": 0.4868820885940509, "grad_norm": 0.20863008499145508, "learning_rate": 2.5650684931506853e-05, "loss": 1.582, "step": 1422 }, { "epoch": 0.48722448106141664, "grad_norm": 0.21236760914325714, "learning_rate": 2.563356164383562e-05, "loss": 1.5088, "step": 1423 }, { "epoch": 0.48756687352878236, "grad_norm": 0.20150718092918396, "learning_rate": 2.5616438356164386e-05, "loss": 1.6103, "step": 1424 }, { "epoch": 0.4879092659961481, "grad_norm": 0.22350865602493286, "learning_rate": 2.5599315068493153e-05, "loss": 1.4807, "step": 1425 }, { "epoch": 0.4882516584635138, "grad_norm": 0.22096654772758484, "learning_rate": 2.558219178082192e-05, "loss": 1.6289, "step": 1426 }, { "epoch": 0.48859405093087954, "grad_norm": 0.2250150591135025, "learning_rate": 2.5565068493150686e-05, "loss": 1.5438, "step": 1427 }, { "epoch": 0.4889364433982452, "grad_norm": 0.22916243970394135, "learning_rate": 2.5547945205479452e-05, "loss": 1.5676, "step": 1428 }, { "epoch": 0.48927883586561094, "grad_norm": 0.25838109850883484, "learning_rate": 2.5530821917808222e-05, "loss": 1.5118, "step": 1429 }, { "epoch": 0.48962122833297667, "grad_norm": 0.23292064666748047, "learning_rate": 2.551369863013699e-05, "loss": 1.6221, "step": 1430 }, { "epoch": 0.4899636208003424, "grad_norm": 0.24059920012950897, "learning_rate": 2.5496575342465755e-05, "loss": 1.6937, "step": 1431 }, { "epoch": 0.4903060132677081, "grad_norm": 0.22448128461837769, "learning_rate": 2.547945205479452e-05, "loss": 1.623, "step": 1432 }, { "epoch": 0.49064840573507384, "grad_norm": 0.2211628258228302, "learning_rate": 2.5462328767123288e-05, "loss": 1.6279, "step": 1433 }, { "epoch": 0.49099079820243957, "grad_norm": 0.23198798298835754, "learning_rate": 2.5445205479452055e-05, "loss": 1.575, "step": 1434 }, { "epoch": 0.49133319066980524, "grad_norm": 0.2109183520078659, "learning_rate": 2.542808219178082e-05, "loss": 1.5957, "step": 1435 }, { "epoch": 0.49167558313717097, "grad_norm": 0.2549244463443756, "learning_rate": 2.5410958904109588e-05, "loss": 1.4743, "step": 1436 }, { "epoch": 0.4920179756045367, "grad_norm": 0.23161399364471436, "learning_rate": 2.5393835616438354e-05, "loss": 1.6795, "step": 1437 }, { "epoch": 0.4923603680719024, "grad_norm": 0.20496587455272675, "learning_rate": 2.5376712328767128e-05, "loss": 1.6192, "step": 1438 }, { "epoch": 0.49270276053926815, "grad_norm": 0.21225783228874207, "learning_rate": 2.5359589041095894e-05, "loss": 1.6066, "step": 1439 }, { "epoch": 0.4930451530066339, "grad_norm": 0.23768381774425507, "learning_rate": 2.534246575342466e-05, "loss": 1.6908, "step": 1440 }, { "epoch": 0.49338754547399954, "grad_norm": 0.2039780616760254, "learning_rate": 2.5325342465753427e-05, "loss": 1.4414, "step": 1441 }, { "epoch": 0.49372993794136527, "grad_norm": 0.23023417592048645, "learning_rate": 2.5308219178082194e-05, "loss": 1.6761, "step": 1442 }, { "epoch": 0.494072330408731, "grad_norm": 0.21975712478160858, "learning_rate": 2.529109589041096e-05, "loss": 1.5855, "step": 1443 }, { "epoch": 0.4944147228760967, "grad_norm": 0.23066382110118866, "learning_rate": 2.5273972602739727e-05, "loss": 1.6893, "step": 1444 }, { "epoch": 0.49475711534346245, "grad_norm": 0.22831125557422638, "learning_rate": 2.5256849315068493e-05, "loss": 1.5991, "step": 1445 }, { "epoch": 0.4950995078108282, "grad_norm": 0.20304207503795624, "learning_rate": 2.523972602739726e-05, "loss": 1.5302, "step": 1446 }, { "epoch": 0.4954419002781939, "grad_norm": 0.21516317129135132, "learning_rate": 2.522260273972603e-05, "loss": 1.6137, "step": 1447 }, { "epoch": 0.4957842927455596, "grad_norm": 0.21381838619709015, "learning_rate": 2.5205479452054796e-05, "loss": 1.588, "step": 1448 }, { "epoch": 0.4961266852129253, "grad_norm": 0.21585769951343536, "learning_rate": 2.5188356164383563e-05, "loss": 1.6271, "step": 1449 }, { "epoch": 0.496469077680291, "grad_norm": 0.21562549471855164, "learning_rate": 2.517123287671233e-05, "loss": 1.4748, "step": 1450 }, { "epoch": 0.49681147014765675, "grad_norm": 0.24012723565101624, "learning_rate": 2.5154109589041096e-05, "loss": 1.5409, "step": 1451 }, { "epoch": 0.4971538626150225, "grad_norm": 0.22319619357585907, "learning_rate": 2.5136986301369862e-05, "loss": 1.5359, "step": 1452 }, { "epoch": 0.4974962550823882, "grad_norm": 0.23002086579799652, "learning_rate": 2.511986301369863e-05, "loss": 1.6345, "step": 1453 }, { "epoch": 0.49783864754975393, "grad_norm": 0.23329651355743408, "learning_rate": 2.5102739726027395e-05, "loss": 1.5783, "step": 1454 }, { "epoch": 0.4981810400171196, "grad_norm": 0.24626342952251434, "learning_rate": 2.5085616438356162e-05, "loss": 1.7721, "step": 1455 }, { "epoch": 0.49852343248448533, "grad_norm": 0.22375771403312683, "learning_rate": 2.5068493150684935e-05, "loss": 1.5924, "step": 1456 }, { "epoch": 0.49886582495185106, "grad_norm": 0.2185361087322235, "learning_rate": 2.50513698630137e-05, "loss": 1.5471, "step": 1457 }, { "epoch": 0.4992082174192168, "grad_norm": 0.22683893144130707, "learning_rate": 2.5034246575342468e-05, "loss": 1.6316, "step": 1458 }, { "epoch": 0.4995506098865825, "grad_norm": 0.22761863470077515, "learning_rate": 2.5017123287671235e-05, "loss": 1.5336, "step": 1459 }, { "epoch": 0.49989300235394823, "grad_norm": 0.23298339545726776, "learning_rate": 2.5e-05, "loss": 1.5767, "step": 1460 }, { "epoch": 0.500235394821314, "grad_norm": 0.22829507291316986, "learning_rate": 2.4982876712328768e-05, "loss": 1.6258, "step": 1461 }, { "epoch": 0.5005777872886796, "grad_norm": 0.21002766489982605, "learning_rate": 2.4965753424657534e-05, "loss": 1.6323, "step": 1462 }, { "epoch": 0.5009201797560454, "grad_norm": 0.24211904406547546, "learning_rate": 2.4948630136986304e-05, "loss": 1.7135, "step": 1463 }, { "epoch": 0.5012625722234111, "grad_norm": 0.22013267874717712, "learning_rate": 2.493150684931507e-05, "loss": 1.6538, "step": 1464 }, { "epoch": 0.5016049646907768, "grad_norm": 0.24439381062984467, "learning_rate": 2.4914383561643837e-05, "loss": 1.6416, "step": 1465 }, { "epoch": 0.5019473571581425, "grad_norm": 0.23885585367679596, "learning_rate": 2.4897260273972604e-05, "loss": 1.6261, "step": 1466 }, { "epoch": 0.5022897496255082, "grad_norm": 0.24935171008110046, "learning_rate": 2.4880136986301374e-05, "loss": 1.5769, "step": 1467 }, { "epoch": 0.502632142092874, "grad_norm": 0.225867360830307, "learning_rate": 2.486301369863014e-05, "loss": 1.6436, "step": 1468 }, { "epoch": 0.5029745345602397, "grad_norm": 0.22034500539302826, "learning_rate": 2.4845890410958907e-05, "loss": 1.535, "step": 1469 }, { "epoch": 0.5033169270276054, "grad_norm": 0.20695674419403076, "learning_rate": 2.4828767123287673e-05, "loss": 1.6487, "step": 1470 }, { "epoch": 0.5036593194949711, "grad_norm": 0.22085557878017426, "learning_rate": 2.481164383561644e-05, "loss": 1.5682, "step": 1471 }, { "epoch": 0.5040017119623368, "grad_norm": 0.21975381672382355, "learning_rate": 2.4794520547945206e-05, "loss": 1.683, "step": 1472 }, { "epoch": 0.5043441044297026, "grad_norm": 0.22500766813755035, "learning_rate": 2.4777397260273973e-05, "loss": 1.5505, "step": 1473 }, { "epoch": 0.5046864968970682, "grad_norm": 0.21636207401752472, "learning_rate": 2.476027397260274e-05, "loss": 1.6434, "step": 1474 }, { "epoch": 0.505028889364434, "grad_norm": 0.25084245204925537, "learning_rate": 2.4743150684931506e-05, "loss": 1.616, "step": 1475 }, { "epoch": 0.5053712818317997, "grad_norm": 0.2152504026889801, "learning_rate": 2.4726027397260276e-05, "loss": 1.5895, "step": 1476 }, { "epoch": 0.5057136742991655, "grad_norm": 0.24195416271686554, "learning_rate": 2.4708904109589042e-05, "loss": 1.532, "step": 1477 }, { "epoch": 0.5060560667665311, "grad_norm": 0.21612657606601715, "learning_rate": 2.469178082191781e-05, "loss": 1.5766, "step": 1478 }, { "epoch": 0.5063984592338968, "grad_norm": 0.22496864199638367, "learning_rate": 2.4674657534246575e-05, "loss": 1.648, "step": 1479 }, { "epoch": 0.5067408517012626, "grad_norm": 0.2303045243024826, "learning_rate": 2.4657534246575342e-05, "loss": 1.5609, "step": 1480 }, { "epoch": 0.5070832441686283, "grad_norm": 0.22485658526420593, "learning_rate": 2.4640410958904112e-05, "loss": 1.7207, "step": 1481 }, { "epoch": 0.507425636635994, "grad_norm": 0.21428076922893524, "learning_rate": 2.4623287671232878e-05, "loss": 1.6194, "step": 1482 }, { "epoch": 0.5077680291033597, "grad_norm": 0.21910762786865234, "learning_rate": 2.4606164383561645e-05, "loss": 1.6274, "step": 1483 }, { "epoch": 0.5081104215707254, "grad_norm": 0.21260958909988403, "learning_rate": 2.458904109589041e-05, "loss": 1.6357, "step": 1484 }, { "epoch": 0.5084528140380912, "grad_norm": 0.22308754920959473, "learning_rate": 2.457191780821918e-05, "loss": 1.6037, "step": 1485 }, { "epoch": 0.5087952065054568, "grad_norm": 0.22250205278396606, "learning_rate": 2.4554794520547948e-05, "loss": 1.5656, "step": 1486 }, { "epoch": 0.5091375989728226, "grad_norm": 0.20564071834087372, "learning_rate": 2.4537671232876714e-05, "loss": 1.5467, "step": 1487 }, { "epoch": 0.5094799914401883, "grad_norm": 0.20976704359054565, "learning_rate": 2.452054794520548e-05, "loss": 1.6678, "step": 1488 }, { "epoch": 0.5098223839075541, "grad_norm": 0.2512046992778778, "learning_rate": 2.450342465753425e-05, "loss": 1.6654, "step": 1489 }, { "epoch": 0.5101647763749197, "grad_norm": 0.2132597714662552, "learning_rate": 2.4486301369863017e-05, "loss": 1.4906, "step": 1490 }, { "epoch": 0.5105071688422854, "grad_norm": 0.23324142396450043, "learning_rate": 2.4469178082191784e-05, "loss": 1.563, "step": 1491 }, { "epoch": 0.5108495613096512, "grad_norm": 0.22722558677196503, "learning_rate": 2.445205479452055e-05, "loss": 1.6818, "step": 1492 }, { "epoch": 0.5111919537770169, "grad_norm": 0.21367134153842926, "learning_rate": 2.4434931506849317e-05, "loss": 1.5429, "step": 1493 }, { "epoch": 0.5115343462443827, "grad_norm": 0.219669371843338, "learning_rate": 2.4417808219178083e-05, "loss": 1.6342, "step": 1494 }, { "epoch": 0.5118767387117483, "grad_norm": 0.21167896687984467, "learning_rate": 2.440068493150685e-05, "loss": 1.632, "step": 1495 }, { "epoch": 0.5122191311791141, "grad_norm": 0.21551838517189026, "learning_rate": 2.4383561643835616e-05, "loss": 1.5618, "step": 1496 }, { "epoch": 0.5125615236464798, "grad_norm": 0.20259207487106323, "learning_rate": 2.4366438356164383e-05, "loss": 1.5024, "step": 1497 }, { "epoch": 0.5129039161138454, "grad_norm": 0.24249635636806488, "learning_rate": 2.4349315068493153e-05, "loss": 1.7068, "step": 1498 }, { "epoch": 0.5132463085812112, "grad_norm": 0.2337043583393097, "learning_rate": 2.433219178082192e-05, "loss": 1.4958, "step": 1499 }, { "epoch": 0.5135887010485769, "grad_norm": 0.2100091427564621, "learning_rate": 2.4315068493150686e-05, "loss": 1.6967, "step": 1500 }, { "epoch": 0.5139310935159427, "grad_norm": 0.220554918050766, "learning_rate": 2.4297945205479452e-05, "loss": 1.7446, "step": 1501 }, { "epoch": 0.5142734859833084, "grad_norm": 0.22338004410266876, "learning_rate": 2.428082191780822e-05, "loss": 1.6547, "step": 1502 }, { "epoch": 0.5146158784506741, "grad_norm": 0.24810214340686798, "learning_rate": 2.426369863013699e-05, "loss": 1.6717, "step": 1503 }, { "epoch": 0.5149582709180398, "grad_norm": 0.22453364729881287, "learning_rate": 2.4246575342465755e-05, "loss": 1.5688, "step": 1504 }, { "epoch": 0.5153006633854055, "grad_norm": 0.22125643491744995, "learning_rate": 2.4229452054794522e-05, "loss": 1.5725, "step": 1505 }, { "epoch": 0.5156430558527713, "grad_norm": 0.2346554696559906, "learning_rate": 2.4212328767123288e-05, "loss": 1.5408, "step": 1506 }, { "epoch": 0.5159854483201369, "grad_norm": 0.222235769033432, "learning_rate": 2.4195205479452058e-05, "loss": 1.5709, "step": 1507 }, { "epoch": 0.5163278407875027, "grad_norm": 0.21163469552993774, "learning_rate": 2.4178082191780825e-05, "loss": 1.6618, "step": 1508 }, { "epoch": 0.5166702332548684, "grad_norm": 0.23221734166145325, "learning_rate": 2.416095890410959e-05, "loss": 1.6576, "step": 1509 }, { "epoch": 0.5170126257222342, "grad_norm": 0.23137521743774414, "learning_rate": 2.4143835616438358e-05, "loss": 1.7128, "step": 1510 }, { "epoch": 0.5173550181895998, "grad_norm": 0.220950186252594, "learning_rate": 2.4126712328767124e-05, "loss": 1.5834, "step": 1511 }, { "epoch": 0.5176974106569655, "grad_norm": 0.23572193086147308, "learning_rate": 2.410958904109589e-05, "loss": 1.677, "step": 1512 }, { "epoch": 0.5180398031243313, "grad_norm": 0.23055945336818695, "learning_rate": 2.4092465753424657e-05, "loss": 1.6201, "step": 1513 }, { "epoch": 0.518382195591697, "grad_norm": 0.22088348865509033, "learning_rate": 2.4075342465753424e-05, "loss": 1.5767, "step": 1514 }, { "epoch": 0.5187245880590627, "grad_norm": 0.2388024926185608, "learning_rate": 2.405821917808219e-05, "loss": 1.6488, "step": 1515 }, { "epoch": 0.5190669805264284, "grad_norm": 0.221141055226326, "learning_rate": 2.404109589041096e-05, "loss": 1.6144, "step": 1516 }, { "epoch": 0.5194093729937941, "grad_norm": 0.23438028991222382, "learning_rate": 2.4023972602739727e-05, "loss": 1.5776, "step": 1517 }, { "epoch": 0.5197517654611599, "grad_norm": 0.24299070239067078, "learning_rate": 2.4006849315068493e-05, "loss": 1.602, "step": 1518 }, { "epoch": 0.5200941579285255, "grad_norm": 0.23127304017543793, "learning_rate": 2.398972602739726e-05, "loss": 1.5393, "step": 1519 }, { "epoch": 0.5204365503958913, "grad_norm": 0.2256263941526413, "learning_rate": 2.3972602739726026e-05, "loss": 1.6197, "step": 1520 }, { "epoch": 0.520778942863257, "grad_norm": 0.22691130638122559, "learning_rate": 2.3955479452054796e-05, "loss": 1.5997, "step": 1521 }, { "epoch": 0.5211213353306228, "grad_norm": 0.218649223446846, "learning_rate": 2.3938356164383563e-05, "loss": 1.6767, "step": 1522 }, { "epoch": 0.5214637277979884, "grad_norm": 0.2421543449163437, "learning_rate": 2.392123287671233e-05, "loss": 1.6577, "step": 1523 }, { "epoch": 0.5218061202653541, "grad_norm": 0.19946743547916412, "learning_rate": 2.3904109589041096e-05, "loss": 1.5549, "step": 1524 }, { "epoch": 0.5221485127327199, "grad_norm": 0.22269977629184723, "learning_rate": 2.3886986301369866e-05, "loss": 1.6462, "step": 1525 }, { "epoch": 0.5224909052000856, "grad_norm": 0.22810249030590057, "learning_rate": 2.3869863013698632e-05, "loss": 1.6978, "step": 1526 }, { "epoch": 0.5228332976674513, "grad_norm": 0.22720137238502502, "learning_rate": 2.38527397260274e-05, "loss": 1.6101, "step": 1527 }, { "epoch": 0.523175690134817, "grad_norm": 0.2173788696527481, "learning_rate": 2.3835616438356165e-05, "loss": 1.4444, "step": 1528 }, { "epoch": 0.5235180826021828, "grad_norm": 0.22740307450294495, "learning_rate": 2.3818493150684935e-05, "loss": 1.6725, "step": 1529 }, { "epoch": 0.5238604750695485, "grad_norm": 0.22878222167491913, "learning_rate": 2.3801369863013702e-05, "loss": 1.6162, "step": 1530 }, { "epoch": 0.5242028675369141, "grad_norm": 0.22543762624263763, "learning_rate": 2.3784246575342468e-05, "loss": 1.6163, "step": 1531 }, { "epoch": 0.5245452600042799, "grad_norm": 0.2112475037574768, "learning_rate": 2.3767123287671235e-05, "loss": 1.6026, "step": 1532 }, { "epoch": 0.5248876524716456, "grad_norm": 0.23907679319381714, "learning_rate": 2.375e-05, "loss": 1.5217, "step": 1533 }, { "epoch": 0.5252300449390114, "grad_norm": 0.20684880018234253, "learning_rate": 2.3732876712328768e-05, "loss": 1.5196, "step": 1534 }, { "epoch": 0.525572437406377, "grad_norm": 0.23139870166778564, "learning_rate": 2.3715753424657534e-05, "loss": 1.6212, "step": 1535 }, { "epoch": 0.5259148298737428, "grad_norm": 0.24150854349136353, "learning_rate": 2.36986301369863e-05, "loss": 1.6938, "step": 1536 }, { "epoch": 0.5262572223411085, "grad_norm": 0.21386957168579102, "learning_rate": 2.3681506849315067e-05, "loss": 1.5113, "step": 1537 }, { "epoch": 0.5265996148084742, "grad_norm": 0.26949217915534973, "learning_rate": 2.3664383561643837e-05, "loss": 1.5812, "step": 1538 }, { "epoch": 0.52694200727584, "grad_norm": 0.2250927984714508, "learning_rate": 2.3647260273972604e-05, "loss": 1.7149, "step": 1539 }, { "epoch": 0.5272843997432056, "grad_norm": 0.2275020331144333, "learning_rate": 2.363013698630137e-05, "loss": 1.6151, "step": 1540 }, { "epoch": 0.5276267922105714, "grad_norm": 0.2361801266670227, "learning_rate": 2.3613013698630137e-05, "loss": 1.5673, "step": 1541 }, { "epoch": 0.5279691846779371, "grad_norm": 0.22164815664291382, "learning_rate": 2.3595890410958903e-05, "loss": 1.5368, "step": 1542 }, { "epoch": 0.5283115771453029, "grad_norm": 0.2048596292734146, "learning_rate": 2.3578767123287673e-05, "loss": 1.5647, "step": 1543 }, { "epoch": 0.5286539696126685, "grad_norm": 0.2020074427127838, "learning_rate": 2.356164383561644e-05, "loss": 1.6262, "step": 1544 }, { "epoch": 0.5289963620800342, "grad_norm": 0.2308894693851471, "learning_rate": 2.3544520547945206e-05, "loss": 1.509, "step": 1545 }, { "epoch": 0.5293387545474, "grad_norm": 0.23261059820652008, "learning_rate": 2.3527397260273973e-05, "loss": 1.6554, "step": 1546 }, { "epoch": 0.5296811470147657, "grad_norm": 0.2134784460067749, "learning_rate": 2.3510273972602743e-05, "loss": 1.56, "step": 1547 }, { "epoch": 0.5300235394821314, "grad_norm": 0.2147068828344345, "learning_rate": 2.349315068493151e-05, "loss": 1.5635, "step": 1548 }, { "epoch": 0.5303659319494971, "grad_norm": 0.2250356376171112, "learning_rate": 2.3476027397260276e-05, "loss": 1.5909, "step": 1549 }, { "epoch": 0.5307083244168628, "grad_norm": 0.24001911282539368, "learning_rate": 2.3458904109589042e-05, "loss": 1.559, "step": 1550 }, { "epoch": 0.5310507168842286, "grad_norm": 0.2418724149465561, "learning_rate": 2.344178082191781e-05, "loss": 1.5882, "step": 1551 }, { "epoch": 0.5313931093515942, "grad_norm": 0.21210859715938568, "learning_rate": 2.342465753424658e-05, "loss": 1.5873, "step": 1552 }, { "epoch": 0.53173550181896, "grad_norm": 0.222972571849823, "learning_rate": 2.3407534246575345e-05, "loss": 1.6808, "step": 1553 }, { "epoch": 0.5320778942863257, "grad_norm": 0.22712326049804688, "learning_rate": 2.3390410958904112e-05, "loss": 1.6093, "step": 1554 }, { "epoch": 0.5324202867536915, "grad_norm": 0.24129581451416016, "learning_rate": 2.337328767123288e-05, "loss": 1.6513, "step": 1555 }, { "epoch": 0.5327626792210571, "grad_norm": 0.2144259512424469, "learning_rate": 2.3356164383561645e-05, "loss": 1.5504, "step": 1556 }, { "epoch": 0.5331050716884228, "grad_norm": 0.21459950506687164, "learning_rate": 2.333904109589041e-05, "loss": 1.579, "step": 1557 }, { "epoch": 0.5334474641557886, "grad_norm": 0.2177567183971405, "learning_rate": 2.3321917808219178e-05, "loss": 1.5503, "step": 1558 }, { "epoch": 0.5337898566231543, "grad_norm": 0.22202295064926147, "learning_rate": 2.3304794520547944e-05, "loss": 1.6406, "step": 1559 }, { "epoch": 0.53413224909052, "grad_norm": 0.2196848839521408, "learning_rate": 2.328767123287671e-05, "loss": 1.5743, "step": 1560 }, { "epoch": 0.5344746415578857, "grad_norm": 0.21995379030704498, "learning_rate": 2.327054794520548e-05, "loss": 1.6322, "step": 1561 }, { "epoch": 0.5348170340252515, "grad_norm": 0.22813719511032104, "learning_rate": 2.3253424657534247e-05, "loss": 1.6145, "step": 1562 }, { "epoch": 0.5351594264926172, "grad_norm": 0.2533990144729614, "learning_rate": 2.3236301369863014e-05, "loss": 1.6752, "step": 1563 }, { "epoch": 0.5355018189599828, "grad_norm": 0.21823281049728394, "learning_rate": 2.321917808219178e-05, "loss": 1.5619, "step": 1564 }, { "epoch": 0.5358442114273486, "grad_norm": 0.2734806537628174, "learning_rate": 2.320205479452055e-05, "loss": 1.465, "step": 1565 }, { "epoch": 0.5361866038947143, "grad_norm": 0.24016733467578888, "learning_rate": 2.3184931506849317e-05, "loss": 1.579, "step": 1566 }, { "epoch": 0.5365289963620801, "grad_norm": 0.25029322504997253, "learning_rate": 2.3167808219178083e-05, "loss": 1.4907, "step": 1567 }, { "epoch": 0.5368713888294457, "grad_norm": 0.24213840067386627, "learning_rate": 2.315068493150685e-05, "loss": 1.6404, "step": 1568 }, { "epoch": 0.5372137812968115, "grad_norm": 0.24716584384441376, "learning_rate": 2.3133561643835616e-05, "loss": 1.6934, "step": 1569 }, { "epoch": 0.5375561737641772, "grad_norm": 0.2601795792579651, "learning_rate": 2.3116438356164386e-05, "loss": 1.6488, "step": 1570 }, { "epoch": 0.5378985662315429, "grad_norm": 0.23896238207817078, "learning_rate": 2.3099315068493153e-05, "loss": 1.5605, "step": 1571 }, { "epoch": 0.5382409586989086, "grad_norm": 0.2321603149175644, "learning_rate": 2.308219178082192e-05, "loss": 1.5559, "step": 1572 }, { "epoch": 0.5385833511662743, "grad_norm": 0.21941959857940674, "learning_rate": 2.3065068493150686e-05, "loss": 1.6103, "step": 1573 }, { "epoch": 0.5389257436336401, "grad_norm": 0.24602070450782776, "learning_rate": 2.3047945205479456e-05, "loss": 1.6158, "step": 1574 }, { "epoch": 0.5392681361010058, "grad_norm": 0.2338927686214447, "learning_rate": 2.3030821917808222e-05, "loss": 1.522, "step": 1575 }, { "epoch": 0.5396105285683716, "grad_norm": 0.24326220154762268, "learning_rate": 2.301369863013699e-05, "loss": 1.5927, "step": 1576 }, { "epoch": 0.5399529210357372, "grad_norm": 0.21912309527397156, "learning_rate": 2.2996575342465755e-05, "loss": 1.5363, "step": 1577 }, { "epoch": 0.5402953135031029, "grad_norm": 0.21627534925937653, "learning_rate": 2.2979452054794522e-05, "loss": 1.5219, "step": 1578 }, { "epoch": 0.5406377059704687, "grad_norm": 0.23941653966903687, "learning_rate": 2.296232876712329e-05, "loss": 1.5107, "step": 1579 }, { "epoch": 0.5409800984378343, "grad_norm": 0.22710943222045898, "learning_rate": 2.2945205479452055e-05, "loss": 1.6144, "step": 1580 }, { "epoch": 0.5413224909052001, "grad_norm": 0.22426354885101318, "learning_rate": 2.292808219178082e-05, "loss": 1.6067, "step": 1581 }, { "epoch": 0.5416648833725658, "grad_norm": 0.24455572664737701, "learning_rate": 2.2910958904109588e-05, "loss": 1.639, "step": 1582 }, { "epoch": 0.5420072758399315, "grad_norm": 0.24584674835205078, "learning_rate": 2.2893835616438358e-05, "loss": 1.6781, "step": 1583 }, { "epoch": 0.5423496683072973, "grad_norm": 0.23162931203842163, "learning_rate": 2.2876712328767124e-05, "loss": 1.6093, "step": 1584 }, { "epoch": 0.5426920607746629, "grad_norm": 0.26014482975006104, "learning_rate": 2.285958904109589e-05, "loss": 1.6474, "step": 1585 }, { "epoch": 0.5430344532420287, "grad_norm": 0.21688483655452728, "learning_rate": 2.2842465753424657e-05, "loss": 1.611, "step": 1586 }, { "epoch": 0.5433768457093944, "grad_norm": 0.2277059406042099, "learning_rate": 2.2825342465753427e-05, "loss": 1.6652, "step": 1587 }, { "epoch": 0.5437192381767602, "grad_norm": 0.2542749047279358, "learning_rate": 2.2808219178082194e-05, "loss": 1.5926, "step": 1588 }, { "epoch": 0.5440616306441258, "grad_norm": 0.24835966527462006, "learning_rate": 2.279109589041096e-05, "loss": 1.5776, "step": 1589 }, { "epoch": 0.5444040231114915, "grad_norm": 0.235756978392601, "learning_rate": 2.2773972602739727e-05, "loss": 1.6148, "step": 1590 }, { "epoch": 0.5447464155788573, "grad_norm": 0.2247740477323532, "learning_rate": 2.2756849315068493e-05, "loss": 1.6238, "step": 1591 }, { "epoch": 0.545088808046223, "grad_norm": 0.265891432762146, "learning_rate": 2.2739726027397263e-05, "loss": 1.6642, "step": 1592 }, { "epoch": 0.5454312005135887, "grad_norm": 0.23127974569797516, "learning_rate": 2.272260273972603e-05, "loss": 1.5998, "step": 1593 }, { "epoch": 0.5457735929809544, "grad_norm": 0.23566856980323792, "learning_rate": 2.2705479452054796e-05, "loss": 1.628, "step": 1594 }, { "epoch": 0.5461159854483202, "grad_norm": 0.23019731044769287, "learning_rate": 2.2688356164383563e-05, "loss": 1.5357, "step": 1595 }, { "epoch": 0.5464583779156859, "grad_norm": 0.2261570692062378, "learning_rate": 2.267123287671233e-05, "loss": 1.5928, "step": 1596 }, { "epoch": 0.5468007703830515, "grad_norm": 0.23164141178131104, "learning_rate": 2.2654109589041096e-05, "loss": 1.5353, "step": 1597 }, { "epoch": 0.5471431628504173, "grad_norm": 0.22104336321353912, "learning_rate": 2.2636986301369862e-05, "loss": 1.5425, "step": 1598 }, { "epoch": 0.547485555317783, "grad_norm": 0.24000659584999084, "learning_rate": 2.261986301369863e-05, "loss": 1.5547, "step": 1599 }, { "epoch": 0.5478279477851488, "grad_norm": 0.23938530683517456, "learning_rate": 2.2602739726027396e-05, "loss": 1.5574, "step": 1600 }, { "epoch": 0.5481703402525144, "grad_norm": 0.24615444242954254, "learning_rate": 2.2585616438356165e-05, "loss": 1.4962, "step": 1601 }, { "epoch": 0.5485127327198802, "grad_norm": 0.23723189532756805, "learning_rate": 2.2568493150684932e-05, "loss": 1.6227, "step": 1602 }, { "epoch": 0.5488551251872459, "grad_norm": 0.2396005243062973, "learning_rate": 2.25513698630137e-05, "loss": 1.584, "step": 1603 }, { "epoch": 0.5491975176546116, "grad_norm": 0.2152239829301834, "learning_rate": 2.2534246575342465e-05, "loss": 1.6169, "step": 1604 }, { "epoch": 0.5495399101219773, "grad_norm": 0.22672045230865479, "learning_rate": 2.2517123287671235e-05, "loss": 1.6509, "step": 1605 }, { "epoch": 0.549882302589343, "grad_norm": 0.22216498851776123, "learning_rate": 2.25e-05, "loss": 1.5816, "step": 1606 }, { "epoch": 0.5502246950567088, "grad_norm": 0.21737933158874512, "learning_rate": 2.2482876712328768e-05, "loss": 1.6158, "step": 1607 }, { "epoch": 0.5505670875240745, "grad_norm": 0.22094932198524475, "learning_rate": 2.2465753424657534e-05, "loss": 1.633, "step": 1608 }, { "epoch": 0.5509094799914402, "grad_norm": 0.231331467628479, "learning_rate": 2.24486301369863e-05, "loss": 1.6666, "step": 1609 }, { "epoch": 0.5512518724588059, "grad_norm": 0.21686817705631256, "learning_rate": 2.243150684931507e-05, "loss": 1.4728, "step": 1610 }, { "epoch": 0.5515942649261716, "grad_norm": 0.23325316607952118, "learning_rate": 2.2414383561643837e-05, "loss": 1.6902, "step": 1611 }, { "epoch": 0.5519366573935374, "grad_norm": 0.23317265510559082, "learning_rate": 2.2397260273972604e-05, "loss": 1.6351, "step": 1612 }, { "epoch": 0.552279049860903, "grad_norm": 0.22096872329711914, "learning_rate": 2.238013698630137e-05, "loss": 1.6516, "step": 1613 }, { "epoch": 0.5526214423282688, "grad_norm": 0.22047415375709534, "learning_rate": 2.236301369863014e-05, "loss": 1.6298, "step": 1614 }, { "epoch": 0.5529638347956345, "grad_norm": 0.22719867527484894, "learning_rate": 2.2345890410958907e-05, "loss": 1.6827, "step": 1615 }, { "epoch": 0.5533062272630002, "grad_norm": 0.23787765204906464, "learning_rate": 2.2328767123287673e-05, "loss": 1.6088, "step": 1616 }, { "epoch": 0.5536486197303659, "grad_norm": 0.21341276168823242, "learning_rate": 2.231164383561644e-05, "loss": 1.505, "step": 1617 }, { "epoch": 0.5539910121977316, "grad_norm": 0.23238500952720642, "learning_rate": 2.2294520547945206e-05, "loss": 1.6278, "step": 1618 }, { "epoch": 0.5543334046650974, "grad_norm": 0.2351275384426117, "learning_rate": 2.2277397260273973e-05, "loss": 1.6487, "step": 1619 }, { "epoch": 0.5546757971324631, "grad_norm": 0.22929498553276062, "learning_rate": 2.226027397260274e-05, "loss": 1.5262, "step": 1620 }, { "epoch": 0.5550181895998288, "grad_norm": 0.22353193163871765, "learning_rate": 2.2243150684931506e-05, "loss": 1.6125, "step": 1621 }, { "epoch": 0.5553605820671945, "grad_norm": 0.2138550877571106, "learning_rate": 2.2226027397260273e-05, "loss": 1.6025, "step": 1622 }, { "epoch": 0.5557029745345602, "grad_norm": 0.2425224632024765, "learning_rate": 2.2208904109589042e-05, "loss": 1.643, "step": 1623 }, { "epoch": 0.556045367001926, "grad_norm": 0.25362884998321533, "learning_rate": 2.219178082191781e-05, "loss": 1.5518, "step": 1624 }, { "epoch": 0.5563877594692916, "grad_norm": 0.2546079456806183, "learning_rate": 2.2174657534246575e-05, "loss": 1.6091, "step": 1625 }, { "epoch": 0.5567301519366574, "grad_norm": 0.234421506524086, "learning_rate": 2.2157534246575342e-05, "loss": 1.6, "step": 1626 }, { "epoch": 0.5570725444040231, "grad_norm": 0.2165205478668213, "learning_rate": 2.2140410958904112e-05, "loss": 1.5441, "step": 1627 }, { "epoch": 0.5574149368713889, "grad_norm": 0.22742505371570587, "learning_rate": 2.212328767123288e-05, "loss": 1.4956, "step": 1628 }, { "epoch": 0.5577573293387545, "grad_norm": 0.2368532121181488, "learning_rate": 2.2106164383561645e-05, "loss": 1.5536, "step": 1629 }, { "epoch": 0.5580997218061202, "grad_norm": 0.23830032348632812, "learning_rate": 2.208904109589041e-05, "loss": 1.6879, "step": 1630 }, { "epoch": 0.558442114273486, "grad_norm": 0.2280074805021286, "learning_rate": 2.2071917808219178e-05, "loss": 1.4739, "step": 1631 }, { "epoch": 0.5587845067408517, "grad_norm": 0.23410961031913757, "learning_rate": 2.2054794520547948e-05, "loss": 1.6141, "step": 1632 }, { "epoch": 0.5591268992082175, "grad_norm": 0.24126195907592773, "learning_rate": 2.2037671232876714e-05, "loss": 1.5059, "step": 1633 }, { "epoch": 0.5594692916755831, "grad_norm": 0.21702978014945984, "learning_rate": 2.202054794520548e-05, "loss": 1.5767, "step": 1634 }, { "epoch": 0.5598116841429489, "grad_norm": 0.22479255497455597, "learning_rate": 2.2003424657534247e-05, "loss": 1.6049, "step": 1635 }, { "epoch": 0.5601540766103146, "grad_norm": 0.2325708270072937, "learning_rate": 2.1986301369863017e-05, "loss": 1.6168, "step": 1636 }, { "epoch": 0.5604964690776802, "grad_norm": 0.25178098678588867, "learning_rate": 2.1969178082191784e-05, "loss": 1.5299, "step": 1637 }, { "epoch": 0.560838861545046, "grad_norm": 0.2344963401556015, "learning_rate": 2.195205479452055e-05, "loss": 1.6857, "step": 1638 }, { "epoch": 0.5611812540124117, "grad_norm": 0.2668394446372986, "learning_rate": 2.1934931506849317e-05, "loss": 1.6641, "step": 1639 }, { "epoch": 0.5615236464797775, "grad_norm": 0.231542706489563, "learning_rate": 2.1917808219178083e-05, "loss": 1.5496, "step": 1640 }, { "epoch": 0.5618660389471432, "grad_norm": 0.228193461894989, "learning_rate": 2.190068493150685e-05, "loss": 1.5663, "step": 1641 }, { "epoch": 0.5622084314145089, "grad_norm": 0.27183473110198975, "learning_rate": 2.1883561643835617e-05, "loss": 1.5716, "step": 1642 }, { "epoch": 0.5625508238818746, "grad_norm": 0.2290869802236557, "learning_rate": 2.1866438356164383e-05, "loss": 1.5253, "step": 1643 }, { "epoch": 0.5628932163492403, "grad_norm": 0.23135529458522797, "learning_rate": 2.184931506849315e-05, "loss": 1.6526, "step": 1644 }, { "epoch": 0.5632356088166061, "grad_norm": 0.23143847286701202, "learning_rate": 2.183219178082192e-05, "loss": 1.597, "step": 1645 }, { "epoch": 0.5635780012839717, "grad_norm": 0.2501331865787506, "learning_rate": 2.1815068493150686e-05, "loss": 1.6487, "step": 1646 }, { "epoch": 0.5639203937513375, "grad_norm": 0.25014063715934753, "learning_rate": 2.1797945205479453e-05, "loss": 1.5521, "step": 1647 }, { "epoch": 0.5642627862187032, "grad_norm": 0.2827690839767456, "learning_rate": 2.178082191780822e-05, "loss": 1.6047, "step": 1648 }, { "epoch": 0.5646051786860689, "grad_norm": 0.23569181561470032, "learning_rate": 2.1763698630136986e-05, "loss": 1.532, "step": 1649 }, { "epoch": 0.5649475711534346, "grad_norm": 0.2559454143047333, "learning_rate": 2.1746575342465755e-05, "loss": 1.4877, "step": 1650 }, { "epoch": 0.5652899636208003, "grad_norm": 0.23591335117816925, "learning_rate": 2.1729452054794522e-05, "loss": 1.6206, "step": 1651 }, { "epoch": 0.5656323560881661, "grad_norm": 0.23772075772285461, "learning_rate": 2.171232876712329e-05, "loss": 1.5534, "step": 1652 }, { "epoch": 0.5659747485555318, "grad_norm": 0.25165796279907227, "learning_rate": 2.1695205479452055e-05, "loss": 1.5819, "step": 1653 }, { "epoch": 0.5663171410228975, "grad_norm": 0.23782005906105042, "learning_rate": 2.1678082191780825e-05, "loss": 1.6155, "step": 1654 }, { "epoch": 0.5666595334902632, "grad_norm": 0.21985334157943726, "learning_rate": 2.166095890410959e-05, "loss": 1.6063, "step": 1655 }, { "epoch": 0.5670019259576289, "grad_norm": 0.23752830922603607, "learning_rate": 2.1643835616438358e-05, "loss": 1.6375, "step": 1656 }, { "epoch": 0.5673443184249947, "grad_norm": 0.23364311456680298, "learning_rate": 2.1626712328767125e-05, "loss": 1.5727, "step": 1657 }, { "epoch": 0.5676867108923603, "grad_norm": 0.21874487400054932, "learning_rate": 2.160958904109589e-05, "loss": 1.4759, "step": 1658 }, { "epoch": 0.5680291033597261, "grad_norm": 0.2211999148130417, "learning_rate": 2.159246575342466e-05, "loss": 1.5244, "step": 1659 }, { "epoch": 0.5683714958270918, "grad_norm": 0.2445869743824005, "learning_rate": 2.1575342465753427e-05, "loss": 1.6532, "step": 1660 }, { "epoch": 0.5687138882944576, "grad_norm": 0.25994980335235596, "learning_rate": 2.1558219178082194e-05, "loss": 1.6284, "step": 1661 }, { "epoch": 0.5690562807618232, "grad_norm": 0.22349977493286133, "learning_rate": 2.154109589041096e-05, "loss": 1.6526, "step": 1662 }, { "epoch": 0.5693986732291889, "grad_norm": 0.21756146848201752, "learning_rate": 2.1523972602739727e-05, "loss": 1.5736, "step": 1663 }, { "epoch": 0.5697410656965547, "grad_norm": 0.21431857347488403, "learning_rate": 2.1506849315068494e-05, "loss": 1.5261, "step": 1664 }, { "epoch": 0.5700834581639204, "grad_norm": 0.22787094116210938, "learning_rate": 2.148972602739726e-05, "loss": 1.6088, "step": 1665 }, { "epoch": 0.5704258506312861, "grad_norm": 0.22708512842655182, "learning_rate": 2.1472602739726027e-05, "loss": 1.6461, "step": 1666 }, { "epoch": 0.5707682430986518, "grad_norm": 0.268706351518631, "learning_rate": 2.1455479452054796e-05, "loss": 1.5333, "step": 1667 }, { "epoch": 0.5711106355660176, "grad_norm": 0.25398948788642883, "learning_rate": 2.1438356164383563e-05, "loss": 1.6275, "step": 1668 }, { "epoch": 0.5714530280333833, "grad_norm": 0.24876363575458527, "learning_rate": 2.142123287671233e-05, "loss": 1.5336, "step": 1669 }, { "epoch": 0.5717954205007489, "grad_norm": 0.24159543216228485, "learning_rate": 2.1404109589041096e-05, "loss": 1.6071, "step": 1670 }, { "epoch": 0.5721378129681147, "grad_norm": 0.2872893214225769, "learning_rate": 2.1386986301369863e-05, "loss": 1.6682, "step": 1671 }, { "epoch": 0.5724802054354804, "grad_norm": 0.2469603717327118, "learning_rate": 2.1369863013698632e-05, "loss": 1.4707, "step": 1672 }, { "epoch": 0.5728225979028462, "grad_norm": 0.8686286807060242, "learning_rate": 2.13527397260274e-05, "loss": 1.6436, "step": 1673 }, { "epoch": 0.5731649903702118, "grad_norm": 0.23092065751552582, "learning_rate": 2.1335616438356166e-05, "loss": 1.5557, "step": 1674 }, { "epoch": 0.5735073828375776, "grad_norm": 0.23327715694904327, "learning_rate": 2.1318493150684932e-05, "loss": 1.5348, "step": 1675 }, { "epoch": 0.5738497753049433, "grad_norm": 0.24124957621097565, "learning_rate": 2.1301369863013702e-05, "loss": 1.5999, "step": 1676 }, { "epoch": 0.574192167772309, "grad_norm": 0.24998849630355835, "learning_rate": 2.128424657534247e-05, "loss": 1.5324, "step": 1677 }, { "epoch": 0.5745345602396748, "grad_norm": 0.2444167584180832, "learning_rate": 2.1267123287671235e-05, "loss": 1.7003, "step": 1678 }, { "epoch": 0.5748769527070404, "grad_norm": 0.26832059025764465, "learning_rate": 2.125e-05, "loss": 1.6925, "step": 1679 }, { "epoch": 0.5752193451744062, "grad_norm": 0.2453073412179947, "learning_rate": 2.1232876712328768e-05, "loss": 1.6584, "step": 1680 }, { "epoch": 0.5755617376417719, "grad_norm": 0.24739161133766174, "learning_rate": 2.1215753424657535e-05, "loss": 1.6677, "step": 1681 }, { "epoch": 0.5759041301091375, "grad_norm": 0.22297917306423187, "learning_rate": 2.11986301369863e-05, "loss": 1.5112, "step": 1682 }, { "epoch": 0.5762465225765033, "grad_norm": 0.258510023355484, "learning_rate": 2.1181506849315068e-05, "loss": 1.5625, "step": 1683 }, { "epoch": 0.576588915043869, "grad_norm": 0.22591592371463776, "learning_rate": 2.1164383561643834e-05, "loss": 1.5523, "step": 1684 }, { "epoch": 0.5769313075112348, "grad_norm": 0.22260737419128418, "learning_rate": 2.1147260273972604e-05, "loss": 1.6311, "step": 1685 }, { "epoch": 0.5772736999786005, "grad_norm": 0.26006972789764404, "learning_rate": 2.113013698630137e-05, "loss": 1.6981, "step": 1686 }, { "epoch": 0.5776160924459662, "grad_norm": 0.2197074443101883, "learning_rate": 2.1113013698630137e-05, "loss": 1.6459, "step": 1687 }, { "epoch": 0.5779584849133319, "grad_norm": 0.21017788350582123, "learning_rate": 2.1095890410958904e-05, "loss": 1.6427, "step": 1688 }, { "epoch": 0.5783008773806976, "grad_norm": 0.24418772757053375, "learning_rate": 2.107876712328767e-05, "loss": 1.7428, "step": 1689 }, { "epoch": 0.5786432698480634, "grad_norm": 0.22280390560626984, "learning_rate": 2.106164383561644e-05, "loss": 1.6345, "step": 1690 }, { "epoch": 0.578985662315429, "grad_norm": 0.222660094499588, "learning_rate": 2.1044520547945207e-05, "loss": 1.5252, "step": 1691 }, { "epoch": 0.5793280547827948, "grad_norm": 0.2321695238351822, "learning_rate": 2.1027397260273973e-05, "loss": 1.6233, "step": 1692 }, { "epoch": 0.5796704472501605, "grad_norm": 0.22909484803676605, "learning_rate": 2.101027397260274e-05, "loss": 1.6, "step": 1693 }, { "epoch": 0.5800128397175263, "grad_norm": 0.2520410716533661, "learning_rate": 2.099315068493151e-05, "loss": 1.6014, "step": 1694 }, { "epoch": 0.5803552321848919, "grad_norm": 0.23817850649356842, "learning_rate": 2.0976027397260276e-05, "loss": 1.6092, "step": 1695 }, { "epoch": 0.5806976246522576, "grad_norm": 0.2280145287513733, "learning_rate": 2.0958904109589043e-05, "loss": 1.5247, "step": 1696 }, { "epoch": 0.5810400171196234, "grad_norm": 0.25138744711875916, "learning_rate": 2.094178082191781e-05, "loss": 1.5322, "step": 1697 }, { "epoch": 0.5813824095869891, "grad_norm": 0.23993705213069916, "learning_rate": 2.0924657534246576e-05, "loss": 1.5329, "step": 1698 }, { "epoch": 0.5817248020543548, "grad_norm": 0.2618987560272217, "learning_rate": 2.0907534246575346e-05, "loss": 1.5974, "step": 1699 }, { "epoch": 0.5820671945217205, "grad_norm": 0.2340169996023178, "learning_rate": 2.0890410958904112e-05, "loss": 1.6458, "step": 1700 }, { "epoch": 0.5824095869890863, "grad_norm": 0.2398090660572052, "learning_rate": 2.087328767123288e-05, "loss": 1.6567, "step": 1701 }, { "epoch": 0.582751979456452, "grad_norm": 0.23395468294620514, "learning_rate": 2.0856164383561645e-05, "loss": 1.5079, "step": 1702 }, { "epoch": 0.5830943719238176, "grad_norm": 0.21438083052635193, "learning_rate": 2.083904109589041e-05, "loss": 1.4835, "step": 1703 }, { "epoch": 0.5834367643911834, "grad_norm": 0.2321261465549469, "learning_rate": 2.0821917808219178e-05, "loss": 1.5569, "step": 1704 }, { "epoch": 0.5837791568585491, "grad_norm": 0.3524865508079529, "learning_rate": 2.0804794520547945e-05, "loss": 1.5997, "step": 1705 }, { "epoch": 0.5841215493259149, "grad_norm": 0.277817964553833, "learning_rate": 2.078767123287671e-05, "loss": 1.6421, "step": 1706 }, { "epoch": 0.5844639417932805, "grad_norm": 0.2354702353477478, "learning_rate": 2.0770547945205478e-05, "loss": 1.5413, "step": 1707 }, { "epoch": 0.5848063342606463, "grad_norm": 0.2252565622329712, "learning_rate": 2.0753424657534248e-05, "loss": 1.482, "step": 1708 }, { "epoch": 0.585148726728012, "grad_norm": 0.22761493921279907, "learning_rate": 2.0736301369863014e-05, "loss": 1.6198, "step": 1709 }, { "epoch": 0.5854911191953777, "grad_norm": 0.25308796763420105, "learning_rate": 2.071917808219178e-05, "loss": 1.6222, "step": 1710 }, { "epoch": 0.5858335116627434, "grad_norm": 0.24306835234165192, "learning_rate": 2.0702054794520547e-05, "loss": 1.5888, "step": 1711 }, { "epoch": 0.5861759041301091, "grad_norm": 0.23894590139389038, "learning_rate": 2.0684931506849317e-05, "loss": 1.5329, "step": 1712 }, { "epoch": 0.5865182965974749, "grad_norm": 0.22560924291610718, "learning_rate": 2.0667808219178084e-05, "loss": 1.557, "step": 1713 }, { "epoch": 0.5868606890648406, "grad_norm": 0.23642420768737793, "learning_rate": 2.065068493150685e-05, "loss": 1.6202, "step": 1714 }, { "epoch": 0.5872030815322062, "grad_norm": 0.21955035626888275, "learning_rate": 2.0633561643835617e-05, "loss": 1.6144, "step": 1715 }, { "epoch": 0.587545473999572, "grad_norm": 0.2359061986207962, "learning_rate": 2.0616438356164387e-05, "loss": 1.5832, "step": 1716 }, { "epoch": 0.5878878664669377, "grad_norm": 0.2260013073682785, "learning_rate": 2.0599315068493153e-05, "loss": 1.5829, "step": 1717 }, { "epoch": 0.5882302589343035, "grad_norm": 0.25096362829208374, "learning_rate": 2.058219178082192e-05, "loss": 1.5997, "step": 1718 }, { "epoch": 0.5885726514016691, "grad_norm": 0.27022090554237366, "learning_rate": 2.0565068493150686e-05, "loss": 1.4997, "step": 1719 }, { "epoch": 0.5889150438690349, "grad_norm": 0.23309415578842163, "learning_rate": 2.0547945205479453e-05, "loss": 1.5975, "step": 1720 }, { "epoch": 0.5892574363364006, "grad_norm": 0.2234116792678833, "learning_rate": 2.0530821917808223e-05, "loss": 1.6426, "step": 1721 }, { "epoch": 0.5895998288037663, "grad_norm": 0.22296760976314545, "learning_rate": 2.051369863013699e-05, "loss": 1.5451, "step": 1722 }, { "epoch": 0.589942221271132, "grad_norm": 0.25106799602508545, "learning_rate": 2.0496575342465756e-05, "loss": 1.7189, "step": 1723 }, { "epoch": 0.5902846137384977, "grad_norm": 0.24618655443191528, "learning_rate": 2.0479452054794522e-05, "loss": 1.5492, "step": 1724 }, { "epoch": 0.5906270062058635, "grad_norm": 0.26508647203445435, "learning_rate": 2.046232876712329e-05, "loss": 1.6561, "step": 1725 }, { "epoch": 0.5909693986732292, "grad_norm": 0.25562623143196106, "learning_rate": 2.0445205479452055e-05, "loss": 1.6262, "step": 1726 }, { "epoch": 0.591311791140595, "grad_norm": 0.23754583299160004, "learning_rate": 2.042808219178082e-05, "loss": 1.4742, "step": 1727 }, { "epoch": 0.5916541836079606, "grad_norm": 0.2608672082424164, "learning_rate": 2.0410958904109588e-05, "loss": 1.6641, "step": 1728 }, { "epoch": 0.5919965760753263, "grad_norm": 0.2530585825443268, "learning_rate": 2.0393835616438355e-05, "loss": 1.6753, "step": 1729 }, { "epoch": 0.5923389685426921, "grad_norm": 0.24460148811340332, "learning_rate": 2.0376712328767125e-05, "loss": 1.6719, "step": 1730 }, { "epoch": 0.5926813610100578, "grad_norm": 0.2734682261943817, "learning_rate": 2.035958904109589e-05, "loss": 1.5038, "step": 1731 }, { "epoch": 0.5930237534774235, "grad_norm": 0.24666248261928558, "learning_rate": 2.0342465753424658e-05, "loss": 1.6623, "step": 1732 }, { "epoch": 0.5933661459447892, "grad_norm": 0.2569672465324402, "learning_rate": 2.0325342465753424e-05, "loss": 1.6857, "step": 1733 }, { "epoch": 0.593708538412155, "grad_norm": 0.2768937647342682, "learning_rate": 2.0308219178082194e-05, "loss": 1.5812, "step": 1734 }, { "epoch": 0.5940509308795207, "grad_norm": 0.26139214634895325, "learning_rate": 2.029109589041096e-05, "loss": 1.6296, "step": 1735 }, { "epoch": 0.5943933233468863, "grad_norm": 0.23963962495326996, "learning_rate": 2.0273972602739727e-05, "loss": 1.5257, "step": 1736 }, { "epoch": 0.5947357158142521, "grad_norm": 0.2407197654247284, "learning_rate": 2.0256849315068494e-05, "loss": 1.6648, "step": 1737 }, { "epoch": 0.5950781082816178, "grad_norm": 0.2369339019060135, "learning_rate": 2.023972602739726e-05, "loss": 1.6312, "step": 1738 }, { "epoch": 0.5954205007489836, "grad_norm": 0.2492443472146988, "learning_rate": 2.022260273972603e-05, "loss": 1.6533, "step": 1739 }, { "epoch": 0.5957628932163492, "grad_norm": 0.23267079889774323, "learning_rate": 2.0205479452054797e-05, "loss": 1.5349, "step": 1740 }, { "epoch": 0.5961052856837149, "grad_norm": 0.23505039513111115, "learning_rate": 2.0188356164383563e-05, "loss": 1.5612, "step": 1741 }, { "epoch": 0.5964476781510807, "grad_norm": 0.27613335847854614, "learning_rate": 2.017123287671233e-05, "loss": 1.5859, "step": 1742 }, { "epoch": 0.5967900706184464, "grad_norm": 0.2289562076330185, "learning_rate": 2.01541095890411e-05, "loss": 1.5542, "step": 1743 }, { "epoch": 0.5971324630858121, "grad_norm": 0.25808125734329224, "learning_rate": 2.0136986301369866e-05, "loss": 1.5304, "step": 1744 }, { "epoch": 0.5974748555531778, "grad_norm": 0.2184978574514389, "learning_rate": 2.0119863013698633e-05, "loss": 1.5382, "step": 1745 }, { "epoch": 0.5978172480205436, "grad_norm": 0.23583436012268066, "learning_rate": 2.01027397260274e-05, "loss": 1.5507, "step": 1746 }, { "epoch": 0.5981596404879093, "grad_norm": 0.24803462624549866, "learning_rate": 2.0085616438356166e-05, "loss": 1.6006, "step": 1747 }, { "epoch": 0.5985020329552749, "grad_norm": 0.24200530350208282, "learning_rate": 2.0068493150684932e-05, "loss": 1.5516, "step": 1748 }, { "epoch": 0.5988444254226407, "grad_norm": 0.2235562950372696, "learning_rate": 2.00513698630137e-05, "loss": 1.6294, "step": 1749 }, { "epoch": 0.5991868178900064, "grad_norm": 0.25530868768692017, "learning_rate": 2.0034246575342465e-05, "loss": 1.6087, "step": 1750 }, { "epoch": 0.5995292103573722, "grad_norm": 0.23145397007465363, "learning_rate": 2.0017123287671232e-05, "loss": 1.6526, "step": 1751 }, { "epoch": 0.5998716028247378, "grad_norm": 0.23851977288722992, "learning_rate": 2e-05, "loss": 1.5916, "step": 1752 }, { "epoch": 0.6002139952921036, "grad_norm": 0.2510264813899994, "learning_rate": 1.9982876712328768e-05, "loss": 1.6222, "step": 1753 }, { "epoch": 0.6005563877594693, "grad_norm": 0.23706255853176117, "learning_rate": 1.9965753424657535e-05, "loss": 1.6308, "step": 1754 }, { "epoch": 0.600898780226835, "grad_norm": 0.22687005996704102, "learning_rate": 1.99486301369863e-05, "loss": 1.5797, "step": 1755 }, { "epoch": 0.6012411726942007, "grad_norm": 0.2359609752893448, "learning_rate": 1.9931506849315068e-05, "loss": 1.5721, "step": 1756 }, { "epoch": 0.6015835651615664, "grad_norm": 0.259606271982193, "learning_rate": 1.9914383561643838e-05, "loss": 1.5119, "step": 1757 }, { "epoch": 0.6019259576289322, "grad_norm": 0.2623370587825775, "learning_rate": 1.9897260273972604e-05, "loss": 1.6639, "step": 1758 }, { "epoch": 0.6022683500962979, "grad_norm": 0.2837326228618622, "learning_rate": 1.988013698630137e-05, "loss": 1.5955, "step": 1759 }, { "epoch": 0.6026107425636636, "grad_norm": 0.23047137260437012, "learning_rate": 1.9863013698630137e-05, "loss": 1.5249, "step": 1760 }, { "epoch": 0.6029531350310293, "grad_norm": 0.24675370752811432, "learning_rate": 1.9845890410958907e-05, "loss": 1.4909, "step": 1761 }, { "epoch": 0.603295527498395, "grad_norm": 0.25511303544044495, "learning_rate": 1.9828767123287674e-05, "loss": 1.551, "step": 1762 }, { "epoch": 0.6036379199657608, "grad_norm": 0.24139998853206635, "learning_rate": 1.981164383561644e-05, "loss": 1.6687, "step": 1763 }, { "epoch": 0.6039803124331264, "grad_norm": 0.23259371519088745, "learning_rate": 1.9794520547945207e-05, "loss": 1.5247, "step": 1764 }, { "epoch": 0.6043227049004922, "grad_norm": 0.2193070948123932, "learning_rate": 1.9777397260273973e-05, "loss": 1.4931, "step": 1765 }, { "epoch": 0.6046650973678579, "grad_norm": 0.23372416198253632, "learning_rate": 1.976027397260274e-05, "loss": 1.6964, "step": 1766 }, { "epoch": 0.6050074898352237, "grad_norm": 0.30688926577568054, "learning_rate": 1.9743150684931506e-05, "loss": 1.4548, "step": 1767 }, { "epoch": 0.6053498823025893, "grad_norm": 0.24762889742851257, "learning_rate": 1.9726027397260273e-05, "loss": 1.6176, "step": 1768 }, { "epoch": 0.605692274769955, "grad_norm": 0.2475319504737854, "learning_rate": 1.970890410958904e-05, "loss": 1.6454, "step": 1769 }, { "epoch": 0.6060346672373208, "grad_norm": 0.2472064197063446, "learning_rate": 1.969178082191781e-05, "loss": 1.6314, "step": 1770 }, { "epoch": 0.6063770597046865, "grad_norm": 0.23988324403762817, "learning_rate": 1.9674657534246576e-05, "loss": 1.5451, "step": 1771 }, { "epoch": 0.6067194521720523, "grad_norm": 0.24826005101203918, "learning_rate": 1.9657534246575342e-05, "loss": 1.5355, "step": 1772 }, { "epoch": 0.6070618446394179, "grad_norm": 0.25442853569984436, "learning_rate": 1.964041095890411e-05, "loss": 1.6182, "step": 1773 }, { "epoch": 0.6074042371067836, "grad_norm": 0.2395102083683014, "learning_rate": 1.962328767123288e-05, "loss": 1.6203, "step": 1774 }, { "epoch": 0.6077466295741494, "grad_norm": 0.24093173444271088, "learning_rate": 1.9606164383561645e-05, "loss": 1.4512, "step": 1775 }, { "epoch": 0.608089022041515, "grad_norm": 0.26883599162101746, "learning_rate": 1.9589041095890412e-05, "loss": 1.6656, "step": 1776 }, { "epoch": 0.6084314145088808, "grad_norm": 0.23213540017604828, "learning_rate": 1.9571917808219178e-05, "loss": 1.7223, "step": 1777 }, { "epoch": 0.6087738069762465, "grad_norm": 0.24096937477588654, "learning_rate": 1.9554794520547945e-05, "loss": 1.5437, "step": 1778 }, { "epoch": 0.6091161994436123, "grad_norm": 0.25816965103149414, "learning_rate": 1.9537671232876715e-05, "loss": 1.73, "step": 1779 }, { "epoch": 0.609458591910978, "grad_norm": 0.2556367516517639, "learning_rate": 1.952054794520548e-05, "loss": 1.7045, "step": 1780 }, { "epoch": 0.6098009843783436, "grad_norm": 0.2755521237850189, "learning_rate": 1.9503424657534248e-05, "loss": 1.5767, "step": 1781 }, { "epoch": 0.6101433768457094, "grad_norm": 0.2457827627658844, "learning_rate": 1.9486301369863014e-05, "loss": 1.5396, "step": 1782 }, { "epoch": 0.6104857693130751, "grad_norm": 0.23500844836235046, "learning_rate": 1.9469178082191784e-05, "loss": 1.6036, "step": 1783 }, { "epoch": 0.6108281617804409, "grad_norm": 0.24470128118991852, "learning_rate": 1.945205479452055e-05, "loss": 1.6169, "step": 1784 }, { "epoch": 0.6111705542478065, "grad_norm": 0.2550033926963806, "learning_rate": 1.9434931506849317e-05, "loss": 1.652, "step": 1785 }, { "epoch": 0.6115129467151723, "grad_norm": 0.26433706283569336, "learning_rate": 1.9417808219178084e-05, "loss": 1.5586, "step": 1786 }, { "epoch": 0.611855339182538, "grad_norm": 0.25471803545951843, "learning_rate": 1.940068493150685e-05, "loss": 1.4228, "step": 1787 }, { "epoch": 0.6121977316499037, "grad_norm": 0.23418140411376953, "learning_rate": 1.9383561643835617e-05, "loss": 1.5236, "step": 1788 }, { "epoch": 0.6125401241172694, "grad_norm": 0.2467116415500641, "learning_rate": 1.9366438356164383e-05, "loss": 1.6175, "step": 1789 }, { "epoch": 0.6128825165846351, "grad_norm": 0.25441110134124756, "learning_rate": 1.934931506849315e-05, "loss": 1.5781, "step": 1790 }, { "epoch": 0.6132249090520009, "grad_norm": 0.24490486085414886, "learning_rate": 1.9332191780821916e-05, "loss": 1.4628, "step": 1791 }, { "epoch": 0.6135673015193666, "grad_norm": 0.29244402050971985, "learning_rate": 1.9315068493150686e-05, "loss": 1.5155, "step": 1792 }, { "epoch": 0.6139096939867323, "grad_norm": 0.28036418557167053, "learning_rate": 1.9297945205479453e-05, "loss": 1.6028, "step": 1793 }, { "epoch": 0.614252086454098, "grad_norm": 0.2469927966594696, "learning_rate": 1.928082191780822e-05, "loss": 1.6074, "step": 1794 }, { "epoch": 0.6145944789214637, "grad_norm": 0.2774472236633301, "learning_rate": 1.9263698630136986e-05, "loss": 1.6861, "step": 1795 }, { "epoch": 0.6149368713888295, "grad_norm": 0.23863071203231812, "learning_rate": 1.9246575342465752e-05, "loss": 1.4958, "step": 1796 }, { "epoch": 0.6152792638561951, "grad_norm": 0.23517407476902008, "learning_rate": 1.9229452054794522e-05, "loss": 1.6255, "step": 1797 }, { "epoch": 0.6156216563235609, "grad_norm": 0.26413553953170776, "learning_rate": 1.921232876712329e-05, "loss": 1.514, "step": 1798 }, { "epoch": 0.6159640487909266, "grad_norm": 0.25762268900871277, "learning_rate": 1.9195205479452055e-05, "loss": 1.5556, "step": 1799 }, { "epoch": 0.6163064412582924, "grad_norm": 0.25708678364753723, "learning_rate": 1.9178082191780822e-05, "loss": 1.6231, "step": 1800 }, { "epoch": 0.616648833725658, "grad_norm": 0.22862407565116882, "learning_rate": 1.916095890410959e-05, "loss": 1.6102, "step": 1801 }, { "epoch": 0.6169912261930237, "grad_norm": 0.25049862265586853, "learning_rate": 1.9143835616438358e-05, "loss": 1.5546, "step": 1802 }, { "epoch": 0.6173336186603895, "grad_norm": 0.2401273548603058, "learning_rate": 1.9126712328767125e-05, "loss": 1.577, "step": 1803 }, { "epoch": 0.6176760111277552, "grad_norm": 0.24142712354660034, "learning_rate": 1.910958904109589e-05, "loss": 1.4948, "step": 1804 }, { "epoch": 0.618018403595121, "grad_norm": 0.2279348224401474, "learning_rate": 1.909246575342466e-05, "loss": 1.5302, "step": 1805 }, { "epoch": 0.6183607960624866, "grad_norm": 0.25958117842674255, "learning_rate": 1.9075342465753428e-05, "loss": 1.5754, "step": 1806 }, { "epoch": 0.6187031885298523, "grad_norm": 0.2508936822414398, "learning_rate": 1.9058219178082194e-05, "loss": 1.6244, "step": 1807 }, { "epoch": 0.6190455809972181, "grad_norm": 0.22891496121883392, "learning_rate": 1.904109589041096e-05, "loss": 1.5948, "step": 1808 }, { "epoch": 0.6193879734645837, "grad_norm": 0.23719754815101624, "learning_rate": 1.9023972602739727e-05, "loss": 1.5844, "step": 1809 }, { "epoch": 0.6197303659319495, "grad_norm": 0.24350200593471527, "learning_rate": 1.9006849315068494e-05, "loss": 1.6279, "step": 1810 }, { "epoch": 0.6200727583993152, "grad_norm": 0.2544047236442566, "learning_rate": 1.898972602739726e-05, "loss": 1.5044, "step": 1811 }, { "epoch": 0.620415150866681, "grad_norm": 0.2619890868663788, "learning_rate": 1.8972602739726027e-05, "loss": 1.583, "step": 1812 }, { "epoch": 0.6207575433340466, "grad_norm": 0.24453817307949066, "learning_rate": 1.8955479452054793e-05, "loss": 1.5336, "step": 1813 }, { "epoch": 0.6210999358014123, "grad_norm": 0.227174773812294, "learning_rate": 1.8938356164383563e-05, "loss": 1.6447, "step": 1814 }, { "epoch": 0.6214423282687781, "grad_norm": 0.2645910084247589, "learning_rate": 1.892123287671233e-05, "loss": 1.7547, "step": 1815 }, { "epoch": 0.6217847207361438, "grad_norm": 0.2300589233636856, "learning_rate": 1.8904109589041096e-05, "loss": 1.5951, "step": 1816 }, { "epoch": 0.6221271132035096, "grad_norm": 0.24449603259563446, "learning_rate": 1.8886986301369863e-05, "loss": 1.5657, "step": 1817 }, { "epoch": 0.6224695056708752, "grad_norm": 0.24415864050388336, "learning_rate": 1.886986301369863e-05, "loss": 1.5172, "step": 1818 }, { "epoch": 0.622811898138241, "grad_norm": 0.2661161422729492, "learning_rate": 1.88527397260274e-05, "loss": 1.5076, "step": 1819 }, { "epoch": 0.6231542906056067, "grad_norm": 0.2215854376554489, "learning_rate": 1.8835616438356166e-05, "loss": 1.5557, "step": 1820 }, { "epoch": 0.6234966830729723, "grad_norm": 0.2614944279193878, "learning_rate": 1.8818493150684932e-05, "loss": 1.6744, "step": 1821 }, { "epoch": 0.6238390755403381, "grad_norm": 0.24360568821430206, "learning_rate": 1.88013698630137e-05, "loss": 1.6602, "step": 1822 }, { "epoch": 0.6241814680077038, "grad_norm": 0.2501567006111145, "learning_rate": 1.878424657534247e-05, "loss": 1.6518, "step": 1823 }, { "epoch": 0.6245238604750696, "grad_norm": 0.2676141560077667, "learning_rate": 1.8767123287671235e-05, "loss": 1.7147, "step": 1824 }, { "epoch": 0.6248662529424353, "grad_norm": 0.24765713512897491, "learning_rate": 1.8750000000000002e-05, "loss": 1.6256, "step": 1825 }, { "epoch": 0.625208645409801, "grad_norm": 0.24663594365119934, "learning_rate": 1.8732876712328768e-05, "loss": 1.6215, "step": 1826 }, { "epoch": 0.6255510378771667, "grad_norm": 0.23229555785655975, "learning_rate": 1.8715753424657535e-05, "loss": 1.6195, "step": 1827 }, { "epoch": 0.6258934303445324, "grad_norm": 0.27832478284835815, "learning_rate": 1.8698630136986305e-05, "loss": 1.6916, "step": 1828 }, { "epoch": 0.6262358228118982, "grad_norm": 0.3276243805885315, "learning_rate": 1.868150684931507e-05, "loss": 1.6215, "step": 1829 }, { "epoch": 0.6265782152792638, "grad_norm": 0.2449808269739151, "learning_rate": 1.8664383561643838e-05, "loss": 1.5417, "step": 1830 }, { "epoch": 0.6269206077466296, "grad_norm": 0.338556706905365, "learning_rate": 1.8647260273972604e-05, "loss": 1.4602, "step": 1831 }, { "epoch": 0.6272630002139953, "grad_norm": 0.25457945466041565, "learning_rate": 1.863013698630137e-05, "loss": 1.6445, "step": 1832 }, { "epoch": 0.6276053926813611, "grad_norm": 0.33178332448005676, "learning_rate": 1.8613013698630137e-05, "loss": 1.5883, "step": 1833 }, { "epoch": 0.6279477851487267, "grad_norm": 0.25278162956237793, "learning_rate": 1.8595890410958904e-05, "loss": 1.6113, "step": 1834 }, { "epoch": 0.6282901776160924, "grad_norm": 0.24425983428955078, "learning_rate": 1.857876712328767e-05, "loss": 1.5655, "step": 1835 }, { "epoch": 0.6286325700834582, "grad_norm": 0.23514185845851898, "learning_rate": 1.8561643835616437e-05, "loss": 1.6777, "step": 1836 }, { "epoch": 0.6289749625508239, "grad_norm": 0.2404947578907013, "learning_rate": 1.8544520547945207e-05, "loss": 1.5688, "step": 1837 }, { "epoch": 0.6293173550181896, "grad_norm": 0.23555117845535278, "learning_rate": 1.8527397260273973e-05, "loss": 1.4494, "step": 1838 }, { "epoch": 0.6296597474855553, "grad_norm": 0.3313317894935608, "learning_rate": 1.851027397260274e-05, "loss": 1.4637, "step": 1839 }, { "epoch": 0.630002139952921, "grad_norm": 0.23730339109897614, "learning_rate": 1.8493150684931506e-05, "loss": 1.6111, "step": 1840 }, { "epoch": 0.6303445324202868, "grad_norm": 0.23436102271080017, "learning_rate": 1.8476027397260276e-05, "loss": 1.5087, "step": 1841 }, { "epoch": 0.6306869248876524, "grad_norm": 0.25312164425849915, "learning_rate": 1.8458904109589043e-05, "loss": 1.5322, "step": 1842 }, { "epoch": 0.6310293173550182, "grad_norm": 0.26394742727279663, "learning_rate": 1.844178082191781e-05, "loss": 1.6691, "step": 1843 }, { "epoch": 0.6313717098223839, "grad_norm": 0.2387174367904663, "learning_rate": 1.8424657534246576e-05, "loss": 1.498, "step": 1844 }, { "epoch": 0.6317141022897497, "grad_norm": 0.24631977081298828, "learning_rate": 1.8407534246575342e-05, "loss": 1.6638, "step": 1845 }, { "epoch": 0.6320564947571153, "grad_norm": 0.2523033022880554, "learning_rate": 1.8390410958904112e-05, "loss": 1.5915, "step": 1846 }, { "epoch": 0.632398887224481, "grad_norm": 0.27156510949134827, "learning_rate": 1.837328767123288e-05, "loss": 1.6182, "step": 1847 }, { "epoch": 0.6327412796918468, "grad_norm": 0.2508176267147064, "learning_rate": 1.8356164383561645e-05, "loss": 1.5575, "step": 1848 }, { "epoch": 0.6330836721592125, "grad_norm": 0.24990050494670868, "learning_rate": 1.8339041095890412e-05, "loss": 1.6646, "step": 1849 }, { "epoch": 0.6334260646265782, "grad_norm": 0.2333010733127594, "learning_rate": 1.832191780821918e-05, "loss": 1.6364, "step": 1850 }, { "epoch": 0.6337684570939439, "grad_norm": 0.2566620111465454, "learning_rate": 1.8304794520547945e-05, "loss": 1.6354, "step": 1851 }, { "epoch": 0.6341108495613097, "grad_norm": 0.24733656644821167, "learning_rate": 1.828767123287671e-05, "loss": 1.5862, "step": 1852 }, { "epoch": 0.6344532420286754, "grad_norm": 0.2588883936405182, "learning_rate": 1.8270547945205478e-05, "loss": 1.575, "step": 1853 }, { "epoch": 0.634795634496041, "grad_norm": 0.23334038257598877, "learning_rate": 1.8253424657534248e-05, "loss": 1.6618, "step": 1854 }, { "epoch": 0.6351380269634068, "grad_norm": 0.23798468708992004, "learning_rate": 1.8236301369863014e-05, "loss": 1.5736, "step": 1855 }, { "epoch": 0.6354804194307725, "grad_norm": 0.2615557014942169, "learning_rate": 1.821917808219178e-05, "loss": 1.6008, "step": 1856 }, { "epoch": 0.6358228118981383, "grad_norm": 0.2424488216638565, "learning_rate": 1.8202054794520547e-05, "loss": 1.586, "step": 1857 }, { "epoch": 0.636165204365504, "grad_norm": 0.23130536079406738, "learning_rate": 1.8184931506849314e-05, "loss": 1.6127, "step": 1858 }, { "epoch": 0.6365075968328697, "grad_norm": 0.2706599235534668, "learning_rate": 1.8167808219178084e-05, "loss": 1.6125, "step": 1859 }, { "epoch": 0.6368499893002354, "grad_norm": 0.23830413818359375, "learning_rate": 1.815068493150685e-05, "loss": 1.6427, "step": 1860 }, { "epoch": 0.6371923817676011, "grad_norm": 0.23636266589164734, "learning_rate": 1.8133561643835617e-05, "loss": 1.3739, "step": 1861 }, { "epoch": 0.6375347742349669, "grad_norm": 0.24160435795783997, "learning_rate": 1.8116438356164383e-05, "loss": 1.6455, "step": 1862 }, { "epoch": 0.6378771667023325, "grad_norm": 0.25744178891181946, "learning_rate": 1.8099315068493153e-05, "loss": 1.5373, "step": 1863 }, { "epoch": 0.6382195591696983, "grad_norm": 0.2607007324695587, "learning_rate": 1.808219178082192e-05, "loss": 1.4743, "step": 1864 }, { "epoch": 0.638561951637064, "grad_norm": 0.24841882288455963, "learning_rate": 1.8065068493150686e-05, "loss": 1.662, "step": 1865 }, { "epoch": 0.6389043441044298, "grad_norm": 0.23941926658153534, "learning_rate": 1.8047945205479453e-05, "loss": 1.7065, "step": 1866 }, { "epoch": 0.6392467365717954, "grad_norm": 0.2557290196418762, "learning_rate": 1.803082191780822e-05, "loss": 1.5791, "step": 1867 }, { "epoch": 0.6395891290391611, "grad_norm": 0.25852131843566895, "learning_rate": 1.801369863013699e-05, "loss": 1.6526, "step": 1868 }, { "epoch": 0.6399315215065269, "grad_norm": 0.24280427396297455, "learning_rate": 1.7996575342465756e-05, "loss": 1.4328, "step": 1869 }, { "epoch": 0.6402739139738926, "grad_norm": 0.24237516522407532, "learning_rate": 1.7979452054794522e-05, "loss": 1.6439, "step": 1870 }, { "epoch": 0.6406163064412583, "grad_norm": 0.2672884166240692, "learning_rate": 1.796232876712329e-05, "loss": 1.6271, "step": 1871 }, { "epoch": 0.640958698908624, "grad_norm": 0.2611774802207947, "learning_rate": 1.7945205479452055e-05, "loss": 1.639, "step": 1872 }, { "epoch": 0.6413010913759897, "grad_norm": 0.25508472323417664, "learning_rate": 1.7928082191780822e-05, "loss": 1.7072, "step": 1873 }, { "epoch": 0.6416434838433555, "grad_norm": 0.25738662481307983, "learning_rate": 1.791095890410959e-05, "loss": 1.5398, "step": 1874 }, { "epoch": 0.6419858763107211, "grad_norm": 0.24722829461097717, "learning_rate": 1.7893835616438355e-05, "loss": 1.6295, "step": 1875 }, { "epoch": 0.6423282687780869, "grad_norm": 0.24692682921886444, "learning_rate": 1.787671232876712e-05, "loss": 1.4316, "step": 1876 }, { "epoch": 0.6426706612454526, "grad_norm": 0.2620501220226288, "learning_rate": 1.785958904109589e-05, "loss": 1.5667, "step": 1877 }, { "epoch": 0.6430130537128184, "grad_norm": 0.22550533711910248, "learning_rate": 1.7842465753424658e-05, "loss": 1.5308, "step": 1878 }, { "epoch": 0.643355446180184, "grad_norm": 0.258945494890213, "learning_rate": 1.7825342465753424e-05, "loss": 1.6258, "step": 1879 }, { "epoch": 0.6436978386475497, "grad_norm": 0.22864022850990295, "learning_rate": 1.780821917808219e-05, "loss": 1.583, "step": 1880 }, { "epoch": 0.6440402311149155, "grad_norm": 0.2306743860244751, "learning_rate": 1.779109589041096e-05, "loss": 1.539, "step": 1881 }, { "epoch": 0.6443826235822812, "grad_norm": 0.2635502219200134, "learning_rate": 1.7773972602739727e-05, "loss": 1.5455, "step": 1882 }, { "epoch": 0.6447250160496469, "grad_norm": 0.25164517760276794, "learning_rate": 1.7756849315068494e-05, "loss": 1.708, "step": 1883 }, { "epoch": 0.6450674085170126, "grad_norm": 0.2593313455581665, "learning_rate": 1.773972602739726e-05, "loss": 1.5809, "step": 1884 }, { "epoch": 0.6454098009843784, "grad_norm": 0.2642495334148407, "learning_rate": 1.7722602739726027e-05, "loss": 1.4997, "step": 1885 }, { "epoch": 0.6457521934517441, "grad_norm": 0.26354023814201355, "learning_rate": 1.7705479452054797e-05, "loss": 1.5696, "step": 1886 }, { "epoch": 0.6460945859191097, "grad_norm": 0.2605699598789215, "learning_rate": 1.7688356164383563e-05, "loss": 1.6455, "step": 1887 }, { "epoch": 0.6464369783864755, "grad_norm": 0.23437032103538513, "learning_rate": 1.767123287671233e-05, "loss": 1.5305, "step": 1888 }, { "epoch": 0.6467793708538412, "grad_norm": 0.24492321908473969, "learning_rate": 1.7654109589041096e-05, "loss": 1.5578, "step": 1889 }, { "epoch": 0.647121763321207, "grad_norm": 0.27594202756881714, "learning_rate": 1.7636986301369866e-05, "loss": 1.608, "step": 1890 }, { "epoch": 0.6474641557885726, "grad_norm": 0.2696996033191681, "learning_rate": 1.7619863013698633e-05, "loss": 1.6594, "step": 1891 }, { "epoch": 0.6478065482559384, "grad_norm": 0.2768424451351166, "learning_rate": 1.76027397260274e-05, "loss": 1.7118, "step": 1892 }, { "epoch": 0.6481489407233041, "grad_norm": 0.23507168889045715, "learning_rate": 1.7585616438356166e-05, "loss": 1.5715, "step": 1893 }, { "epoch": 0.6484913331906698, "grad_norm": 0.25488054752349854, "learning_rate": 1.7568493150684932e-05, "loss": 1.5555, "step": 1894 }, { "epoch": 0.6488337256580355, "grad_norm": 0.24027037620544434, "learning_rate": 1.75513698630137e-05, "loss": 1.4898, "step": 1895 }, { "epoch": 0.6491761181254012, "grad_norm": 0.2532329559326172, "learning_rate": 1.7534246575342465e-05, "loss": 1.4748, "step": 1896 }, { "epoch": 0.649518510592767, "grad_norm": 0.2620449960231781, "learning_rate": 1.7517123287671232e-05, "loss": 1.6161, "step": 1897 }, { "epoch": 0.6498609030601327, "grad_norm": 0.2625490427017212, "learning_rate": 1.75e-05, "loss": 1.6227, "step": 1898 }, { "epoch": 0.6502032955274984, "grad_norm": 0.26338303089141846, "learning_rate": 1.748287671232877e-05, "loss": 1.6084, "step": 1899 }, { "epoch": 0.6505456879948641, "grad_norm": 0.2349512279033661, "learning_rate": 1.7465753424657535e-05, "loss": 1.6368, "step": 1900 }, { "epoch": 0.6508880804622298, "grad_norm": 0.23242846131324768, "learning_rate": 1.74486301369863e-05, "loss": 1.6762, "step": 1901 }, { "epoch": 0.6512304729295956, "grad_norm": 0.28482428193092346, "learning_rate": 1.7431506849315068e-05, "loss": 1.5986, "step": 1902 }, { "epoch": 0.6515728653969612, "grad_norm": 0.23620587587356567, "learning_rate": 1.7414383561643838e-05, "loss": 1.6052, "step": 1903 }, { "epoch": 0.651915257864327, "grad_norm": 0.2538192570209503, "learning_rate": 1.7397260273972604e-05, "loss": 1.6045, "step": 1904 }, { "epoch": 0.6522576503316927, "grad_norm": 0.23239773511886597, "learning_rate": 1.738013698630137e-05, "loss": 1.5602, "step": 1905 }, { "epoch": 0.6526000427990584, "grad_norm": 0.2547262907028198, "learning_rate": 1.7363013698630137e-05, "loss": 1.5719, "step": 1906 }, { "epoch": 0.6529424352664241, "grad_norm": 0.23474082350730896, "learning_rate": 1.7345890410958904e-05, "loss": 1.5844, "step": 1907 }, { "epoch": 0.6532848277337898, "grad_norm": 0.24281960725784302, "learning_rate": 1.7328767123287674e-05, "loss": 1.6037, "step": 1908 }, { "epoch": 0.6536272202011556, "grad_norm": 0.2602969706058502, "learning_rate": 1.731164383561644e-05, "loss": 1.6956, "step": 1909 }, { "epoch": 0.6539696126685213, "grad_norm": 0.24691325426101685, "learning_rate": 1.7294520547945207e-05, "loss": 1.5911, "step": 1910 }, { "epoch": 0.654312005135887, "grad_norm": 0.2595890760421753, "learning_rate": 1.7277397260273973e-05, "loss": 1.4687, "step": 1911 }, { "epoch": 0.6546543976032527, "grad_norm": 0.2672761082649231, "learning_rate": 1.7260273972602743e-05, "loss": 1.5603, "step": 1912 }, { "epoch": 0.6549967900706184, "grad_norm": 0.2920830547809601, "learning_rate": 1.724315068493151e-05, "loss": 1.4866, "step": 1913 }, { "epoch": 0.6553391825379842, "grad_norm": 0.25716572999954224, "learning_rate": 1.7226027397260276e-05, "loss": 1.6169, "step": 1914 }, { "epoch": 0.6556815750053498, "grad_norm": 0.2605960965156555, "learning_rate": 1.7208904109589043e-05, "loss": 1.6431, "step": 1915 }, { "epoch": 0.6560239674727156, "grad_norm": 0.2719070017337799, "learning_rate": 1.719178082191781e-05, "loss": 1.6668, "step": 1916 }, { "epoch": 0.6563663599400813, "grad_norm": 0.2657925486564636, "learning_rate": 1.7174657534246576e-05, "loss": 1.5589, "step": 1917 }, { "epoch": 0.6567087524074471, "grad_norm": 0.25069594383239746, "learning_rate": 1.7157534246575342e-05, "loss": 1.5968, "step": 1918 }, { "epoch": 0.6570511448748128, "grad_norm": 0.2468976229429245, "learning_rate": 1.714041095890411e-05, "loss": 1.6912, "step": 1919 }, { "epoch": 0.6573935373421784, "grad_norm": 0.2506813704967499, "learning_rate": 1.7123287671232875e-05, "loss": 1.5995, "step": 1920 }, { "epoch": 0.6577359298095442, "grad_norm": 0.2524530291557312, "learning_rate": 1.7106164383561645e-05, "loss": 1.6207, "step": 1921 }, { "epoch": 0.6580783222769099, "grad_norm": 0.25653013586997986, "learning_rate": 1.7089041095890412e-05, "loss": 1.535, "step": 1922 }, { "epoch": 0.6584207147442757, "grad_norm": 0.26236391067504883, "learning_rate": 1.707191780821918e-05, "loss": 1.6549, "step": 1923 }, { "epoch": 0.6587631072116413, "grad_norm": 0.2669965922832489, "learning_rate": 1.7054794520547945e-05, "loss": 1.4626, "step": 1924 }, { "epoch": 0.6591054996790071, "grad_norm": 0.2524738013744354, "learning_rate": 1.703767123287671e-05, "loss": 1.6026, "step": 1925 }, { "epoch": 0.6594478921463728, "grad_norm": 0.23834079504013062, "learning_rate": 1.702054794520548e-05, "loss": 1.5345, "step": 1926 }, { "epoch": 0.6597902846137385, "grad_norm": 0.2605563700199127, "learning_rate": 1.7003424657534248e-05, "loss": 1.5412, "step": 1927 }, { "epoch": 0.6601326770811042, "grad_norm": 0.2550643980503082, "learning_rate": 1.6986301369863014e-05, "loss": 1.5812, "step": 1928 }, { "epoch": 0.6604750695484699, "grad_norm": 0.26429975032806396, "learning_rate": 1.696917808219178e-05, "loss": 1.5459, "step": 1929 }, { "epoch": 0.6608174620158357, "grad_norm": 0.23836760222911835, "learning_rate": 1.695205479452055e-05, "loss": 1.598, "step": 1930 }, { "epoch": 0.6611598544832014, "grad_norm": 0.2388952672481537, "learning_rate": 1.6934931506849317e-05, "loss": 1.4943, "step": 1931 }, { "epoch": 0.6615022469505671, "grad_norm": 0.2680096924304962, "learning_rate": 1.6917808219178084e-05, "loss": 1.5801, "step": 1932 }, { "epoch": 0.6618446394179328, "grad_norm": 0.27077823877334595, "learning_rate": 1.690068493150685e-05, "loss": 1.5405, "step": 1933 }, { "epoch": 0.6621870318852985, "grad_norm": 0.28568515181541443, "learning_rate": 1.6883561643835617e-05, "loss": 1.7687, "step": 1934 }, { "epoch": 0.6625294243526643, "grad_norm": 0.2331797331571579, "learning_rate": 1.6866438356164383e-05, "loss": 1.5666, "step": 1935 }, { "epoch": 0.6628718168200299, "grad_norm": 0.22544339299201965, "learning_rate": 1.684931506849315e-05, "loss": 1.5409, "step": 1936 }, { "epoch": 0.6632142092873957, "grad_norm": 0.2569091320037842, "learning_rate": 1.6832191780821917e-05, "loss": 1.5885, "step": 1937 }, { "epoch": 0.6635566017547614, "grad_norm": 0.2561245858669281, "learning_rate": 1.6815068493150683e-05, "loss": 1.606, "step": 1938 }, { "epoch": 0.6638989942221271, "grad_norm": 0.231187105178833, "learning_rate": 1.6797945205479453e-05, "loss": 1.5971, "step": 1939 }, { "epoch": 0.6642413866894928, "grad_norm": 0.24469514191150665, "learning_rate": 1.678082191780822e-05, "loss": 1.6323, "step": 1940 }, { "epoch": 0.6645837791568585, "grad_norm": 0.26917967200279236, "learning_rate": 1.6763698630136986e-05, "loss": 1.7196, "step": 1941 }, { "epoch": 0.6649261716242243, "grad_norm": 0.25939440727233887, "learning_rate": 1.6746575342465753e-05, "loss": 1.5463, "step": 1942 }, { "epoch": 0.66526856409159, "grad_norm": 0.2733328342437744, "learning_rate": 1.6729452054794522e-05, "loss": 1.6645, "step": 1943 }, { "epoch": 0.6656109565589557, "grad_norm": 0.24491265416145325, "learning_rate": 1.671232876712329e-05, "loss": 1.5619, "step": 1944 }, { "epoch": 0.6659533490263214, "grad_norm": 0.26098203659057617, "learning_rate": 1.6695205479452055e-05, "loss": 1.5799, "step": 1945 }, { "epoch": 0.6662957414936871, "grad_norm": 0.25281116366386414, "learning_rate": 1.6678082191780822e-05, "loss": 1.4976, "step": 1946 }, { "epoch": 0.6666381339610529, "grad_norm": 0.23532018065452576, "learning_rate": 1.666095890410959e-05, "loss": 1.4994, "step": 1947 }, { "epoch": 0.6669805264284185, "grad_norm": 0.25669774413108826, "learning_rate": 1.664383561643836e-05, "loss": 1.5741, "step": 1948 }, { "epoch": 0.6673229188957843, "grad_norm": 0.2724986970424652, "learning_rate": 1.6626712328767125e-05, "loss": 1.5165, "step": 1949 }, { "epoch": 0.66766531136315, "grad_norm": 0.2874421179294586, "learning_rate": 1.660958904109589e-05, "loss": 1.6458, "step": 1950 }, { "epoch": 0.6680077038305158, "grad_norm": 0.2538755238056183, "learning_rate": 1.6592465753424658e-05, "loss": 1.5164, "step": 1951 }, { "epoch": 0.6683500962978814, "grad_norm": 0.25318387150764465, "learning_rate": 1.6575342465753428e-05, "loss": 1.6412, "step": 1952 }, { "epoch": 0.6686924887652471, "grad_norm": 0.28974592685699463, "learning_rate": 1.6558219178082194e-05, "loss": 1.4793, "step": 1953 }, { "epoch": 0.6690348812326129, "grad_norm": 0.2618882358074188, "learning_rate": 1.654109589041096e-05, "loss": 1.5044, "step": 1954 }, { "epoch": 0.6693772736999786, "grad_norm": 0.2531030476093292, "learning_rate": 1.6523972602739727e-05, "loss": 1.6628, "step": 1955 }, { "epoch": 0.6697196661673444, "grad_norm": 0.26435860991477966, "learning_rate": 1.6506849315068494e-05, "loss": 1.6397, "step": 1956 }, { "epoch": 0.67006205863471, "grad_norm": 0.2588629722595215, "learning_rate": 1.648972602739726e-05, "loss": 1.4901, "step": 1957 }, { "epoch": 0.6704044511020758, "grad_norm": 0.2576694190502167, "learning_rate": 1.6472602739726027e-05, "loss": 1.5018, "step": 1958 }, { "epoch": 0.6707468435694415, "grad_norm": 0.2682690918445587, "learning_rate": 1.6455479452054794e-05, "loss": 1.5102, "step": 1959 }, { "epoch": 0.6710892360368071, "grad_norm": 0.25423675775527954, "learning_rate": 1.643835616438356e-05, "loss": 1.6207, "step": 1960 }, { "epoch": 0.6714316285041729, "grad_norm": 0.25862011313438416, "learning_rate": 1.642123287671233e-05, "loss": 1.6956, "step": 1961 }, { "epoch": 0.6717740209715386, "grad_norm": 0.24381040036678314, "learning_rate": 1.6404109589041096e-05, "loss": 1.5785, "step": 1962 }, { "epoch": 0.6721164134389044, "grad_norm": 0.2545781433582306, "learning_rate": 1.6386986301369863e-05, "loss": 1.6909, "step": 1963 }, { "epoch": 0.67245880590627, "grad_norm": 0.2956506311893463, "learning_rate": 1.636986301369863e-05, "loss": 1.6212, "step": 1964 }, { "epoch": 0.6728011983736358, "grad_norm": 0.24353796243667603, "learning_rate": 1.6352739726027396e-05, "loss": 1.5599, "step": 1965 }, { "epoch": 0.6731435908410015, "grad_norm": 0.2571417987346649, "learning_rate": 1.6335616438356166e-05, "loss": 1.6448, "step": 1966 }, { "epoch": 0.6734859833083672, "grad_norm": 0.3043387830257416, "learning_rate": 1.6318493150684932e-05, "loss": 1.595, "step": 1967 }, { "epoch": 0.673828375775733, "grad_norm": 0.2597902715206146, "learning_rate": 1.63013698630137e-05, "loss": 1.6662, "step": 1968 }, { "epoch": 0.6741707682430986, "grad_norm": 0.2343287318944931, "learning_rate": 1.6284246575342466e-05, "loss": 1.6545, "step": 1969 }, { "epoch": 0.6745131607104644, "grad_norm": 0.2495015263557434, "learning_rate": 1.6267123287671235e-05, "loss": 1.5691, "step": 1970 }, { "epoch": 0.6748555531778301, "grad_norm": 0.2806970477104187, "learning_rate": 1.6250000000000002e-05, "loss": 1.5695, "step": 1971 }, { "epoch": 0.6751979456451958, "grad_norm": 0.26707878708839417, "learning_rate": 1.623287671232877e-05, "loss": 1.6371, "step": 1972 }, { "epoch": 0.6755403381125615, "grad_norm": 0.25022363662719727, "learning_rate": 1.6215753424657535e-05, "loss": 1.5255, "step": 1973 }, { "epoch": 0.6758827305799272, "grad_norm": 0.23766542971134186, "learning_rate": 1.61986301369863e-05, "loss": 1.6527, "step": 1974 }, { "epoch": 0.676225123047293, "grad_norm": 0.2777051329612732, "learning_rate": 1.618150684931507e-05, "loss": 1.4711, "step": 1975 }, { "epoch": 0.6765675155146587, "grad_norm": 0.24787728488445282, "learning_rate": 1.6164383561643838e-05, "loss": 1.5258, "step": 1976 }, { "epoch": 0.6769099079820244, "grad_norm": 0.2461623102426529, "learning_rate": 1.6147260273972604e-05, "loss": 1.5613, "step": 1977 }, { "epoch": 0.6772523004493901, "grad_norm": 0.249303936958313, "learning_rate": 1.613013698630137e-05, "loss": 1.4777, "step": 1978 }, { "epoch": 0.6775946929167558, "grad_norm": 0.25251856446266174, "learning_rate": 1.6113013698630138e-05, "loss": 1.5358, "step": 1979 }, { "epoch": 0.6779370853841216, "grad_norm": 0.2563111484050751, "learning_rate": 1.6095890410958904e-05, "loss": 1.6222, "step": 1980 }, { "epoch": 0.6782794778514872, "grad_norm": 0.25498339533805847, "learning_rate": 1.607876712328767e-05, "loss": 1.5225, "step": 1981 }, { "epoch": 0.678621870318853, "grad_norm": 0.27756085991859436, "learning_rate": 1.6061643835616437e-05, "loss": 1.569, "step": 1982 }, { "epoch": 0.6789642627862187, "grad_norm": 0.2435324490070343, "learning_rate": 1.6044520547945204e-05, "loss": 1.5424, "step": 1983 }, { "epoch": 0.6793066552535845, "grad_norm": 0.26373839378356934, "learning_rate": 1.6027397260273974e-05, "loss": 1.5003, "step": 1984 }, { "epoch": 0.6796490477209501, "grad_norm": 0.263293594121933, "learning_rate": 1.601027397260274e-05, "loss": 1.5983, "step": 1985 }, { "epoch": 0.6799914401883158, "grad_norm": 0.24036607146263123, "learning_rate": 1.5993150684931507e-05, "loss": 1.5843, "step": 1986 }, { "epoch": 0.6803338326556816, "grad_norm": 0.2465442717075348, "learning_rate": 1.5976027397260273e-05, "loss": 1.5878, "step": 1987 }, { "epoch": 0.6806762251230473, "grad_norm": 0.259977251291275, "learning_rate": 1.5958904109589043e-05, "loss": 1.5012, "step": 1988 }, { "epoch": 0.681018617590413, "grad_norm": 0.25949224829673767, "learning_rate": 1.594178082191781e-05, "loss": 1.5546, "step": 1989 }, { "epoch": 0.6813610100577787, "grad_norm": 0.24306459724903107, "learning_rate": 1.5924657534246576e-05, "loss": 1.6005, "step": 1990 }, { "epoch": 0.6817034025251445, "grad_norm": 0.29262369871139526, "learning_rate": 1.5907534246575343e-05, "loss": 1.5619, "step": 1991 }, { "epoch": 0.6820457949925102, "grad_norm": 0.23706042766571045, "learning_rate": 1.5890410958904112e-05, "loss": 1.5293, "step": 1992 }, { "epoch": 0.6823881874598758, "grad_norm": 0.26131659746170044, "learning_rate": 1.587328767123288e-05, "loss": 1.5737, "step": 1993 }, { "epoch": 0.6827305799272416, "grad_norm": 0.2657162845134735, "learning_rate": 1.5856164383561646e-05, "loss": 1.6589, "step": 1994 }, { "epoch": 0.6830729723946073, "grad_norm": 0.24658535420894623, "learning_rate": 1.5839041095890412e-05, "loss": 1.5923, "step": 1995 }, { "epoch": 0.6834153648619731, "grad_norm": 0.25277984142303467, "learning_rate": 1.582191780821918e-05, "loss": 1.692, "step": 1996 }, { "epoch": 0.6837577573293387, "grad_norm": 0.26226678490638733, "learning_rate": 1.580479452054795e-05, "loss": 1.5528, "step": 1997 }, { "epoch": 0.6841001497967045, "grad_norm": 0.25557392835617065, "learning_rate": 1.5787671232876715e-05, "loss": 1.5951, "step": 1998 }, { "epoch": 0.6844425422640702, "grad_norm": 0.25729429721832275, "learning_rate": 1.577054794520548e-05, "loss": 1.5713, "step": 1999 }, { "epoch": 0.6847849347314359, "grad_norm": 0.24768178164958954, "learning_rate": 1.5753424657534248e-05, "loss": 1.5536, "step": 2000 }, { "epoch": 0.6851273271988016, "grad_norm": 0.2782338559627533, "learning_rate": 1.5736301369863015e-05, "loss": 1.5456, "step": 2001 }, { "epoch": 0.6854697196661673, "grad_norm": 0.2498759627342224, "learning_rate": 1.571917808219178e-05, "loss": 1.6074, "step": 2002 }, { "epoch": 0.6858121121335331, "grad_norm": 0.24261674284934998, "learning_rate": 1.5702054794520548e-05, "loss": 1.5612, "step": 2003 }, { "epoch": 0.6861545046008988, "grad_norm": 0.25065386295318604, "learning_rate": 1.5684931506849314e-05, "loss": 1.5119, "step": 2004 }, { "epoch": 0.6864968970682644, "grad_norm": 0.2774777114391327, "learning_rate": 1.566780821917808e-05, "loss": 1.5736, "step": 2005 }, { "epoch": 0.6868392895356302, "grad_norm": 0.28180718421936035, "learning_rate": 1.565068493150685e-05, "loss": 1.4624, "step": 2006 }, { "epoch": 0.6871816820029959, "grad_norm": 0.5347707271575928, "learning_rate": 1.5633561643835617e-05, "loss": 1.5355, "step": 2007 }, { "epoch": 0.6875240744703617, "grad_norm": 0.24332860112190247, "learning_rate": 1.5616438356164384e-05, "loss": 1.5337, "step": 2008 }, { "epoch": 0.6878664669377273, "grad_norm": 0.2829281687736511, "learning_rate": 1.559931506849315e-05, "loss": 1.521, "step": 2009 }, { "epoch": 0.6882088594050931, "grad_norm": 0.2735847532749176, "learning_rate": 1.558219178082192e-05, "loss": 1.5759, "step": 2010 }, { "epoch": 0.6885512518724588, "grad_norm": 0.29398006200790405, "learning_rate": 1.5565068493150687e-05, "loss": 1.7019, "step": 2011 }, { "epoch": 0.6888936443398245, "grad_norm": 0.2860855758190155, "learning_rate": 1.5547945205479453e-05, "loss": 1.6213, "step": 2012 }, { "epoch": 0.6892360368071903, "grad_norm": 0.2761414051055908, "learning_rate": 1.553082191780822e-05, "loss": 1.4769, "step": 2013 }, { "epoch": 0.6895784292745559, "grad_norm": 0.26225540041923523, "learning_rate": 1.5513698630136986e-05, "loss": 1.484, "step": 2014 }, { "epoch": 0.6899208217419217, "grad_norm": 0.2915465831756592, "learning_rate": 1.5496575342465756e-05, "loss": 1.4693, "step": 2015 }, { "epoch": 0.6902632142092874, "grad_norm": 0.30932584404945374, "learning_rate": 1.5479452054794523e-05, "loss": 1.6308, "step": 2016 }, { "epoch": 0.6906056066766532, "grad_norm": 0.2876477539539337, "learning_rate": 1.546232876712329e-05, "loss": 1.6621, "step": 2017 }, { "epoch": 0.6909479991440188, "grad_norm": 0.26623743772506714, "learning_rate": 1.5445205479452056e-05, "loss": 1.6548, "step": 2018 }, { "epoch": 0.6912903916113845, "grad_norm": 0.27221834659576416, "learning_rate": 1.5428082191780825e-05, "loss": 1.5331, "step": 2019 }, { "epoch": 0.6916327840787503, "grad_norm": 0.2906191051006317, "learning_rate": 1.541095890410959e-05, "loss": 1.6627, "step": 2020 }, { "epoch": 0.691975176546116, "grad_norm": 0.29224392771720886, "learning_rate": 1.5393835616438355e-05, "loss": 1.6122, "step": 2021 }, { "epoch": 0.6923175690134817, "grad_norm": 0.28520485758781433, "learning_rate": 1.537671232876712e-05, "loss": 1.5292, "step": 2022 }, { "epoch": 0.6926599614808474, "grad_norm": 0.2762230634689331, "learning_rate": 1.5359589041095888e-05, "loss": 1.5701, "step": 2023 }, { "epoch": 0.6930023539482132, "grad_norm": 0.2525821626186371, "learning_rate": 1.5342465753424658e-05, "loss": 1.6177, "step": 2024 }, { "epoch": 0.6933447464155789, "grad_norm": 0.2723167836666107, "learning_rate": 1.5325342465753425e-05, "loss": 1.5063, "step": 2025 }, { "epoch": 0.6936871388829445, "grad_norm": 0.2612285017967224, "learning_rate": 1.530821917808219e-05, "loss": 1.4484, "step": 2026 }, { "epoch": 0.6940295313503103, "grad_norm": 0.26502710580825806, "learning_rate": 1.5291095890410958e-05, "loss": 1.4546, "step": 2027 }, { "epoch": 0.694371923817676, "grad_norm": 0.25304746627807617, "learning_rate": 1.5273972602739728e-05, "loss": 1.5552, "step": 2028 }, { "epoch": 0.6947143162850418, "grad_norm": 0.27534130215644836, "learning_rate": 1.5256849315068494e-05, "loss": 1.6006, "step": 2029 }, { "epoch": 0.6950567087524074, "grad_norm": 0.24848972260951996, "learning_rate": 1.523972602739726e-05, "loss": 1.4888, "step": 2030 }, { "epoch": 0.6953991012197732, "grad_norm": 0.25061893463134766, "learning_rate": 1.5222602739726027e-05, "loss": 1.6208, "step": 2031 }, { "epoch": 0.6957414936871389, "grad_norm": 0.26621490716934204, "learning_rate": 1.5205479452054797e-05, "loss": 1.5526, "step": 2032 }, { "epoch": 0.6960838861545046, "grad_norm": 0.2546326816082001, "learning_rate": 1.5188356164383564e-05, "loss": 1.4814, "step": 2033 }, { "epoch": 0.6964262786218703, "grad_norm": 0.2597961723804474, "learning_rate": 1.517123287671233e-05, "loss": 1.6775, "step": 2034 }, { "epoch": 0.696768671089236, "grad_norm": 0.2755800485610962, "learning_rate": 1.5154109589041097e-05, "loss": 1.5167, "step": 2035 }, { "epoch": 0.6971110635566018, "grad_norm": 0.2838727533817291, "learning_rate": 1.5136986301369863e-05, "loss": 1.6399, "step": 2036 }, { "epoch": 0.6974534560239675, "grad_norm": 0.2674304246902466, "learning_rate": 1.5119863013698631e-05, "loss": 1.7138, "step": 2037 }, { "epoch": 0.6977958484913331, "grad_norm": 0.2503063678741455, "learning_rate": 1.5102739726027398e-05, "loss": 1.5617, "step": 2038 }, { "epoch": 0.6981382409586989, "grad_norm": 0.2678525447845459, "learning_rate": 1.5085616438356164e-05, "loss": 1.5977, "step": 2039 }, { "epoch": 0.6984806334260646, "grad_norm": 0.25200408697128296, "learning_rate": 1.5068493150684931e-05, "loss": 1.5738, "step": 2040 }, { "epoch": 0.6988230258934304, "grad_norm": 0.2532903850078583, "learning_rate": 1.50513698630137e-05, "loss": 1.5854, "step": 2041 }, { "epoch": 0.699165418360796, "grad_norm": 0.3525558412075043, "learning_rate": 1.5034246575342467e-05, "loss": 1.6339, "step": 2042 }, { "epoch": 0.6995078108281618, "grad_norm": 0.2654409408569336, "learning_rate": 1.5017123287671234e-05, "loss": 1.7072, "step": 2043 }, { "epoch": 0.6998502032955275, "grad_norm": 0.2441091388463974, "learning_rate": 1.5e-05, "loss": 1.6014, "step": 2044 }, { "epoch": 0.7001925957628932, "grad_norm": 0.25416913628578186, "learning_rate": 1.4982876712328767e-05, "loss": 1.6101, "step": 2045 }, { "epoch": 0.700534988230259, "grad_norm": 0.2631846070289612, "learning_rate": 1.4965753424657535e-05, "loss": 1.6924, "step": 2046 }, { "epoch": 0.7008773806976246, "grad_norm": 0.2862163186073303, "learning_rate": 1.4948630136986302e-05, "loss": 1.613, "step": 2047 }, { "epoch": 0.7012197731649904, "grad_norm": 0.2488991916179657, "learning_rate": 1.4931506849315068e-05, "loss": 1.5584, "step": 2048 }, { "epoch": 0.7015621656323561, "grad_norm": 0.32565411925315857, "learning_rate": 1.4914383561643835e-05, "loss": 1.5269, "step": 2049 }, { "epoch": 0.7019045580997219, "grad_norm": 0.25938135385513306, "learning_rate": 1.4897260273972605e-05, "loss": 1.5004, "step": 2050 }, { "epoch": 0.7022469505670875, "grad_norm": 0.25983816385269165, "learning_rate": 1.4880136986301371e-05, "loss": 1.6768, "step": 2051 }, { "epoch": 0.7025893430344532, "grad_norm": 0.25985971093177795, "learning_rate": 1.4863013698630138e-05, "loss": 1.7348, "step": 2052 }, { "epoch": 0.702931735501819, "grad_norm": 0.2889842391014099, "learning_rate": 1.4845890410958904e-05, "loss": 1.653, "step": 2053 }, { "epoch": 0.7032741279691846, "grad_norm": 0.23381905257701874, "learning_rate": 1.482876712328767e-05, "loss": 1.59, "step": 2054 }, { "epoch": 0.7036165204365504, "grad_norm": 0.2567083239555359, "learning_rate": 1.481164383561644e-05, "loss": 1.6117, "step": 2055 }, { "epoch": 0.7039589129039161, "grad_norm": 0.2690364122390747, "learning_rate": 1.4794520547945207e-05, "loss": 1.5979, "step": 2056 }, { "epoch": 0.7043013053712819, "grad_norm": 0.25131818652153015, "learning_rate": 1.4777397260273974e-05, "loss": 1.5123, "step": 2057 }, { "epoch": 0.7046436978386476, "grad_norm": 0.24301771819591522, "learning_rate": 1.476027397260274e-05, "loss": 1.6076, "step": 2058 }, { "epoch": 0.7049860903060132, "grad_norm": 0.2594466805458069, "learning_rate": 1.4743150684931508e-05, "loss": 1.6563, "step": 2059 }, { "epoch": 0.705328482773379, "grad_norm": 0.2568648159503937, "learning_rate": 1.4726027397260275e-05, "loss": 1.5661, "step": 2060 }, { "epoch": 0.7056708752407447, "grad_norm": 0.28389400243759155, "learning_rate": 1.4708904109589041e-05, "loss": 1.5336, "step": 2061 }, { "epoch": 0.7060132677081105, "grad_norm": 0.25658687949180603, "learning_rate": 1.4691780821917808e-05, "loss": 1.6062, "step": 2062 }, { "epoch": 0.7063556601754761, "grad_norm": 0.28182122111320496, "learning_rate": 1.4674657534246574e-05, "loss": 1.6227, "step": 2063 }, { "epoch": 0.7066980526428419, "grad_norm": 0.2700038254261017, "learning_rate": 1.4657534246575344e-05, "loss": 1.6552, "step": 2064 }, { "epoch": 0.7070404451102076, "grad_norm": 0.2638598680496216, "learning_rate": 1.4640410958904111e-05, "loss": 1.6044, "step": 2065 }, { "epoch": 0.7073828375775733, "grad_norm": 0.2452877163887024, "learning_rate": 1.4623287671232877e-05, "loss": 1.5561, "step": 2066 }, { "epoch": 0.707725230044939, "grad_norm": 0.26181289553642273, "learning_rate": 1.4606164383561644e-05, "loss": 1.4907, "step": 2067 }, { "epoch": 0.7080676225123047, "grad_norm": 0.2621123492717743, "learning_rate": 1.4589041095890412e-05, "loss": 1.6775, "step": 2068 }, { "epoch": 0.7084100149796705, "grad_norm": 0.28422755002975464, "learning_rate": 1.4571917808219179e-05, "loss": 1.5894, "step": 2069 }, { "epoch": 0.7087524074470362, "grad_norm": 0.2832430601119995, "learning_rate": 1.4554794520547945e-05, "loss": 1.5094, "step": 2070 }, { "epoch": 0.7090947999144018, "grad_norm": 0.26982057094573975, "learning_rate": 1.4537671232876712e-05, "loss": 1.5224, "step": 2071 }, { "epoch": 0.7094371923817676, "grad_norm": 0.25673747062683105, "learning_rate": 1.4520547945205478e-05, "loss": 1.6312, "step": 2072 }, { "epoch": 0.7097795848491333, "grad_norm": 0.2636537551879883, "learning_rate": 1.4503424657534248e-05, "loss": 1.6294, "step": 2073 }, { "epoch": 0.7101219773164991, "grad_norm": 0.27541449666023254, "learning_rate": 1.4486301369863015e-05, "loss": 1.4857, "step": 2074 }, { "epoch": 0.7104643697838647, "grad_norm": 0.2530026137828827, "learning_rate": 1.4469178082191781e-05, "loss": 1.6186, "step": 2075 }, { "epoch": 0.7108067622512305, "grad_norm": 0.34641698002815247, "learning_rate": 1.4452054794520548e-05, "loss": 1.5247, "step": 2076 }, { "epoch": 0.7111491547185962, "grad_norm": 0.29727989435195923, "learning_rate": 1.4434931506849318e-05, "loss": 1.5878, "step": 2077 }, { "epoch": 0.7114915471859619, "grad_norm": 0.26513928174972534, "learning_rate": 1.4417808219178084e-05, "loss": 1.5478, "step": 2078 }, { "epoch": 0.7118339396533276, "grad_norm": 0.25723400712013245, "learning_rate": 1.440068493150685e-05, "loss": 1.5212, "step": 2079 }, { "epoch": 0.7121763321206933, "grad_norm": 0.2750703692436218, "learning_rate": 1.4383561643835617e-05, "loss": 1.5278, "step": 2080 }, { "epoch": 0.7125187245880591, "grad_norm": 0.2560664713382721, "learning_rate": 1.4366438356164385e-05, "loss": 1.5173, "step": 2081 }, { "epoch": 0.7128611170554248, "grad_norm": 0.2863115966320038, "learning_rate": 1.4349315068493152e-05, "loss": 1.5474, "step": 2082 }, { "epoch": 0.7132035095227905, "grad_norm": 0.2831278443336487, "learning_rate": 1.4332191780821918e-05, "loss": 1.6514, "step": 2083 }, { "epoch": 0.7135459019901562, "grad_norm": 0.32025644183158875, "learning_rate": 1.4315068493150685e-05, "loss": 1.5902, "step": 2084 }, { "epoch": 0.7138882944575219, "grad_norm": 0.2672574520111084, "learning_rate": 1.4297945205479451e-05, "loss": 1.5339, "step": 2085 }, { "epoch": 0.7142306869248877, "grad_norm": 0.2832435369491577, "learning_rate": 1.4280821917808221e-05, "loss": 1.5738, "step": 2086 }, { "epoch": 0.7145730793922533, "grad_norm": 0.2577243745326996, "learning_rate": 1.4263698630136988e-05, "loss": 1.5632, "step": 2087 }, { "epoch": 0.7149154718596191, "grad_norm": 0.26786017417907715, "learning_rate": 1.4246575342465754e-05, "loss": 1.6415, "step": 2088 }, { "epoch": 0.7152578643269848, "grad_norm": 0.2842353582382202, "learning_rate": 1.4229452054794521e-05, "loss": 1.6155, "step": 2089 }, { "epoch": 0.7156002567943506, "grad_norm": 0.2508132755756378, "learning_rate": 1.421232876712329e-05, "loss": 1.5659, "step": 2090 }, { "epoch": 0.7159426492617162, "grad_norm": 0.31312957406044006, "learning_rate": 1.4195205479452056e-05, "loss": 1.5504, "step": 2091 }, { "epoch": 0.7162850417290819, "grad_norm": 0.2715624272823334, "learning_rate": 1.4178082191780822e-05, "loss": 1.6515, "step": 2092 }, { "epoch": 0.7166274341964477, "grad_norm": 0.25817176699638367, "learning_rate": 1.4160958904109589e-05, "loss": 1.5249, "step": 2093 }, { "epoch": 0.7169698266638134, "grad_norm": 0.24463458359241486, "learning_rate": 1.4143835616438355e-05, "loss": 1.4799, "step": 2094 }, { "epoch": 0.7173122191311792, "grad_norm": 0.26421549916267395, "learning_rate": 1.4126712328767125e-05, "loss": 1.6266, "step": 2095 }, { "epoch": 0.7176546115985448, "grad_norm": 0.2808419167995453, "learning_rate": 1.4109589041095892e-05, "loss": 1.6297, "step": 2096 }, { "epoch": 0.7179970040659106, "grad_norm": 0.27540233731269836, "learning_rate": 1.4092465753424658e-05, "loss": 1.6568, "step": 2097 }, { "epoch": 0.7183393965332763, "grad_norm": 0.27033793926239014, "learning_rate": 1.4075342465753425e-05, "loss": 1.5957, "step": 2098 }, { "epoch": 0.718681789000642, "grad_norm": 0.2604904770851135, "learning_rate": 1.4058219178082193e-05, "loss": 1.6104, "step": 2099 }, { "epoch": 0.7190241814680077, "grad_norm": 0.249342143535614, "learning_rate": 1.404109589041096e-05, "loss": 1.5053, "step": 2100 }, { "epoch": 0.7193665739353734, "grad_norm": 0.2715243995189667, "learning_rate": 1.4023972602739726e-05, "loss": 1.6017, "step": 2101 }, { "epoch": 0.7197089664027392, "grad_norm": 0.26346147060394287, "learning_rate": 1.4006849315068493e-05, "loss": 1.6121, "step": 2102 }, { "epoch": 0.7200513588701049, "grad_norm": 0.2623075544834137, "learning_rate": 1.3989726027397259e-05, "loss": 1.5571, "step": 2103 }, { "epoch": 0.7203937513374705, "grad_norm": 0.25506675243377686, "learning_rate": 1.3972602739726029e-05, "loss": 1.4542, "step": 2104 }, { "epoch": 0.7207361438048363, "grad_norm": 0.2747322916984558, "learning_rate": 1.3955479452054795e-05, "loss": 1.6431, "step": 2105 }, { "epoch": 0.721078536272202, "grad_norm": 0.2966254949569702, "learning_rate": 1.3938356164383562e-05, "loss": 1.4964, "step": 2106 }, { "epoch": 0.7214209287395678, "grad_norm": 0.26297664642333984, "learning_rate": 1.3921232876712328e-05, "loss": 1.5676, "step": 2107 }, { "epoch": 0.7217633212069334, "grad_norm": 0.2579248547554016, "learning_rate": 1.3904109589041098e-05, "loss": 1.6482, "step": 2108 }, { "epoch": 0.7221057136742992, "grad_norm": 0.2682496905326843, "learning_rate": 1.3886986301369865e-05, "loss": 1.6152, "step": 2109 }, { "epoch": 0.7224481061416649, "grad_norm": 0.38905996084213257, "learning_rate": 1.3869863013698631e-05, "loss": 1.5251, "step": 2110 }, { "epoch": 0.7227904986090306, "grad_norm": 0.2691227197647095, "learning_rate": 1.3852739726027398e-05, "loss": 1.6318, "step": 2111 }, { "epoch": 0.7231328910763963, "grad_norm": 0.29711592197418213, "learning_rate": 1.3835616438356164e-05, "loss": 1.7085, "step": 2112 }, { "epoch": 0.723475283543762, "grad_norm": 0.2914024889469147, "learning_rate": 1.3818493150684933e-05, "loss": 1.4825, "step": 2113 }, { "epoch": 0.7238176760111278, "grad_norm": 0.2804053723812103, "learning_rate": 1.38013698630137e-05, "loss": 1.6271, "step": 2114 }, { "epoch": 0.7241600684784935, "grad_norm": 0.30433928966522217, "learning_rate": 1.3784246575342466e-05, "loss": 1.5135, "step": 2115 }, { "epoch": 0.7245024609458592, "grad_norm": 0.26897379755973816, "learning_rate": 1.3767123287671232e-05, "loss": 1.6623, "step": 2116 }, { "epoch": 0.7248448534132249, "grad_norm": 0.2402353137731552, "learning_rate": 1.3750000000000002e-05, "loss": 1.6625, "step": 2117 }, { "epoch": 0.7251872458805906, "grad_norm": 0.2867681384086609, "learning_rate": 1.3732876712328769e-05, "loss": 1.6068, "step": 2118 }, { "epoch": 0.7255296383479564, "grad_norm": 0.26743242144584656, "learning_rate": 1.3715753424657535e-05, "loss": 1.618, "step": 2119 }, { "epoch": 0.725872030815322, "grad_norm": 0.2872251868247986, "learning_rate": 1.3698630136986302e-05, "loss": 1.5576, "step": 2120 }, { "epoch": 0.7262144232826878, "grad_norm": 0.2669619023799896, "learning_rate": 1.3681506849315068e-05, "loss": 1.4395, "step": 2121 }, { "epoch": 0.7265568157500535, "grad_norm": 0.263827383518219, "learning_rate": 1.3664383561643836e-05, "loss": 1.5506, "step": 2122 }, { "epoch": 0.7268992082174193, "grad_norm": 0.2776315212249756, "learning_rate": 1.3647260273972603e-05, "loss": 1.4362, "step": 2123 }, { "epoch": 0.7272416006847849, "grad_norm": 0.296470046043396, "learning_rate": 1.363013698630137e-05, "loss": 1.6622, "step": 2124 }, { "epoch": 0.7275839931521506, "grad_norm": 0.26814761757850647, "learning_rate": 1.3613013698630136e-05, "loss": 1.5934, "step": 2125 }, { "epoch": 0.7279263856195164, "grad_norm": 0.3541673719882965, "learning_rate": 1.3595890410958906e-05, "loss": 1.6435, "step": 2126 }, { "epoch": 0.7282687780868821, "grad_norm": 0.27484214305877686, "learning_rate": 1.3578767123287672e-05, "loss": 1.6615, "step": 2127 }, { "epoch": 0.7286111705542478, "grad_norm": 0.29939505457878113, "learning_rate": 1.3561643835616439e-05, "loss": 1.5943, "step": 2128 }, { "epoch": 0.7289535630216135, "grad_norm": 0.2863679528236389, "learning_rate": 1.3544520547945206e-05, "loss": 1.4805, "step": 2129 }, { "epoch": 0.7292959554889793, "grad_norm": 0.2723521292209625, "learning_rate": 1.3527397260273974e-05, "loss": 1.6705, "step": 2130 }, { "epoch": 0.729638347956345, "grad_norm": 0.263724684715271, "learning_rate": 1.351027397260274e-05, "loss": 1.5439, "step": 2131 }, { "epoch": 0.7299807404237106, "grad_norm": 0.28939250111579895, "learning_rate": 1.3493150684931507e-05, "loss": 1.7238, "step": 2132 }, { "epoch": 0.7303231328910764, "grad_norm": 0.2517257332801819, "learning_rate": 1.3476027397260273e-05, "loss": 1.5544, "step": 2133 }, { "epoch": 0.7306655253584421, "grad_norm": 0.27395716309547424, "learning_rate": 1.345890410958904e-05, "loss": 1.5824, "step": 2134 }, { "epoch": 0.7310079178258079, "grad_norm": 0.26879164576530457, "learning_rate": 1.344178082191781e-05, "loss": 1.5423, "step": 2135 }, { "epoch": 0.7313503102931735, "grad_norm": 0.2587811350822449, "learning_rate": 1.3424657534246576e-05, "loss": 1.5412, "step": 2136 }, { "epoch": 0.7316927027605392, "grad_norm": 0.2640549838542938, "learning_rate": 1.3407534246575343e-05, "loss": 1.656, "step": 2137 }, { "epoch": 0.732035095227905, "grad_norm": 0.2794036269187927, "learning_rate": 1.339041095890411e-05, "loss": 1.5627, "step": 2138 }, { "epoch": 0.7323774876952707, "grad_norm": 0.280062198638916, "learning_rate": 1.337328767123288e-05, "loss": 1.5009, "step": 2139 }, { "epoch": 0.7327198801626364, "grad_norm": 0.2703433930873871, "learning_rate": 1.3356164383561646e-05, "loss": 1.6795, "step": 2140 }, { "epoch": 0.7330622726300021, "grad_norm": 0.2589283585548401, "learning_rate": 1.3339041095890412e-05, "loss": 1.5316, "step": 2141 }, { "epoch": 0.7334046650973679, "grad_norm": 0.3001638948917389, "learning_rate": 1.3321917808219179e-05, "loss": 1.5587, "step": 2142 }, { "epoch": 0.7337470575647336, "grad_norm": 0.26155710220336914, "learning_rate": 1.3304794520547945e-05, "loss": 1.509, "step": 2143 }, { "epoch": 0.7340894500320992, "grad_norm": 0.2597780227661133, "learning_rate": 1.3287671232876714e-05, "loss": 1.4969, "step": 2144 }, { "epoch": 0.734431842499465, "grad_norm": 0.27202972769737244, "learning_rate": 1.327054794520548e-05, "loss": 1.5284, "step": 2145 }, { "epoch": 0.7347742349668307, "grad_norm": 0.30399683117866516, "learning_rate": 1.3253424657534247e-05, "loss": 1.5701, "step": 2146 }, { "epoch": 0.7351166274341965, "grad_norm": 0.27715975046157837, "learning_rate": 1.3236301369863013e-05, "loss": 1.6374, "step": 2147 }, { "epoch": 0.7354590199015621, "grad_norm": 0.289141982793808, "learning_rate": 1.3219178082191783e-05, "loss": 1.6005, "step": 2148 }, { "epoch": 0.7358014123689279, "grad_norm": 0.2891051471233368, "learning_rate": 1.320205479452055e-05, "loss": 1.551, "step": 2149 }, { "epoch": 0.7361438048362936, "grad_norm": 0.27955347299575806, "learning_rate": 1.3184931506849316e-05, "loss": 1.6072, "step": 2150 }, { "epoch": 0.7364861973036593, "grad_norm": 0.26083534955978394, "learning_rate": 1.3167808219178083e-05, "loss": 1.5695, "step": 2151 }, { "epoch": 0.736828589771025, "grad_norm": 0.27135106921195984, "learning_rate": 1.3150684931506849e-05, "loss": 1.6557, "step": 2152 }, { "epoch": 0.7371709822383907, "grad_norm": 0.4470581114292145, "learning_rate": 1.3133561643835617e-05, "loss": 1.579, "step": 2153 }, { "epoch": 0.7375133747057565, "grad_norm": 0.2695130407810211, "learning_rate": 1.3116438356164384e-05, "loss": 1.6215, "step": 2154 }, { "epoch": 0.7378557671731222, "grad_norm": 0.2674894630908966, "learning_rate": 1.309931506849315e-05, "loss": 1.5409, "step": 2155 }, { "epoch": 0.738198159640488, "grad_norm": 0.2771022915840149, "learning_rate": 1.3082191780821917e-05, "loss": 1.6069, "step": 2156 }, { "epoch": 0.7385405521078536, "grad_norm": 0.28364595770835876, "learning_rate": 1.3065068493150687e-05, "loss": 1.5152, "step": 2157 }, { "epoch": 0.7388829445752193, "grad_norm": 0.2658699154853821, "learning_rate": 1.3047945205479453e-05, "loss": 1.55, "step": 2158 }, { "epoch": 0.7392253370425851, "grad_norm": 0.2650466859340668, "learning_rate": 1.303082191780822e-05, "loss": 1.4935, "step": 2159 }, { "epoch": 0.7395677295099508, "grad_norm": 0.28956303000450134, "learning_rate": 1.3013698630136986e-05, "loss": 1.6303, "step": 2160 }, { "epoch": 0.7399101219773165, "grad_norm": 0.2732807695865631, "learning_rate": 1.2996575342465753e-05, "loss": 1.5551, "step": 2161 }, { "epoch": 0.7402525144446822, "grad_norm": 0.2572910785675049, "learning_rate": 1.2979452054794523e-05, "loss": 1.5642, "step": 2162 }, { "epoch": 0.740594906912048, "grad_norm": 0.2850058972835541, "learning_rate": 1.296232876712329e-05, "loss": 1.4037, "step": 2163 }, { "epoch": 0.7409372993794137, "grad_norm": 0.32316523790359497, "learning_rate": 1.2945205479452056e-05, "loss": 1.6868, "step": 2164 }, { "epoch": 0.7412796918467793, "grad_norm": 0.28773996233940125, "learning_rate": 1.2928082191780822e-05, "loss": 1.5472, "step": 2165 }, { "epoch": 0.7416220843141451, "grad_norm": 0.2530982494354248, "learning_rate": 1.291095890410959e-05, "loss": 1.5742, "step": 2166 }, { "epoch": 0.7419644767815108, "grad_norm": 0.27355462312698364, "learning_rate": 1.2893835616438357e-05, "loss": 1.4988, "step": 2167 }, { "epoch": 0.7423068692488766, "grad_norm": 0.26394400000572205, "learning_rate": 1.2876712328767124e-05, "loss": 1.5103, "step": 2168 }, { "epoch": 0.7426492617162422, "grad_norm": 0.3008158802986145, "learning_rate": 1.285958904109589e-05, "loss": 1.7228, "step": 2169 }, { "epoch": 0.7429916541836079, "grad_norm": 0.26472947001457214, "learning_rate": 1.284246575342466e-05, "loss": 1.5971, "step": 2170 }, { "epoch": 0.7433340466509737, "grad_norm": 0.26651516556739807, "learning_rate": 1.2825342465753427e-05, "loss": 1.484, "step": 2171 }, { "epoch": 0.7436764391183394, "grad_norm": 0.2697823643684387, "learning_rate": 1.2808219178082193e-05, "loss": 1.5645, "step": 2172 }, { "epoch": 0.7440188315857051, "grad_norm": 0.27258095145225525, "learning_rate": 1.279109589041096e-05, "loss": 1.6125, "step": 2173 }, { "epoch": 0.7443612240530708, "grad_norm": 0.25451579689979553, "learning_rate": 1.2773972602739726e-05, "loss": 1.4446, "step": 2174 }, { "epoch": 0.7447036165204366, "grad_norm": 0.26929229497909546, "learning_rate": 1.2756849315068494e-05, "loss": 1.6166, "step": 2175 }, { "epoch": 0.7450460089878023, "grad_norm": 0.26167070865631104, "learning_rate": 1.273972602739726e-05, "loss": 1.6152, "step": 2176 }, { "epoch": 0.7453884014551679, "grad_norm": 0.27691420912742615, "learning_rate": 1.2722602739726027e-05, "loss": 1.5663, "step": 2177 }, { "epoch": 0.7457307939225337, "grad_norm": 0.2763233482837677, "learning_rate": 1.2705479452054794e-05, "loss": 1.6187, "step": 2178 }, { "epoch": 0.7460731863898994, "grad_norm": 0.2634114623069763, "learning_rate": 1.2688356164383564e-05, "loss": 1.5769, "step": 2179 }, { "epoch": 0.7464155788572652, "grad_norm": 0.3014259934425354, "learning_rate": 1.267123287671233e-05, "loss": 1.5851, "step": 2180 }, { "epoch": 0.7467579713246308, "grad_norm": 0.2764853835105896, "learning_rate": 1.2654109589041097e-05, "loss": 1.5596, "step": 2181 }, { "epoch": 0.7471003637919966, "grad_norm": 0.28006643056869507, "learning_rate": 1.2636986301369863e-05, "loss": 1.6324, "step": 2182 }, { "epoch": 0.7474427562593623, "grad_norm": 0.2764187157154083, "learning_rate": 1.261986301369863e-05, "loss": 1.6466, "step": 2183 }, { "epoch": 0.747785148726728, "grad_norm": 0.28388792276382446, "learning_rate": 1.2602739726027398e-05, "loss": 1.6191, "step": 2184 }, { "epoch": 0.7481275411940937, "grad_norm": 0.2845175564289093, "learning_rate": 1.2585616438356165e-05, "loss": 1.4722, "step": 2185 }, { "epoch": 0.7484699336614594, "grad_norm": 0.26731207966804504, "learning_rate": 1.2568493150684931e-05, "loss": 1.5928, "step": 2186 }, { "epoch": 0.7488123261288252, "grad_norm": 0.2693977653980255, "learning_rate": 1.2551369863013698e-05, "loss": 1.5549, "step": 2187 }, { "epoch": 0.7491547185961909, "grad_norm": 0.2704068422317505, "learning_rate": 1.2534246575342468e-05, "loss": 1.5658, "step": 2188 }, { "epoch": 0.7494971110635567, "grad_norm": 0.27040895819664, "learning_rate": 1.2517123287671234e-05, "loss": 1.5878, "step": 2189 }, { "epoch": 0.7498395035309223, "grad_norm": 0.26792481541633606, "learning_rate": 1.25e-05, "loss": 1.5841, "step": 2190 }, { "epoch": 0.750181895998288, "grad_norm": 0.26262909173965454, "learning_rate": 1.2482876712328767e-05, "loss": 1.6076, "step": 2191 }, { "epoch": 0.7505242884656538, "grad_norm": 0.2777175307273865, "learning_rate": 1.2465753424657535e-05, "loss": 1.6005, "step": 2192 }, { "epoch": 0.7508666809330194, "grad_norm": 0.24694746732711792, "learning_rate": 1.2448630136986302e-05, "loss": 1.6413, "step": 2193 }, { "epoch": 0.7512090734003852, "grad_norm": 0.2673298418521881, "learning_rate": 1.243150684931507e-05, "loss": 1.5482, "step": 2194 }, { "epoch": 0.7515514658677509, "grad_norm": 0.2829790711402893, "learning_rate": 1.2414383561643837e-05, "loss": 1.5393, "step": 2195 }, { "epoch": 0.7518938583351167, "grad_norm": 0.26457279920578003, "learning_rate": 1.2397260273972603e-05, "loss": 1.5279, "step": 2196 }, { "epoch": 0.7522362508024824, "grad_norm": 0.25847336649894714, "learning_rate": 1.238013698630137e-05, "loss": 1.6121, "step": 2197 }, { "epoch": 0.752578643269848, "grad_norm": 0.2662985324859619, "learning_rate": 1.2363013698630138e-05, "loss": 1.5465, "step": 2198 }, { "epoch": 0.7529210357372138, "grad_norm": 0.2802906334400177, "learning_rate": 1.2345890410958904e-05, "loss": 1.6714, "step": 2199 }, { "epoch": 0.7532634282045795, "grad_norm": 0.35243329405784607, "learning_rate": 1.2328767123287671e-05, "loss": 1.5967, "step": 2200 }, { "epoch": 0.7536058206719453, "grad_norm": 0.34898531436920166, "learning_rate": 1.2311643835616439e-05, "loss": 1.5486, "step": 2201 }, { "epoch": 0.7539482131393109, "grad_norm": 0.2655904293060303, "learning_rate": 1.2294520547945206e-05, "loss": 1.5904, "step": 2202 }, { "epoch": 0.7542906056066766, "grad_norm": 0.26596152782440186, "learning_rate": 1.2277397260273974e-05, "loss": 1.5276, "step": 2203 }, { "epoch": 0.7546329980740424, "grad_norm": 0.26157116889953613, "learning_rate": 1.226027397260274e-05, "loss": 1.6568, "step": 2204 }, { "epoch": 0.754975390541408, "grad_norm": 0.2846357524394989, "learning_rate": 1.2243150684931509e-05, "loss": 1.5654, "step": 2205 }, { "epoch": 0.7553177830087738, "grad_norm": 0.26041707396507263, "learning_rate": 1.2226027397260275e-05, "loss": 1.4612, "step": 2206 }, { "epoch": 0.7556601754761395, "grad_norm": 0.2642638087272644, "learning_rate": 1.2208904109589042e-05, "loss": 1.5475, "step": 2207 }, { "epoch": 0.7560025679435053, "grad_norm": 0.2705996632575989, "learning_rate": 1.2191780821917808e-05, "loss": 1.6198, "step": 2208 }, { "epoch": 0.756344960410871, "grad_norm": 0.2819412350654602, "learning_rate": 1.2174657534246576e-05, "loss": 1.5228, "step": 2209 }, { "epoch": 0.7566873528782366, "grad_norm": 0.2668724060058594, "learning_rate": 1.2157534246575343e-05, "loss": 1.5562, "step": 2210 }, { "epoch": 0.7570297453456024, "grad_norm": 0.254512220621109, "learning_rate": 1.214041095890411e-05, "loss": 1.519, "step": 2211 }, { "epoch": 0.7573721378129681, "grad_norm": 0.307774156332016, "learning_rate": 1.2123287671232878e-05, "loss": 1.4132, "step": 2212 }, { "epoch": 0.7577145302803339, "grad_norm": 0.28151440620422363, "learning_rate": 1.2106164383561644e-05, "loss": 1.5817, "step": 2213 }, { "epoch": 0.7580569227476995, "grad_norm": 0.28801974654197693, "learning_rate": 1.2089041095890412e-05, "loss": 1.5561, "step": 2214 }, { "epoch": 0.7583993152150653, "grad_norm": 0.2683640420436859, "learning_rate": 1.2071917808219179e-05, "loss": 1.6232, "step": 2215 }, { "epoch": 0.758741707682431, "grad_norm": 0.2574321925640106, "learning_rate": 1.2054794520547945e-05, "loss": 1.5027, "step": 2216 }, { "epoch": 0.7590841001497967, "grad_norm": 0.2928440272808075, "learning_rate": 1.2037671232876712e-05, "loss": 1.5811, "step": 2217 }, { "epoch": 0.7594264926171624, "grad_norm": 0.272882878780365, "learning_rate": 1.202054794520548e-05, "loss": 1.583, "step": 2218 }, { "epoch": 0.7597688850845281, "grad_norm": 0.30926814675331116, "learning_rate": 1.2003424657534247e-05, "loss": 1.6377, "step": 2219 }, { "epoch": 0.7601112775518939, "grad_norm": 0.260731965303421, "learning_rate": 1.1986301369863013e-05, "loss": 1.5697, "step": 2220 }, { "epoch": 0.7604536700192596, "grad_norm": 0.258326917886734, "learning_rate": 1.1969178082191781e-05, "loss": 1.5645, "step": 2221 }, { "epoch": 0.7607960624866253, "grad_norm": 0.2878527343273163, "learning_rate": 1.1952054794520548e-05, "loss": 1.6522, "step": 2222 }, { "epoch": 0.761138454953991, "grad_norm": 0.2710413932800293, "learning_rate": 1.1934931506849316e-05, "loss": 1.5706, "step": 2223 }, { "epoch": 0.7614808474213567, "grad_norm": 0.26811301708221436, "learning_rate": 1.1917808219178083e-05, "loss": 1.6278, "step": 2224 }, { "epoch": 0.7618232398887225, "grad_norm": 0.2661469876766205, "learning_rate": 1.1900684931506851e-05, "loss": 1.6464, "step": 2225 }, { "epoch": 0.7621656323560881, "grad_norm": 0.2612219452857971, "learning_rate": 1.1883561643835617e-05, "loss": 1.6069, "step": 2226 }, { "epoch": 0.7625080248234539, "grad_norm": 0.2505891025066376, "learning_rate": 1.1866438356164384e-05, "loss": 1.4529, "step": 2227 }, { "epoch": 0.7628504172908196, "grad_norm": 0.28345805406570435, "learning_rate": 1.184931506849315e-05, "loss": 1.5174, "step": 2228 }, { "epoch": 0.7631928097581854, "grad_norm": 0.2553853690624237, "learning_rate": 1.1832191780821919e-05, "loss": 1.5643, "step": 2229 }, { "epoch": 0.763535202225551, "grad_norm": 0.275754451751709, "learning_rate": 1.1815068493150685e-05, "loss": 1.5438, "step": 2230 }, { "epoch": 0.7638775946929167, "grad_norm": 0.2811417579650879, "learning_rate": 1.1797945205479452e-05, "loss": 1.6654, "step": 2231 }, { "epoch": 0.7642199871602825, "grad_norm": 0.29108938574790955, "learning_rate": 1.178082191780822e-05, "loss": 1.5533, "step": 2232 }, { "epoch": 0.7645623796276482, "grad_norm": 0.26094475388526917, "learning_rate": 1.1763698630136986e-05, "loss": 1.6092, "step": 2233 }, { "epoch": 0.764904772095014, "grad_norm": 0.3150753676891327, "learning_rate": 1.1746575342465755e-05, "loss": 1.4436, "step": 2234 }, { "epoch": 0.7652471645623796, "grad_norm": 0.297423779964447, "learning_rate": 1.1729452054794521e-05, "loss": 1.5843, "step": 2235 }, { "epoch": 0.7655895570297453, "grad_norm": 0.28543540835380554, "learning_rate": 1.171232876712329e-05, "loss": 1.5012, "step": 2236 }, { "epoch": 0.7659319494971111, "grad_norm": 0.2781859040260315, "learning_rate": 1.1695205479452056e-05, "loss": 1.6769, "step": 2237 }, { "epoch": 0.7662743419644767, "grad_norm": 0.2924704849720001, "learning_rate": 1.1678082191780822e-05, "loss": 1.6324, "step": 2238 }, { "epoch": 0.7666167344318425, "grad_norm": 0.2914906144142151, "learning_rate": 1.1660958904109589e-05, "loss": 1.549, "step": 2239 }, { "epoch": 0.7669591268992082, "grad_norm": 0.27102041244506836, "learning_rate": 1.1643835616438355e-05, "loss": 1.4787, "step": 2240 }, { "epoch": 0.767301519366574, "grad_norm": 0.25915035605430603, "learning_rate": 1.1626712328767124e-05, "loss": 1.609, "step": 2241 }, { "epoch": 0.7676439118339397, "grad_norm": 0.2937549650669098, "learning_rate": 1.160958904109589e-05, "loss": 1.7123, "step": 2242 }, { "epoch": 0.7679863043013053, "grad_norm": 0.29813432693481445, "learning_rate": 1.1592465753424658e-05, "loss": 1.5153, "step": 2243 }, { "epoch": 0.7683286967686711, "grad_norm": 0.2737433910369873, "learning_rate": 1.1575342465753425e-05, "loss": 1.5862, "step": 2244 }, { "epoch": 0.7686710892360368, "grad_norm": 0.26942726969718933, "learning_rate": 1.1558219178082193e-05, "loss": 1.6575, "step": 2245 }, { "epoch": 0.7690134817034026, "grad_norm": 0.2958895266056061, "learning_rate": 1.154109589041096e-05, "loss": 1.5742, "step": 2246 }, { "epoch": 0.7693558741707682, "grad_norm": 0.2850814163684845, "learning_rate": 1.1523972602739728e-05, "loss": 1.6106, "step": 2247 }, { "epoch": 0.769698266638134, "grad_norm": 0.2746683359146118, "learning_rate": 1.1506849315068494e-05, "loss": 1.6311, "step": 2248 }, { "epoch": 0.7700406591054997, "grad_norm": 0.30133309960365295, "learning_rate": 1.1489726027397261e-05, "loss": 1.5716, "step": 2249 }, { "epoch": 0.7703830515728654, "grad_norm": 0.2928394675254822, "learning_rate": 1.1472602739726027e-05, "loss": 1.4978, "step": 2250 }, { "epoch": 0.7707254440402311, "grad_norm": 0.29959431290626526, "learning_rate": 1.1455479452054794e-05, "loss": 1.4929, "step": 2251 }, { "epoch": 0.7710678365075968, "grad_norm": 0.29881492257118225, "learning_rate": 1.1438356164383562e-05, "loss": 1.6296, "step": 2252 }, { "epoch": 0.7714102289749626, "grad_norm": 0.28122639656066895, "learning_rate": 1.1421232876712329e-05, "loss": 1.4099, "step": 2253 }, { "epoch": 0.7717526214423283, "grad_norm": 0.33073690533638, "learning_rate": 1.1404109589041097e-05, "loss": 1.5652, "step": 2254 }, { "epoch": 0.772095013909694, "grad_norm": 0.28068751096725464, "learning_rate": 1.1386986301369863e-05, "loss": 1.5639, "step": 2255 }, { "epoch": 0.7724374063770597, "grad_norm": 0.31994661688804626, "learning_rate": 1.1369863013698632e-05, "loss": 1.5475, "step": 2256 }, { "epoch": 0.7727797988444254, "grad_norm": 0.2511104643344879, "learning_rate": 1.1352739726027398e-05, "loss": 1.4946, "step": 2257 }, { "epoch": 0.7731221913117912, "grad_norm": 0.3240419626235962, "learning_rate": 1.1335616438356165e-05, "loss": 1.5648, "step": 2258 }, { "epoch": 0.7734645837791568, "grad_norm": 0.3068227469921112, "learning_rate": 1.1318493150684931e-05, "loss": 1.6041, "step": 2259 }, { "epoch": 0.7738069762465226, "grad_norm": 0.28601759672164917, "learning_rate": 1.1301369863013698e-05, "loss": 1.5446, "step": 2260 }, { "epoch": 0.7741493687138883, "grad_norm": 0.2815132439136505, "learning_rate": 1.1284246575342466e-05, "loss": 1.5045, "step": 2261 }, { "epoch": 0.774491761181254, "grad_norm": 0.28644731640815735, "learning_rate": 1.1267123287671232e-05, "loss": 1.556, "step": 2262 }, { "epoch": 0.7748341536486197, "grad_norm": 0.2792294919490814, "learning_rate": 1.125e-05, "loss": 1.558, "step": 2263 }, { "epoch": 0.7751765461159854, "grad_norm": 0.2844025492668152, "learning_rate": 1.1232876712328767e-05, "loss": 1.5441, "step": 2264 }, { "epoch": 0.7755189385833512, "grad_norm": 0.2953088879585266, "learning_rate": 1.1215753424657535e-05, "loss": 1.7271, "step": 2265 }, { "epoch": 0.7758613310507169, "grad_norm": 0.27398681640625, "learning_rate": 1.1198630136986302e-05, "loss": 1.613, "step": 2266 }, { "epoch": 0.7762037235180826, "grad_norm": 0.25879690051078796, "learning_rate": 1.118150684931507e-05, "loss": 1.4499, "step": 2267 }, { "epoch": 0.7765461159854483, "grad_norm": 0.2921315133571625, "learning_rate": 1.1164383561643837e-05, "loss": 1.5415, "step": 2268 }, { "epoch": 0.776888508452814, "grad_norm": 0.2870663106441498, "learning_rate": 1.1147260273972603e-05, "loss": 1.5657, "step": 2269 }, { "epoch": 0.7772309009201798, "grad_norm": 0.2566692531108856, "learning_rate": 1.113013698630137e-05, "loss": 1.5432, "step": 2270 }, { "epoch": 0.7775732933875454, "grad_norm": 1.2904773950576782, "learning_rate": 1.1113013698630136e-05, "loss": 1.544, "step": 2271 }, { "epoch": 0.7779156858549112, "grad_norm": 0.2672540545463562, "learning_rate": 1.1095890410958904e-05, "loss": 1.5443, "step": 2272 }, { "epoch": 0.7782580783222769, "grad_norm": 0.29232257604599, "learning_rate": 1.1078767123287671e-05, "loss": 1.5243, "step": 2273 }, { "epoch": 0.7786004707896427, "grad_norm": 0.31850412487983704, "learning_rate": 1.106164383561644e-05, "loss": 1.6401, "step": 2274 }, { "epoch": 0.7789428632570083, "grad_norm": 0.2849978506565094, "learning_rate": 1.1044520547945206e-05, "loss": 1.6851, "step": 2275 }, { "epoch": 0.779285255724374, "grad_norm": 0.2749882936477661, "learning_rate": 1.1027397260273974e-05, "loss": 1.6051, "step": 2276 }, { "epoch": 0.7796276481917398, "grad_norm": 0.2665223777294159, "learning_rate": 1.101027397260274e-05, "loss": 1.5856, "step": 2277 }, { "epoch": 0.7799700406591055, "grad_norm": 0.2724306285381317, "learning_rate": 1.0993150684931509e-05, "loss": 1.5452, "step": 2278 }, { "epoch": 0.7803124331264712, "grad_norm": 0.28408876061439514, "learning_rate": 1.0976027397260275e-05, "loss": 1.6143, "step": 2279 }, { "epoch": 0.7806548255938369, "grad_norm": 0.26537930965423584, "learning_rate": 1.0958904109589042e-05, "loss": 1.5516, "step": 2280 }, { "epoch": 0.7809972180612027, "grad_norm": 0.29100069403648376, "learning_rate": 1.0941780821917808e-05, "loss": 1.5915, "step": 2281 }, { "epoch": 0.7813396105285684, "grad_norm": 0.28603678941726685, "learning_rate": 1.0924657534246575e-05, "loss": 1.5535, "step": 2282 }, { "epoch": 0.781682002995934, "grad_norm": 0.266225129365921, "learning_rate": 1.0907534246575343e-05, "loss": 1.5161, "step": 2283 }, { "epoch": 0.7820243954632998, "grad_norm": 0.2684130370616913, "learning_rate": 1.089041095890411e-05, "loss": 1.5502, "step": 2284 }, { "epoch": 0.7823667879306655, "grad_norm": 0.3128693401813507, "learning_rate": 1.0873287671232878e-05, "loss": 1.5294, "step": 2285 }, { "epoch": 0.7827091803980313, "grad_norm": 0.29598307609558105, "learning_rate": 1.0856164383561644e-05, "loss": 1.485, "step": 2286 }, { "epoch": 0.783051572865397, "grad_norm": 0.25441867113113403, "learning_rate": 1.0839041095890412e-05, "loss": 1.5212, "step": 2287 }, { "epoch": 0.7833939653327627, "grad_norm": 0.26819828152656555, "learning_rate": 1.0821917808219179e-05, "loss": 1.6788, "step": 2288 }, { "epoch": 0.7837363578001284, "grad_norm": 0.28530386090278625, "learning_rate": 1.0804794520547946e-05, "loss": 1.4847, "step": 2289 }, { "epoch": 0.7840787502674941, "grad_norm": 0.36672860383987427, "learning_rate": 1.0787671232876714e-05, "loss": 1.5326, "step": 2290 }, { "epoch": 0.7844211427348599, "grad_norm": 0.27508091926574707, "learning_rate": 1.077054794520548e-05, "loss": 1.5647, "step": 2291 }, { "epoch": 0.7847635352022255, "grad_norm": 0.2904295027256012, "learning_rate": 1.0753424657534247e-05, "loss": 1.5724, "step": 2292 }, { "epoch": 0.7851059276695913, "grad_norm": 0.2883099913597107, "learning_rate": 1.0736301369863013e-05, "loss": 1.5589, "step": 2293 }, { "epoch": 0.785448320136957, "grad_norm": 0.2843775749206543, "learning_rate": 1.0719178082191782e-05, "loss": 1.487, "step": 2294 }, { "epoch": 0.7857907126043226, "grad_norm": 0.27028462290763855, "learning_rate": 1.0702054794520548e-05, "loss": 1.6138, "step": 2295 }, { "epoch": 0.7861331050716884, "grad_norm": 0.28762489557266235, "learning_rate": 1.0684931506849316e-05, "loss": 1.57, "step": 2296 }, { "epoch": 0.7864754975390541, "grad_norm": 0.2919502854347229, "learning_rate": 1.0667808219178083e-05, "loss": 1.5633, "step": 2297 }, { "epoch": 0.7868178900064199, "grad_norm": 0.2903449237346649, "learning_rate": 1.0650684931506851e-05, "loss": 1.5889, "step": 2298 }, { "epoch": 0.7871602824737856, "grad_norm": 0.26056796312332153, "learning_rate": 1.0633561643835618e-05, "loss": 1.5479, "step": 2299 }, { "epoch": 0.7875026749411513, "grad_norm": 0.2711414396762848, "learning_rate": 1.0616438356164384e-05, "loss": 1.6053, "step": 2300 }, { "epoch": 0.787845067408517, "grad_norm": 0.2959461212158203, "learning_rate": 1.059931506849315e-05, "loss": 1.6616, "step": 2301 }, { "epoch": 0.7881874598758827, "grad_norm": 0.26645803451538086, "learning_rate": 1.0582191780821917e-05, "loss": 1.4908, "step": 2302 }, { "epoch": 0.7885298523432485, "grad_norm": 0.2957850992679596, "learning_rate": 1.0565068493150685e-05, "loss": 1.5887, "step": 2303 }, { "epoch": 0.7888722448106141, "grad_norm": 0.2801496982574463, "learning_rate": 1.0547945205479452e-05, "loss": 1.6666, "step": 2304 }, { "epoch": 0.7892146372779799, "grad_norm": 0.2828311026096344, "learning_rate": 1.053082191780822e-05, "loss": 1.575, "step": 2305 }, { "epoch": 0.7895570297453456, "grad_norm": 0.2684567868709564, "learning_rate": 1.0513698630136987e-05, "loss": 1.5228, "step": 2306 }, { "epoch": 0.7898994222127114, "grad_norm": 0.26487264037132263, "learning_rate": 1.0496575342465755e-05, "loss": 1.6762, "step": 2307 }, { "epoch": 0.790241814680077, "grad_norm": 0.28844279050827026, "learning_rate": 1.0479452054794521e-05, "loss": 1.5577, "step": 2308 }, { "epoch": 0.7905842071474427, "grad_norm": 0.2722329795360565, "learning_rate": 1.0462328767123288e-05, "loss": 1.6079, "step": 2309 }, { "epoch": 0.7909265996148085, "grad_norm": 0.30601733922958374, "learning_rate": 1.0445205479452056e-05, "loss": 1.7172, "step": 2310 }, { "epoch": 0.7912689920821742, "grad_norm": 0.24896085262298584, "learning_rate": 1.0428082191780823e-05, "loss": 1.5596, "step": 2311 }, { "epoch": 0.7916113845495399, "grad_norm": 0.27874141931533813, "learning_rate": 1.0410958904109589e-05, "loss": 1.4945, "step": 2312 }, { "epoch": 0.7919537770169056, "grad_norm": 0.4159553647041321, "learning_rate": 1.0393835616438356e-05, "loss": 1.5399, "step": 2313 }, { "epoch": 0.7922961694842714, "grad_norm": 0.3194500207901001, "learning_rate": 1.0376712328767124e-05, "loss": 1.5361, "step": 2314 }, { "epoch": 0.7926385619516371, "grad_norm": 0.2855187952518463, "learning_rate": 1.035958904109589e-05, "loss": 1.6413, "step": 2315 }, { "epoch": 0.7929809544190027, "grad_norm": 0.27990785241127014, "learning_rate": 1.0342465753424659e-05, "loss": 1.5439, "step": 2316 }, { "epoch": 0.7933233468863685, "grad_norm": 0.28265830874443054, "learning_rate": 1.0325342465753425e-05, "loss": 1.4804, "step": 2317 }, { "epoch": 0.7936657393537342, "grad_norm": 0.31065282225608826, "learning_rate": 1.0308219178082193e-05, "loss": 1.5813, "step": 2318 }, { "epoch": 0.7940081318211, "grad_norm": 0.27228790521621704, "learning_rate": 1.029109589041096e-05, "loss": 1.5721, "step": 2319 }, { "epoch": 0.7943505242884656, "grad_norm": 0.27703380584716797, "learning_rate": 1.0273972602739726e-05, "loss": 1.4533, "step": 2320 }, { "epoch": 0.7946929167558314, "grad_norm": 0.30581364035606384, "learning_rate": 1.0256849315068495e-05, "loss": 1.6383, "step": 2321 }, { "epoch": 0.7950353092231971, "grad_norm": 0.2784107029438019, "learning_rate": 1.0239726027397261e-05, "loss": 1.5334, "step": 2322 }, { "epoch": 0.7953777016905628, "grad_norm": 0.26318255066871643, "learning_rate": 1.0222602739726028e-05, "loss": 1.5138, "step": 2323 }, { "epoch": 0.7957200941579285, "grad_norm": 0.2575206458568573, "learning_rate": 1.0205479452054794e-05, "loss": 1.4845, "step": 2324 }, { "epoch": 0.7960624866252942, "grad_norm": 0.3048146069049835, "learning_rate": 1.0188356164383562e-05, "loss": 1.5676, "step": 2325 }, { "epoch": 0.79640487909266, "grad_norm": 0.2819085121154785, "learning_rate": 1.0171232876712329e-05, "loss": 1.549, "step": 2326 }, { "epoch": 0.7967472715600257, "grad_norm": 0.2893751561641693, "learning_rate": 1.0154109589041097e-05, "loss": 1.6097, "step": 2327 }, { "epoch": 0.7970896640273913, "grad_norm": 0.28748729825019836, "learning_rate": 1.0136986301369864e-05, "loss": 1.5918, "step": 2328 }, { "epoch": 0.7974320564947571, "grad_norm": 0.28684818744659424, "learning_rate": 1.011986301369863e-05, "loss": 1.5151, "step": 2329 }, { "epoch": 0.7977744489621228, "grad_norm": 0.2853235900402069, "learning_rate": 1.0102739726027398e-05, "loss": 1.5391, "step": 2330 }, { "epoch": 0.7981168414294886, "grad_norm": 0.2546221911907196, "learning_rate": 1.0085616438356165e-05, "loss": 1.5437, "step": 2331 }, { "epoch": 0.7984592338968542, "grad_norm": 0.28969746828079224, "learning_rate": 1.0068493150684933e-05, "loss": 1.5509, "step": 2332 }, { "epoch": 0.79880162636422, "grad_norm": 0.271032452583313, "learning_rate": 1.00513698630137e-05, "loss": 1.5211, "step": 2333 }, { "epoch": 0.7991440188315857, "grad_norm": 0.2744053602218628, "learning_rate": 1.0034246575342466e-05, "loss": 1.5825, "step": 2334 }, { "epoch": 0.7994864112989514, "grad_norm": 0.28565624356269836, "learning_rate": 1.0017123287671233e-05, "loss": 1.5078, "step": 2335 }, { "epoch": 0.7998288037663172, "grad_norm": 0.2721448242664337, "learning_rate": 1e-05, "loss": 1.4899, "step": 2336 }, { "epoch": 0.8001711962336828, "grad_norm": 0.29869937896728516, "learning_rate": 9.982876712328767e-06, "loss": 1.5957, "step": 2337 }, { "epoch": 0.8005135887010486, "grad_norm": 0.2846909165382385, "learning_rate": 9.965753424657534e-06, "loss": 1.5941, "step": 2338 }, { "epoch": 0.8008559811684143, "grad_norm": 0.2869529128074646, "learning_rate": 9.948630136986302e-06, "loss": 1.5946, "step": 2339 }, { "epoch": 0.8011983736357801, "grad_norm": 0.27795442938804626, "learning_rate": 9.931506849315069e-06, "loss": 1.5323, "step": 2340 }, { "epoch": 0.8015407661031457, "grad_norm": 0.2625260055065155, "learning_rate": 9.914383561643837e-06, "loss": 1.6115, "step": 2341 }, { "epoch": 0.8018831585705114, "grad_norm": 0.26964879035949707, "learning_rate": 9.897260273972603e-06, "loss": 1.6464, "step": 2342 }, { "epoch": 0.8022255510378772, "grad_norm": 0.2583237290382385, "learning_rate": 9.88013698630137e-06, "loss": 1.6149, "step": 2343 }, { "epoch": 0.8025679435052429, "grad_norm": 0.28636133670806885, "learning_rate": 9.863013698630136e-06, "loss": 1.5297, "step": 2344 }, { "epoch": 0.8029103359726086, "grad_norm": 0.30426615476608276, "learning_rate": 9.845890410958905e-06, "loss": 1.4051, "step": 2345 }, { "epoch": 0.8032527284399743, "grad_norm": 0.275601327419281, "learning_rate": 9.828767123287671e-06, "loss": 1.5917, "step": 2346 }, { "epoch": 0.8035951209073401, "grad_norm": 0.2562064230442047, "learning_rate": 9.81164383561644e-06, "loss": 1.5682, "step": 2347 }, { "epoch": 0.8039375133747058, "grad_norm": 0.2785716950893402, "learning_rate": 9.794520547945206e-06, "loss": 1.6073, "step": 2348 }, { "epoch": 0.8042799058420714, "grad_norm": 0.3015633523464203, "learning_rate": 9.777397260273972e-06, "loss": 1.5945, "step": 2349 }, { "epoch": 0.8046222983094372, "grad_norm": 0.27351483702659607, "learning_rate": 9.76027397260274e-06, "loss": 1.4985, "step": 2350 }, { "epoch": 0.8049646907768029, "grad_norm": 0.2848553955554962, "learning_rate": 9.743150684931507e-06, "loss": 1.5715, "step": 2351 }, { "epoch": 0.8053070832441687, "grad_norm": 0.3151780664920807, "learning_rate": 9.726027397260275e-06, "loss": 1.6766, "step": 2352 }, { "epoch": 0.8056494757115343, "grad_norm": 0.2863432466983795, "learning_rate": 9.708904109589042e-06, "loss": 1.6099, "step": 2353 }, { "epoch": 0.8059918681789001, "grad_norm": 0.26185664534568787, "learning_rate": 9.691780821917808e-06, "loss": 1.5523, "step": 2354 }, { "epoch": 0.8063342606462658, "grad_norm": 0.3146619498729706, "learning_rate": 9.674657534246575e-06, "loss": 1.5813, "step": 2355 }, { "epoch": 0.8066766531136315, "grad_norm": 0.2830266058444977, "learning_rate": 9.657534246575343e-06, "loss": 1.4793, "step": 2356 }, { "epoch": 0.8070190455809972, "grad_norm": 0.2837023138999939, "learning_rate": 9.64041095890411e-06, "loss": 1.5831, "step": 2357 }, { "epoch": 0.8073614380483629, "grad_norm": 0.26653996109962463, "learning_rate": 9.623287671232876e-06, "loss": 1.5531, "step": 2358 }, { "epoch": 0.8077038305157287, "grad_norm": 0.3422829210758209, "learning_rate": 9.606164383561644e-06, "loss": 1.5169, "step": 2359 }, { "epoch": 0.8080462229830944, "grad_norm": 0.2702852785587311, "learning_rate": 9.589041095890411e-06, "loss": 1.6182, "step": 2360 }, { "epoch": 0.80838861545046, "grad_norm": 0.2932620942592621, "learning_rate": 9.571917808219179e-06, "loss": 1.527, "step": 2361 }, { "epoch": 0.8087310079178258, "grad_norm": 0.27487534284591675, "learning_rate": 9.554794520547946e-06, "loss": 1.4401, "step": 2362 }, { "epoch": 0.8090734003851915, "grad_norm": 0.27812033891677856, "learning_rate": 9.537671232876714e-06, "loss": 1.6128, "step": 2363 }, { "epoch": 0.8094157928525573, "grad_norm": 0.29173362255096436, "learning_rate": 9.52054794520548e-06, "loss": 1.6174, "step": 2364 }, { "epoch": 0.8097581853199229, "grad_norm": 0.3047679662704468, "learning_rate": 9.503424657534247e-06, "loss": 1.7311, "step": 2365 }, { "epoch": 0.8101005777872887, "grad_norm": 0.27267900109291077, "learning_rate": 9.486301369863013e-06, "loss": 1.5692, "step": 2366 }, { "epoch": 0.8104429702546544, "grad_norm": 0.26358842849731445, "learning_rate": 9.469178082191782e-06, "loss": 1.5378, "step": 2367 }, { "epoch": 0.8107853627220201, "grad_norm": 0.26083850860595703, "learning_rate": 9.452054794520548e-06, "loss": 1.6239, "step": 2368 }, { "epoch": 0.8111277551893858, "grad_norm": 0.3082348108291626, "learning_rate": 9.434931506849315e-06, "loss": 1.4275, "step": 2369 }, { "epoch": 0.8114701476567515, "grad_norm": 0.3154885768890381, "learning_rate": 9.417808219178083e-06, "loss": 1.6792, "step": 2370 }, { "epoch": 0.8118125401241173, "grad_norm": 0.2798673212528229, "learning_rate": 9.40068493150685e-06, "loss": 1.6118, "step": 2371 }, { "epoch": 0.812154932591483, "grad_norm": 0.2822222411632538, "learning_rate": 9.383561643835618e-06, "loss": 1.6418, "step": 2372 }, { "epoch": 0.8124973250588488, "grad_norm": 0.2823752760887146, "learning_rate": 9.366438356164384e-06, "loss": 1.6003, "step": 2373 }, { "epoch": 0.8128397175262144, "grad_norm": 0.2938728630542755, "learning_rate": 9.349315068493152e-06, "loss": 1.6055, "step": 2374 }, { "epoch": 0.8131821099935801, "grad_norm": 0.3054082691669464, "learning_rate": 9.332191780821919e-06, "loss": 1.5651, "step": 2375 }, { "epoch": 0.8135245024609459, "grad_norm": 0.2757856547832489, "learning_rate": 9.315068493150685e-06, "loss": 1.5843, "step": 2376 }, { "epoch": 0.8138668949283115, "grad_norm": 0.2874569296836853, "learning_rate": 9.297945205479452e-06, "loss": 1.5215, "step": 2377 }, { "epoch": 0.8142092873956773, "grad_norm": 0.26916033029556274, "learning_rate": 9.280821917808218e-06, "loss": 1.6331, "step": 2378 }, { "epoch": 0.814551679863043, "grad_norm": 0.2841404676437378, "learning_rate": 9.263698630136987e-06, "loss": 1.5719, "step": 2379 }, { "epoch": 0.8148940723304088, "grad_norm": 0.2634182870388031, "learning_rate": 9.246575342465753e-06, "loss": 1.5325, "step": 2380 }, { "epoch": 0.8152364647977745, "grad_norm": 0.27671459317207336, "learning_rate": 9.229452054794521e-06, "loss": 1.4886, "step": 2381 }, { "epoch": 0.8155788572651401, "grad_norm": 0.2711448669433594, "learning_rate": 9.212328767123288e-06, "loss": 1.5702, "step": 2382 }, { "epoch": 0.8159212497325059, "grad_norm": 0.29414102435112, "learning_rate": 9.195205479452056e-06, "loss": 1.5671, "step": 2383 }, { "epoch": 0.8162636421998716, "grad_norm": 0.2957276701927185, "learning_rate": 9.178082191780823e-06, "loss": 1.5374, "step": 2384 }, { "epoch": 0.8166060346672374, "grad_norm": 0.289636492729187, "learning_rate": 9.16095890410959e-06, "loss": 1.6618, "step": 2385 }, { "epoch": 0.816948427134603, "grad_norm": 0.351216197013855, "learning_rate": 9.143835616438356e-06, "loss": 1.4344, "step": 2386 }, { "epoch": 0.8172908196019688, "grad_norm": 0.29865968227386475, "learning_rate": 9.126712328767124e-06, "loss": 1.6614, "step": 2387 }, { "epoch": 0.8176332120693345, "grad_norm": 0.2605932354927063, "learning_rate": 9.10958904109589e-06, "loss": 1.5321, "step": 2388 }, { "epoch": 0.8179756045367002, "grad_norm": 0.26826831698417664, "learning_rate": 9.092465753424657e-06, "loss": 1.5912, "step": 2389 }, { "epoch": 0.8183179970040659, "grad_norm": 0.31067824363708496, "learning_rate": 9.075342465753425e-06, "loss": 1.498, "step": 2390 }, { "epoch": 0.8186603894714316, "grad_norm": 0.2796376645565033, "learning_rate": 9.058219178082192e-06, "loss": 1.5673, "step": 2391 }, { "epoch": 0.8190027819387974, "grad_norm": 0.27488481998443604, "learning_rate": 9.04109589041096e-06, "loss": 1.5432, "step": 2392 }, { "epoch": 0.8193451744061631, "grad_norm": 0.28566160798072815, "learning_rate": 9.023972602739726e-06, "loss": 1.4827, "step": 2393 }, { "epoch": 0.8196875668735287, "grad_norm": 0.264821857213974, "learning_rate": 9.006849315068495e-06, "loss": 1.6155, "step": 2394 }, { "epoch": 0.8200299593408945, "grad_norm": 0.2831520438194275, "learning_rate": 8.989726027397261e-06, "loss": 1.5302, "step": 2395 }, { "epoch": 0.8203723518082602, "grad_norm": 0.290748655796051, "learning_rate": 8.972602739726028e-06, "loss": 1.5005, "step": 2396 }, { "epoch": 0.820714744275626, "grad_norm": 0.28839871287345886, "learning_rate": 8.955479452054794e-06, "loss": 1.4883, "step": 2397 }, { "epoch": 0.8210571367429916, "grad_norm": 0.2712571322917938, "learning_rate": 8.93835616438356e-06, "loss": 1.5388, "step": 2398 }, { "epoch": 0.8213995292103574, "grad_norm": 0.2515392601490021, "learning_rate": 8.921232876712329e-06, "loss": 1.5711, "step": 2399 }, { "epoch": 0.8217419216777231, "grad_norm": 0.27959004044532776, "learning_rate": 8.904109589041095e-06, "loss": 1.5347, "step": 2400 }, { "epoch": 0.8220843141450888, "grad_norm": 0.3182491064071655, "learning_rate": 8.886986301369864e-06, "loss": 1.4405, "step": 2401 }, { "epoch": 0.8224267066124545, "grad_norm": 0.3001403212547302, "learning_rate": 8.86986301369863e-06, "loss": 1.5025, "step": 2402 }, { "epoch": 0.8227690990798202, "grad_norm": 0.3044440746307373, "learning_rate": 8.852739726027398e-06, "loss": 1.5946, "step": 2403 }, { "epoch": 0.823111491547186, "grad_norm": 0.30702173709869385, "learning_rate": 8.835616438356165e-06, "loss": 1.6322, "step": 2404 }, { "epoch": 0.8234538840145517, "grad_norm": 0.29559198021888733, "learning_rate": 8.818493150684933e-06, "loss": 1.538, "step": 2405 }, { "epoch": 0.8237962764819174, "grad_norm": 0.3124561011791229, "learning_rate": 8.8013698630137e-06, "loss": 1.6917, "step": 2406 }, { "epoch": 0.8241386689492831, "grad_norm": 0.27867391705513, "learning_rate": 8.784246575342466e-06, "loss": 1.5942, "step": 2407 }, { "epoch": 0.8244810614166488, "grad_norm": 0.3060586452484131, "learning_rate": 8.767123287671233e-06, "loss": 1.6681, "step": 2408 }, { "epoch": 0.8248234538840146, "grad_norm": 0.286978542804718, "learning_rate": 8.75e-06, "loss": 1.5675, "step": 2409 }, { "epoch": 0.8251658463513802, "grad_norm": 0.2721274495124817, "learning_rate": 8.732876712328767e-06, "loss": 1.6287, "step": 2410 }, { "epoch": 0.825508238818746, "grad_norm": 0.2867860198020935, "learning_rate": 8.715753424657534e-06, "loss": 1.5002, "step": 2411 }, { "epoch": 0.8258506312861117, "grad_norm": 0.289396733045578, "learning_rate": 8.698630136986302e-06, "loss": 1.6798, "step": 2412 }, { "epoch": 0.8261930237534775, "grad_norm": 0.3037697970867157, "learning_rate": 8.681506849315069e-06, "loss": 1.5574, "step": 2413 }, { "epoch": 0.8265354162208431, "grad_norm": 0.27576184272766113, "learning_rate": 8.664383561643837e-06, "loss": 1.4597, "step": 2414 }, { "epoch": 0.8268778086882088, "grad_norm": 0.2842947840690613, "learning_rate": 8.647260273972603e-06, "loss": 1.5377, "step": 2415 }, { "epoch": 0.8272202011555746, "grad_norm": 0.3064456880092621, "learning_rate": 8.630136986301372e-06, "loss": 1.634, "step": 2416 }, { "epoch": 0.8275625936229403, "grad_norm": 0.2985696494579315, "learning_rate": 8.613013698630138e-06, "loss": 1.5671, "step": 2417 }, { "epoch": 0.827904986090306, "grad_norm": 0.28877758979797363, "learning_rate": 8.595890410958905e-06, "loss": 1.5646, "step": 2418 }, { "epoch": 0.8282473785576717, "grad_norm": 0.30366963148117065, "learning_rate": 8.578767123287671e-06, "loss": 1.5925, "step": 2419 }, { "epoch": 0.8285897710250375, "grad_norm": 0.28831812739372253, "learning_rate": 8.561643835616438e-06, "loss": 1.6051, "step": 2420 }, { "epoch": 0.8289321634924032, "grad_norm": 0.293546587228775, "learning_rate": 8.544520547945206e-06, "loss": 1.5977, "step": 2421 }, { "epoch": 0.8292745559597688, "grad_norm": 0.2949022948741913, "learning_rate": 8.527397260273972e-06, "loss": 1.6096, "step": 2422 }, { "epoch": 0.8296169484271346, "grad_norm": 0.29282113909721375, "learning_rate": 8.51027397260274e-06, "loss": 1.6388, "step": 2423 }, { "epoch": 0.8299593408945003, "grad_norm": 0.27574530243873596, "learning_rate": 8.493150684931507e-06, "loss": 1.4673, "step": 2424 }, { "epoch": 0.8303017333618661, "grad_norm": 0.291195273399353, "learning_rate": 8.476027397260275e-06, "loss": 1.4798, "step": 2425 }, { "epoch": 0.8306441258292317, "grad_norm": 0.2600991427898407, "learning_rate": 8.458904109589042e-06, "loss": 1.5867, "step": 2426 }, { "epoch": 0.8309865182965974, "grad_norm": 0.2928294837474823, "learning_rate": 8.441780821917808e-06, "loss": 1.6016, "step": 2427 }, { "epoch": 0.8313289107639632, "grad_norm": 0.3020372986793518, "learning_rate": 8.424657534246575e-06, "loss": 1.5498, "step": 2428 }, { "epoch": 0.8316713032313289, "grad_norm": 0.26316481828689575, "learning_rate": 8.407534246575342e-06, "loss": 1.5645, "step": 2429 }, { "epoch": 0.8320136956986947, "grad_norm": 0.28677111864089966, "learning_rate": 8.39041095890411e-06, "loss": 1.5882, "step": 2430 }, { "epoch": 0.8323560881660603, "grad_norm": 0.2662751376628876, "learning_rate": 8.373287671232876e-06, "loss": 1.6096, "step": 2431 }, { "epoch": 0.8326984806334261, "grad_norm": 0.2724243998527527, "learning_rate": 8.356164383561644e-06, "loss": 1.5692, "step": 2432 }, { "epoch": 0.8330408731007918, "grad_norm": 0.2773453891277313, "learning_rate": 8.339041095890411e-06, "loss": 1.5304, "step": 2433 }, { "epoch": 0.8333832655681574, "grad_norm": 0.2810799777507782, "learning_rate": 8.32191780821918e-06, "loss": 1.6112, "step": 2434 }, { "epoch": 0.8337256580355232, "grad_norm": 0.27879098057746887, "learning_rate": 8.304794520547946e-06, "loss": 1.5966, "step": 2435 }, { "epoch": 0.8340680505028889, "grad_norm": 0.27627819776535034, "learning_rate": 8.287671232876714e-06, "loss": 1.5957, "step": 2436 }, { "epoch": 0.8344104429702547, "grad_norm": 0.32748961448669434, "learning_rate": 8.27054794520548e-06, "loss": 1.5752, "step": 2437 }, { "epoch": 0.8347528354376204, "grad_norm": 0.2728188931941986, "learning_rate": 8.253424657534247e-06, "loss": 1.5351, "step": 2438 }, { "epoch": 0.8350952279049861, "grad_norm": 0.2763625383377075, "learning_rate": 8.236301369863014e-06, "loss": 1.4311, "step": 2439 }, { "epoch": 0.8354376203723518, "grad_norm": 0.3152254819869995, "learning_rate": 8.21917808219178e-06, "loss": 1.6625, "step": 2440 }, { "epoch": 0.8357800128397175, "grad_norm": 0.2904893457889557, "learning_rate": 8.202054794520548e-06, "loss": 1.6098, "step": 2441 }, { "epoch": 0.8361224053070833, "grad_norm": 0.28775984048843384, "learning_rate": 8.184931506849315e-06, "loss": 1.5708, "step": 2442 }, { "epoch": 0.8364647977744489, "grad_norm": 0.2990763187408447, "learning_rate": 8.167808219178083e-06, "loss": 1.6108, "step": 2443 }, { "epoch": 0.8368071902418147, "grad_norm": 0.29336386919021606, "learning_rate": 8.15068493150685e-06, "loss": 1.5636, "step": 2444 }, { "epoch": 0.8371495827091804, "grad_norm": 0.2862493097782135, "learning_rate": 8.133561643835618e-06, "loss": 1.6019, "step": 2445 }, { "epoch": 0.8374919751765462, "grad_norm": 0.2965867519378662, "learning_rate": 8.116438356164384e-06, "loss": 1.5872, "step": 2446 }, { "epoch": 0.8378343676439118, "grad_norm": 0.2777990400791168, "learning_rate": 8.09931506849315e-06, "loss": 1.5712, "step": 2447 }, { "epoch": 0.8381767601112775, "grad_norm": 0.2791067957878113, "learning_rate": 8.082191780821919e-06, "loss": 1.5946, "step": 2448 }, { "epoch": 0.8385191525786433, "grad_norm": 0.2936023473739624, "learning_rate": 8.065068493150686e-06, "loss": 1.5986, "step": 2449 }, { "epoch": 0.838861545046009, "grad_norm": 0.2689448595046997, "learning_rate": 8.047945205479452e-06, "loss": 1.5973, "step": 2450 }, { "epoch": 0.8392039375133747, "grad_norm": 0.27585646510124207, "learning_rate": 8.030821917808219e-06, "loss": 1.5237, "step": 2451 }, { "epoch": 0.8395463299807404, "grad_norm": 0.28693535923957825, "learning_rate": 8.013698630136987e-06, "loss": 1.6636, "step": 2452 }, { "epoch": 0.8398887224481062, "grad_norm": 0.33112186193466187, "learning_rate": 7.996575342465753e-06, "loss": 1.6773, "step": 2453 }, { "epoch": 0.8402311149154719, "grad_norm": 0.27809420228004456, "learning_rate": 7.979452054794521e-06, "loss": 1.5854, "step": 2454 }, { "epoch": 0.8405735073828375, "grad_norm": 0.27660056948661804, "learning_rate": 7.962328767123288e-06, "loss": 1.6675, "step": 2455 }, { "epoch": 0.8409158998502033, "grad_norm": 0.27967968583106995, "learning_rate": 7.945205479452056e-06, "loss": 1.6171, "step": 2456 }, { "epoch": 0.841258292317569, "grad_norm": 0.2780072093009949, "learning_rate": 7.928082191780823e-06, "loss": 1.653, "step": 2457 }, { "epoch": 0.8416006847849348, "grad_norm": 0.2829362452030182, "learning_rate": 7.91095890410959e-06, "loss": 1.5904, "step": 2458 }, { "epoch": 0.8419430772523004, "grad_norm": 0.27772337198257446, "learning_rate": 7.893835616438357e-06, "loss": 1.6029, "step": 2459 }, { "epoch": 0.8422854697196661, "grad_norm": 0.2867944836616516, "learning_rate": 7.876712328767124e-06, "loss": 1.6424, "step": 2460 }, { "epoch": 0.8426278621870319, "grad_norm": 0.28686362504959106, "learning_rate": 7.85958904109589e-06, "loss": 1.5822, "step": 2461 }, { "epoch": 0.8429702546543976, "grad_norm": 0.3029434084892273, "learning_rate": 7.842465753424657e-06, "loss": 1.5868, "step": 2462 }, { "epoch": 0.8433126471217633, "grad_norm": 0.2625938951969147, "learning_rate": 7.825342465753425e-06, "loss": 1.6639, "step": 2463 }, { "epoch": 0.843655039589129, "grad_norm": 0.2748441994190216, "learning_rate": 7.808219178082192e-06, "loss": 1.6471, "step": 2464 }, { "epoch": 0.8439974320564948, "grad_norm": 0.28573843836784363, "learning_rate": 7.79109589041096e-06, "loss": 1.6759, "step": 2465 }, { "epoch": 0.8443398245238605, "grad_norm": 0.30022990703582764, "learning_rate": 7.773972602739727e-06, "loss": 1.6376, "step": 2466 }, { "epoch": 0.8446822169912261, "grad_norm": 0.2757962942123413, "learning_rate": 7.756849315068493e-06, "loss": 1.5316, "step": 2467 }, { "epoch": 0.8450246094585919, "grad_norm": 0.2642601728439331, "learning_rate": 7.739726027397261e-06, "loss": 1.4675, "step": 2468 }, { "epoch": 0.8453670019259576, "grad_norm": 0.28797611594200134, "learning_rate": 7.722602739726028e-06, "loss": 1.5631, "step": 2469 }, { "epoch": 0.8457093943933234, "grad_norm": 0.272390216588974, "learning_rate": 7.705479452054794e-06, "loss": 1.6344, "step": 2470 }, { "epoch": 0.846051786860689, "grad_norm": 0.26167386770248413, "learning_rate": 7.68835616438356e-06, "loss": 1.6132, "step": 2471 }, { "epoch": 0.8463941793280548, "grad_norm": 0.26945868134498596, "learning_rate": 7.671232876712329e-06, "loss": 1.6737, "step": 2472 }, { "epoch": 0.8467365717954205, "grad_norm": 0.2541358768939972, "learning_rate": 7.654109589041096e-06, "loss": 1.489, "step": 2473 }, { "epoch": 0.8470789642627862, "grad_norm": 0.27593350410461426, "learning_rate": 7.636986301369864e-06, "loss": 1.6174, "step": 2474 }, { "epoch": 0.847421356730152, "grad_norm": 0.26900267601013184, "learning_rate": 7.61986301369863e-06, "loss": 1.5243, "step": 2475 }, { "epoch": 0.8477637491975176, "grad_norm": 0.28284138441085815, "learning_rate": 7.6027397260273985e-06, "loss": 1.6601, "step": 2476 }, { "epoch": 0.8481061416648834, "grad_norm": 0.27448952198028564, "learning_rate": 7.585616438356165e-06, "loss": 1.6135, "step": 2477 }, { "epoch": 0.8484485341322491, "grad_norm": 0.2895243465900421, "learning_rate": 7.5684931506849316e-06, "loss": 1.5971, "step": 2478 }, { "epoch": 0.8487909265996149, "grad_norm": 0.2990325093269348, "learning_rate": 7.551369863013699e-06, "loss": 1.6597, "step": 2479 }, { "epoch": 0.8491333190669805, "grad_norm": 0.30666470527648926, "learning_rate": 7.5342465753424655e-06, "loss": 1.5451, "step": 2480 }, { "epoch": 0.8494757115343462, "grad_norm": 0.29587873816490173, "learning_rate": 7.517123287671234e-06, "loss": 1.4999, "step": 2481 }, { "epoch": 0.849818104001712, "grad_norm": 0.2831787168979645, "learning_rate": 7.5e-06, "loss": 1.652, "step": 2482 }, { "epoch": 0.8501604964690777, "grad_norm": 0.2791522145271301, "learning_rate": 7.4828767123287676e-06, "loss": 1.6434, "step": 2483 }, { "epoch": 0.8505028889364434, "grad_norm": 0.2762070596218109, "learning_rate": 7.465753424657534e-06, "loss": 1.5605, "step": 2484 }, { "epoch": 0.8508452814038091, "grad_norm": 0.2852357029914856, "learning_rate": 7.448630136986302e-06, "loss": 1.5522, "step": 2485 }, { "epoch": 0.8511876738711749, "grad_norm": 0.30146729946136475, "learning_rate": 7.431506849315069e-06, "loss": 1.5959, "step": 2486 }, { "epoch": 0.8515300663385406, "grad_norm": 0.2854650616645813, "learning_rate": 7.414383561643835e-06, "loss": 1.4635, "step": 2487 }, { "epoch": 0.8518724588059062, "grad_norm": 0.32719165086746216, "learning_rate": 7.3972602739726036e-06, "loss": 1.5916, "step": 2488 }, { "epoch": 0.852214851273272, "grad_norm": 0.3111584782600403, "learning_rate": 7.38013698630137e-06, "loss": 1.5681, "step": 2489 }, { "epoch": 0.8525572437406377, "grad_norm": 0.2816428244113922, "learning_rate": 7.3630136986301374e-06, "loss": 1.5112, "step": 2490 }, { "epoch": 0.8528996362080035, "grad_norm": 0.29916834831237793, "learning_rate": 7.345890410958904e-06, "loss": 1.5431, "step": 2491 }, { "epoch": 0.8532420286753691, "grad_norm": 0.295049250125885, "learning_rate": 7.328767123287672e-06, "loss": 1.6014, "step": 2492 }, { "epoch": 0.8535844211427348, "grad_norm": 0.2950093746185303, "learning_rate": 7.311643835616439e-06, "loss": 1.5668, "step": 2493 }, { "epoch": 0.8539268136101006, "grad_norm": 0.2767365276813507, "learning_rate": 7.294520547945206e-06, "loss": 1.5143, "step": 2494 }, { "epoch": 0.8542692060774663, "grad_norm": 0.3034258186817169, "learning_rate": 7.277397260273973e-06, "loss": 1.5521, "step": 2495 }, { "epoch": 0.854611598544832, "grad_norm": 0.28260841965675354, "learning_rate": 7.260273972602739e-06, "loss": 1.5324, "step": 2496 }, { "epoch": 0.8549539910121977, "grad_norm": 0.314768522977829, "learning_rate": 7.243150684931507e-06, "loss": 1.5686, "step": 2497 }, { "epoch": 0.8552963834795635, "grad_norm": 0.2699463367462158, "learning_rate": 7.226027397260274e-06, "loss": 1.4958, "step": 2498 }, { "epoch": 0.8556387759469292, "grad_norm": 0.27865898609161377, "learning_rate": 7.208904109589042e-06, "loss": 1.6538, "step": 2499 }, { "epoch": 0.8559811684142948, "grad_norm": 0.2697174549102783, "learning_rate": 7.191780821917809e-06, "loss": 1.6013, "step": 2500 }, { "epoch": 0.8563235608816606, "grad_norm": 0.2899375557899475, "learning_rate": 7.174657534246576e-06, "loss": 1.3965, "step": 2501 }, { "epoch": 0.8566659533490263, "grad_norm": 0.29902440309524536, "learning_rate": 7.1575342465753425e-06, "loss": 1.5982, "step": 2502 }, { "epoch": 0.8570083458163921, "grad_norm": 0.29332223534584045, "learning_rate": 7.140410958904111e-06, "loss": 1.5707, "step": 2503 }, { "epoch": 0.8573507382837577, "grad_norm": 0.278881698846817, "learning_rate": 7.123287671232877e-06, "loss": 1.5368, "step": 2504 }, { "epoch": 0.8576931307511235, "grad_norm": 0.2887871563434601, "learning_rate": 7.106164383561645e-06, "loss": 1.4815, "step": 2505 }, { "epoch": 0.8580355232184892, "grad_norm": 0.2844296991825104, "learning_rate": 7.089041095890411e-06, "loss": 1.5215, "step": 2506 }, { "epoch": 0.8583779156858549, "grad_norm": 0.2730430066585541, "learning_rate": 7.071917808219178e-06, "loss": 1.5455, "step": 2507 }, { "epoch": 0.8587203081532206, "grad_norm": 0.30348077416419983, "learning_rate": 7.054794520547946e-06, "loss": 1.5779, "step": 2508 }, { "epoch": 0.8590627006205863, "grad_norm": 0.27658215165138245, "learning_rate": 7.037671232876712e-06, "loss": 1.5175, "step": 2509 }, { "epoch": 0.8594050930879521, "grad_norm": 0.298965185880661, "learning_rate": 7.02054794520548e-06, "loss": 1.5075, "step": 2510 }, { "epoch": 0.8597474855553178, "grad_norm": 0.2963491976261139, "learning_rate": 7.003424657534246e-06, "loss": 1.4771, "step": 2511 }, { "epoch": 0.8600898780226836, "grad_norm": 0.2989712357521057, "learning_rate": 6.9863013698630145e-06, "loss": 1.5346, "step": 2512 }, { "epoch": 0.8604322704900492, "grad_norm": 0.2759726345539093, "learning_rate": 6.969178082191781e-06, "loss": 1.6041, "step": 2513 }, { "epoch": 0.8607746629574149, "grad_norm": 0.3323650062084198, "learning_rate": 6.952054794520549e-06, "loss": 1.6003, "step": 2514 }, { "epoch": 0.8611170554247807, "grad_norm": 0.275137722492218, "learning_rate": 6.934931506849316e-06, "loss": 1.5641, "step": 2515 }, { "epoch": 0.8614594478921463, "grad_norm": 0.2774914503097534, "learning_rate": 6.917808219178082e-06, "loss": 1.5492, "step": 2516 }, { "epoch": 0.8618018403595121, "grad_norm": 0.2740415930747986, "learning_rate": 6.90068493150685e-06, "loss": 1.5688, "step": 2517 }, { "epoch": 0.8621442328268778, "grad_norm": 0.3743695616722107, "learning_rate": 6.883561643835616e-06, "loss": 1.608, "step": 2518 }, { "epoch": 0.8624866252942436, "grad_norm": 0.2752639055252075, "learning_rate": 6.866438356164384e-06, "loss": 1.5894, "step": 2519 }, { "epoch": 0.8628290177616093, "grad_norm": 0.27099668979644775, "learning_rate": 6.849315068493151e-06, "loss": 1.4897, "step": 2520 }, { "epoch": 0.8631714102289749, "grad_norm": 0.2809198796749115, "learning_rate": 6.832191780821918e-06, "loss": 1.6361, "step": 2521 }, { "epoch": 0.8635138026963407, "grad_norm": 0.2782031297683716, "learning_rate": 6.815068493150685e-06, "loss": 1.625, "step": 2522 }, { "epoch": 0.8638561951637064, "grad_norm": 0.30995023250579834, "learning_rate": 6.797945205479453e-06, "loss": 1.5452, "step": 2523 }, { "epoch": 0.8641985876310722, "grad_norm": 0.2884317934513092, "learning_rate": 6.7808219178082195e-06, "loss": 1.5171, "step": 2524 }, { "epoch": 0.8645409800984378, "grad_norm": 0.2766087055206299, "learning_rate": 6.763698630136987e-06, "loss": 1.6377, "step": 2525 }, { "epoch": 0.8648833725658035, "grad_norm": 0.27411675453186035, "learning_rate": 6.746575342465753e-06, "loss": 1.6597, "step": 2526 }, { "epoch": 0.8652257650331693, "grad_norm": 0.26504042744636536, "learning_rate": 6.72945205479452e-06, "loss": 1.5453, "step": 2527 }, { "epoch": 0.865568157500535, "grad_norm": 0.3241066634654999, "learning_rate": 6.712328767123288e-06, "loss": 1.5022, "step": 2528 }, { "epoch": 0.8659105499679007, "grad_norm": 0.30528441071510315, "learning_rate": 6.695205479452055e-06, "loss": 1.6558, "step": 2529 }, { "epoch": 0.8662529424352664, "grad_norm": 0.29582348465919495, "learning_rate": 6.678082191780823e-06, "loss": 1.5975, "step": 2530 }, { "epoch": 0.8665953349026322, "grad_norm": 0.3395248353481293, "learning_rate": 6.660958904109589e-06, "loss": 1.6141, "step": 2531 }, { "epoch": 0.8669377273699979, "grad_norm": 0.30406203866004944, "learning_rate": 6.643835616438357e-06, "loss": 1.5213, "step": 2532 }, { "epoch": 0.8672801198373635, "grad_norm": 0.30311670899391174, "learning_rate": 6.626712328767123e-06, "loss": 1.6734, "step": 2533 }, { "epoch": 0.8676225123047293, "grad_norm": 0.3551381826400757, "learning_rate": 6.6095890410958915e-06, "loss": 1.6695, "step": 2534 }, { "epoch": 0.867964904772095, "grad_norm": 0.28014445304870605, "learning_rate": 6.592465753424658e-06, "loss": 1.6568, "step": 2535 }, { "epoch": 0.8683072972394608, "grad_norm": 0.2727324068546295, "learning_rate": 6.5753424657534245e-06, "loss": 1.5082, "step": 2536 }, { "epoch": 0.8686496897068264, "grad_norm": 0.276439368724823, "learning_rate": 6.558219178082192e-06, "loss": 1.6494, "step": 2537 }, { "epoch": 0.8689920821741922, "grad_norm": 0.2889153063297272, "learning_rate": 6.5410958904109584e-06, "loss": 1.5616, "step": 2538 }, { "epoch": 0.8693344746415579, "grad_norm": 0.30466294288635254, "learning_rate": 6.523972602739727e-06, "loss": 1.4643, "step": 2539 }, { "epoch": 0.8696768671089236, "grad_norm": 0.2954615354537964, "learning_rate": 6.506849315068493e-06, "loss": 1.6042, "step": 2540 }, { "epoch": 0.8700192595762893, "grad_norm": 0.2646602690219879, "learning_rate": 6.489726027397261e-06, "loss": 1.5918, "step": 2541 }, { "epoch": 0.870361652043655, "grad_norm": 0.29845431447029114, "learning_rate": 6.472602739726028e-06, "loss": 1.6565, "step": 2542 }, { "epoch": 0.8707040445110208, "grad_norm": 0.2823372185230255, "learning_rate": 6.455479452054795e-06, "loss": 1.5573, "step": 2543 }, { "epoch": 0.8710464369783865, "grad_norm": 0.34545251727104187, "learning_rate": 6.438356164383562e-06, "loss": 1.5608, "step": 2544 }, { "epoch": 0.8713888294457522, "grad_norm": 0.3119606077671051, "learning_rate": 6.42123287671233e-06, "loss": 1.6333, "step": 2545 }, { "epoch": 0.8717312219131179, "grad_norm": 0.30979984998703003, "learning_rate": 6.4041095890410965e-06, "loss": 1.5972, "step": 2546 }, { "epoch": 0.8720736143804836, "grad_norm": 0.2901623845100403, "learning_rate": 6.386986301369863e-06, "loss": 1.5606, "step": 2547 }, { "epoch": 0.8724160068478494, "grad_norm": 0.30251312255859375, "learning_rate": 6.36986301369863e-06, "loss": 1.6758, "step": 2548 }, { "epoch": 0.872758399315215, "grad_norm": 0.2842625379562378, "learning_rate": 6.352739726027397e-06, "loss": 1.594, "step": 2549 }, { "epoch": 0.8731007917825808, "grad_norm": 0.279583215713501, "learning_rate": 6.335616438356165e-06, "loss": 1.5803, "step": 2550 }, { "epoch": 0.8734431842499465, "grad_norm": 0.2993927299976349, "learning_rate": 6.318493150684932e-06, "loss": 1.528, "step": 2551 }, { "epoch": 0.8737855767173123, "grad_norm": 0.29898807406425476, "learning_rate": 6.301369863013699e-06, "loss": 1.6468, "step": 2552 }, { "epoch": 0.8741279691846779, "grad_norm": 0.2930370569229126, "learning_rate": 6.2842465753424656e-06, "loss": 1.5565, "step": 2553 }, { "epoch": 0.8744703616520436, "grad_norm": 0.2988463044166565, "learning_rate": 6.267123287671234e-06, "loss": 1.4474, "step": 2554 }, { "epoch": 0.8748127541194094, "grad_norm": 0.2812041640281677, "learning_rate": 6.25e-06, "loss": 1.5409, "step": 2555 }, { "epoch": 0.8751551465867751, "grad_norm": 0.2644698917865753, "learning_rate": 6.232876712328768e-06, "loss": 1.5035, "step": 2556 }, { "epoch": 0.8754975390541408, "grad_norm": 0.28996163606643677, "learning_rate": 6.215753424657535e-06, "loss": 1.6389, "step": 2557 }, { "epoch": 0.8758399315215065, "grad_norm": 0.2816050052642822, "learning_rate": 6.1986301369863016e-06, "loss": 1.613, "step": 2558 }, { "epoch": 0.8761823239888722, "grad_norm": 0.2782422602176666, "learning_rate": 6.181506849315069e-06, "loss": 1.5864, "step": 2559 }, { "epoch": 0.876524716456238, "grad_norm": 0.3116167485713959, "learning_rate": 6.1643835616438354e-06, "loss": 1.5534, "step": 2560 }, { "epoch": 0.8768671089236036, "grad_norm": 0.28108468651771545, "learning_rate": 6.147260273972603e-06, "loss": 1.6063, "step": 2561 }, { "epoch": 0.8772095013909694, "grad_norm": 0.3046553432941437, "learning_rate": 6.13013698630137e-06, "loss": 1.6811, "step": 2562 }, { "epoch": 0.8775518938583351, "grad_norm": 0.29486513137817383, "learning_rate": 6.1130136986301376e-06, "loss": 1.6297, "step": 2563 }, { "epoch": 0.8778942863257009, "grad_norm": 0.26889634132385254, "learning_rate": 6.095890410958904e-06, "loss": 1.5761, "step": 2564 }, { "epoch": 0.8782366787930665, "grad_norm": 0.28051847219467163, "learning_rate": 6.0787671232876714e-06, "loss": 1.5231, "step": 2565 }, { "epoch": 0.8785790712604322, "grad_norm": 0.2947452962398529, "learning_rate": 6.061643835616439e-06, "loss": 1.5507, "step": 2566 }, { "epoch": 0.878921463727798, "grad_norm": 0.3052336573600769, "learning_rate": 6.044520547945206e-06, "loss": 1.5144, "step": 2567 }, { "epoch": 0.8792638561951637, "grad_norm": 0.27917012572288513, "learning_rate": 6.027397260273973e-06, "loss": 1.6304, "step": 2568 }, { "epoch": 0.8796062486625295, "grad_norm": 0.2946491539478302, "learning_rate": 6.01027397260274e-06, "loss": 1.5724, "step": 2569 }, { "epoch": 0.8799486411298951, "grad_norm": 0.293765664100647, "learning_rate": 5.993150684931507e-06, "loss": 1.6179, "step": 2570 }, { "epoch": 0.8802910335972609, "grad_norm": 0.2931557595729828, "learning_rate": 5.976027397260274e-06, "loss": 1.5598, "step": 2571 }, { "epoch": 0.8806334260646266, "grad_norm": 0.30352339148521423, "learning_rate": 5.958904109589041e-06, "loss": 1.5908, "step": 2572 }, { "epoch": 0.8809758185319922, "grad_norm": 0.2830749750137329, "learning_rate": 5.941780821917809e-06, "loss": 1.6851, "step": 2573 }, { "epoch": 0.881318210999358, "grad_norm": 0.2731027603149414, "learning_rate": 5.924657534246575e-06, "loss": 1.4616, "step": 2574 }, { "epoch": 0.8816606034667237, "grad_norm": 0.28927358984947205, "learning_rate": 5.907534246575343e-06, "loss": 1.5831, "step": 2575 }, { "epoch": 0.8820029959340895, "grad_norm": 0.27754485607147217, "learning_rate": 5.89041095890411e-06, "loss": 1.6155, "step": 2576 }, { "epoch": 0.8823453884014552, "grad_norm": 0.32593685388565063, "learning_rate": 5.873287671232877e-06, "loss": 1.5682, "step": 2577 }, { "epoch": 0.8826877808688209, "grad_norm": 0.30415162444114685, "learning_rate": 5.856164383561645e-06, "loss": 1.6657, "step": 2578 }, { "epoch": 0.8830301733361866, "grad_norm": 0.26821184158325195, "learning_rate": 5.839041095890411e-06, "loss": 1.6125, "step": 2579 }, { "epoch": 0.8833725658035523, "grad_norm": 0.27558231353759766, "learning_rate": 5.821917808219178e-06, "loss": 1.5182, "step": 2580 }, { "epoch": 0.8837149582709181, "grad_norm": 0.281573086977005, "learning_rate": 5.804794520547945e-06, "loss": 1.552, "step": 2581 }, { "epoch": 0.8840573507382837, "grad_norm": 0.2914884686470032, "learning_rate": 5.7876712328767125e-06, "loss": 1.5598, "step": 2582 }, { "epoch": 0.8843997432056495, "grad_norm": 0.2678174078464508, "learning_rate": 5.77054794520548e-06, "loss": 1.6904, "step": 2583 }, { "epoch": 0.8847421356730152, "grad_norm": 0.28698641061782837, "learning_rate": 5.753424657534247e-06, "loss": 1.6013, "step": 2584 }, { "epoch": 0.885084528140381, "grad_norm": 0.26631227135658264, "learning_rate": 5.736301369863014e-06, "loss": 1.5394, "step": 2585 }, { "epoch": 0.8854269206077466, "grad_norm": 0.2910947799682617, "learning_rate": 5.719178082191781e-06, "loss": 1.5116, "step": 2586 }, { "epoch": 0.8857693130751123, "grad_norm": 0.2859658896923065, "learning_rate": 5.7020547945205485e-06, "loss": 1.424, "step": 2587 }, { "epoch": 0.8861117055424781, "grad_norm": 0.2806684076786041, "learning_rate": 5.684931506849316e-06, "loss": 1.5359, "step": 2588 }, { "epoch": 0.8864540980098438, "grad_norm": 0.2843486964702606, "learning_rate": 5.667808219178082e-06, "loss": 1.5082, "step": 2589 }, { "epoch": 0.8867964904772095, "grad_norm": 0.2666687071323395, "learning_rate": 5.650684931506849e-06, "loss": 1.5122, "step": 2590 }, { "epoch": 0.8871388829445752, "grad_norm": 0.26923367381095886, "learning_rate": 5.633561643835616e-06, "loss": 1.4701, "step": 2591 }, { "epoch": 0.8874812754119409, "grad_norm": 0.2873976230621338, "learning_rate": 5.616438356164384e-06, "loss": 1.6465, "step": 2592 }, { "epoch": 0.8878236678793067, "grad_norm": 0.278807669878006, "learning_rate": 5.599315068493151e-06, "loss": 1.4919, "step": 2593 }, { "epoch": 0.8881660603466723, "grad_norm": 0.3050575256347656, "learning_rate": 5.582191780821918e-06, "loss": 1.6202, "step": 2594 }, { "epoch": 0.8885084528140381, "grad_norm": 0.3115063011646271, "learning_rate": 5.565068493150685e-06, "loss": 1.6735, "step": 2595 }, { "epoch": 0.8888508452814038, "grad_norm": 0.3304365575313568, "learning_rate": 5.547945205479452e-06, "loss": 1.656, "step": 2596 }, { "epoch": 0.8891932377487696, "grad_norm": 0.2956380546092987, "learning_rate": 5.53082191780822e-06, "loss": 1.4961, "step": 2597 }, { "epoch": 0.8895356302161352, "grad_norm": 0.30414462089538574, "learning_rate": 5.513698630136987e-06, "loss": 1.6209, "step": 2598 }, { "epoch": 0.8898780226835009, "grad_norm": 0.2664282023906708, "learning_rate": 5.496575342465754e-06, "loss": 1.4874, "step": 2599 }, { "epoch": 0.8902204151508667, "grad_norm": 0.30669158697128296, "learning_rate": 5.479452054794521e-06, "loss": 1.5867, "step": 2600 }, { "epoch": 0.8905628076182324, "grad_norm": 0.2904411554336548, "learning_rate": 5.462328767123287e-06, "loss": 1.5506, "step": 2601 }, { "epoch": 0.8909052000855981, "grad_norm": 0.30497679114341736, "learning_rate": 5.445205479452055e-06, "loss": 1.5549, "step": 2602 }, { "epoch": 0.8912475925529638, "grad_norm": 0.279372900724411, "learning_rate": 5.428082191780822e-06, "loss": 1.6271, "step": 2603 }, { "epoch": 0.8915899850203296, "grad_norm": 0.2687467038631439, "learning_rate": 5.4109589041095895e-06, "loss": 1.5942, "step": 2604 }, { "epoch": 0.8919323774876953, "grad_norm": 0.2809433341026306, "learning_rate": 5.393835616438357e-06, "loss": 1.6122, "step": 2605 }, { "epoch": 0.8922747699550609, "grad_norm": 0.28006547689437866, "learning_rate": 5.376712328767123e-06, "loss": 1.5102, "step": 2606 }, { "epoch": 0.8926171624224267, "grad_norm": 0.27280035614967346, "learning_rate": 5.359589041095891e-06, "loss": 1.5748, "step": 2607 }, { "epoch": 0.8929595548897924, "grad_norm": 0.30461764335632324, "learning_rate": 5.342465753424658e-06, "loss": 1.5881, "step": 2608 }, { "epoch": 0.8933019473571582, "grad_norm": 0.27079248428344727, "learning_rate": 5.3253424657534255e-06, "loss": 1.5944, "step": 2609 }, { "epoch": 0.8936443398245238, "grad_norm": 0.2980393171310425, "learning_rate": 5.308219178082192e-06, "loss": 1.512, "step": 2610 }, { "epoch": 0.8939867322918896, "grad_norm": 0.27886825799942017, "learning_rate": 5.2910958904109585e-06, "loss": 1.7043, "step": 2611 }, { "epoch": 0.8943291247592553, "grad_norm": 0.2857617139816284, "learning_rate": 5.273972602739726e-06, "loss": 1.5247, "step": 2612 }, { "epoch": 0.894671517226621, "grad_norm": 0.29364466667175293, "learning_rate": 5.256849315068493e-06, "loss": 1.5558, "step": 2613 }, { "epoch": 0.8950139096939868, "grad_norm": 0.2836936414241791, "learning_rate": 5.239726027397261e-06, "loss": 1.6315, "step": 2614 }, { "epoch": 0.8953563021613524, "grad_norm": 0.29711514711380005, "learning_rate": 5.222602739726028e-06, "loss": 1.5506, "step": 2615 }, { "epoch": 0.8956986946287182, "grad_norm": 0.27794915437698364, "learning_rate": 5.2054794520547945e-06, "loss": 1.6323, "step": 2616 }, { "epoch": 0.8960410870960839, "grad_norm": 0.3138660490512848, "learning_rate": 5.188356164383562e-06, "loss": 1.5664, "step": 2617 }, { "epoch": 0.8963834795634497, "grad_norm": 0.2911216616630554, "learning_rate": 5.171232876712329e-06, "loss": 1.5515, "step": 2618 }, { "epoch": 0.8967258720308153, "grad_norm": 0.2908552587032318, "learning_rate": 5.154109589041097e-06, "loss": 1.5664, "step": 2619 }, { "epoch": 0.897068264498181, "grad_norm": 0.3307022154331207, "learning_rate": 5.136986301369863e-06, "loss": 1.628, "step": 2620 }, { "epoch": 0.8974106569655468, "grad_norm": 0.2910309135913849, "learning_rate": 5.1198630136986305e-06, "loss": 1.6575, "step": 2621 }, { "epoch": 0.8977530494329125, "grad_norm": 0.3172628879547119, "learning_rate": 5.102739726027397e-06, "loss": 1.6579, "step": 2622 }, { "epoch": 0.8980954419002782, "grad_norm": 0.2739066779613495, "learning_rate": 5.085616438356164e-06, "loss": 1.5762, "step": 2623 }, { "epoch": 0.8984378343676439, "grad_norm": 0.3104184567928314, "learning_rate": 5.068493150684932e-06, "loss": 1.5982, "step": 2624 }, { "epoch": 0.8987802268350096, "grad_norm": 0.29509708285331726, "learning_rate": 5.051369863013699e-06, "loss": 1.5481, "step": 2625 }, { "epoch": 0.8991226193023754, "grad_norm": 0.2765010893344879, "learning_rate": 5.0342465753424665e-06, "loss": 1.6149, "step": 2626 }, { "epoch": 0.899465011769741, "grad_norm": 0.30552637577056885, "learning_rate": 5.017123287671233e-06, "loss": 1.5854, "step": 2627 }, { "epoch": 0.8998074042371068, "grad_norm": 0.2627437710762024, "learning_rate": 5e-06, "loss": 1.5707, "step": 2628 }, { "epoch": 0.9001497967044725, "grad_norm": 0.2879520356655121, "learning_rate": 4.982876712328767e-06, "loss": 1.6046, "step": 2629 }, { "epoch": 0.9004921891718383, "grad_norm": 0.29310405254364014, "learning_rate": 4.965753424657534e-06, "loss": 1.5481, "step": 2630 }, { "epoch": 0.9008345816392039, "grad_norm": 0.3234281539916992, "learning_rate": 4.948630136986302e-06, "loss": 1.5728, "step": 2631 }, { "epoch": 0.9011769741065696, "grad_norm": 0.2850242257118225, "learning_rate": 4.931506849315068e-06, "loss": 1.634, "step": 2632 }, { "epoch": 0.9015193665739354, "grad_norm": 0.27591589093208313, "learning_rate": 4.9143835616438356e-06, "loss": 1.6032, "step": 2633 }, { "epoch": 0.9018617590413011, "grad_norm": 0.28577950596809387, "learning_rate": 4.897260273972603e-06, "loss": 1.6483, "step": 2634 }, { "epoch": 0.9022041515086668, "grad_norm": 0.25885334610939026, "learning_rate": 4.88013698630137e-06, "loss": 1.674, "step": 2635 }, { "epoch": 0.9025465439760325, "grad_norm": 0.26733458042144775, "learning_rate": 4.863013698630138e-06, "loss": 1.5627, "step": 2636 }, { "epoch": 0.9028889364433983, "grad_norm": 0.30667346715927124, "learning_rate": 4.845890410958904e-06, "loss": 1.4976, "step": 2637 }, { "epoch": 0.903231328910764, "grad_norm": 0.28562653064727783, "learning_rate": 4.8287671232876716e-06, "loss": 1.73, "step": 2638 }, { "epoch": 0.9035737213781296, "grad_norm": 0.3783906400203705, "learning_rate": 4.811643835616438e-06, "loss": 1.4947, "step": 2639 }, { "epoch": 0.9039161138454954, "grad_norm": 0.31243833899497986, "learning_rate": 4.7945205479452054e-06, "loss": 1.4598, "step": 2640 }, { "epoch": 0.9042585063128611, "grad_norm": 0.2732941210269928, "learning_rate": 4.777397260273973e-06, "loss": 1.6153, "step": 2641 }, { "epoch": 0.9046008987802269, "grad_norm": 0.2803361713886261, "learning_rate": 4.76027397260274e-06, "loss": 1.5499, "step": 2642 }, { "epoch": 0.9049432912475925, "grad_norm": 0.2777630388736725, "learning_rate": 4.743150684931507e-06, "loss": 1.5743, "step": 2643 }, { "epoch": 0.9052856837149583, "grad_norm": 0.3127155303955078, "learning_rate": 4.726027397260274e-06, "loss": 1.4892, "step": 2644 }, { "epoch": 0.905628076182324, "grad_norm": 0.2780836522579193, "learning_rate": 4.7089041095890414e-06, "loss": 1.5772, "step": 2645 }, { "epoch": 0.9059704686496897, "grad_norm": 0.3231557309627533, "learning_rate": 4.691780821917809e-06, "loss": 1.6373, "step": 2646 }, { "epoch": 0.9063128611170554, "grad_norm": 0.2895917594432831, "learning_rate": 4.674657534246576e-06, "loss": 1.6102, "step": 2647 }, { "epoch": 0.9066552535844211, "grad_norm": 0.28866860270500183, "learning_rate": 4.657534246575343e-06, "loss": 1.4907, "step": 2648 }, { "epoch": 0.9069976460517869, "grad_norm": 0.2834995985031128, "learning_rate": 4.640410958904109e-06, "loss": 1.6193, "step": 2649 }, { "epoch": 0.9073400385191526, "grad_norm": 0.2989217936992645, "learning_rate": 4.623287671232877e-06, "loss": 1.5451, "step": 2650 }, { "epoch": 0.9076824309865184, "grad_norm": 0.3123258650302887, "learning_rate": 4.606164383561644e-06, "loss": 1.5265, "step": 2651 }, { "epoch": 0.908024823453884, "grad_norm": 0.3621309697628021, "learning_rate": 4.589041095890411e-06, "loss": 1.5922, "step": 2652 }, { "epoch": 0.9083672159212497, "grad_norm": 0.28797757625579834, "learning_rate": 4.571917808219178e-06, "loss": 1.5693, "step": 2653 }, { "epoch": 0.9087096083886155, "grad_norm": 0.3118284046649933, "learning_rate": 4.554794520547945e-06, "loss": 1.5588, "step": 2654 }, { "epoch": 0.9090520008559811, "grad_norm": 0.27880924940109253, "learning_rate": 4.537671232876713e-06, "loss": 1.5682, "step": 2655 }, { "epoch": 0.9093943933233469, "grad_norm": 0.29533472657203674, "learning_rate": 4.52054794520548e-06, "loss": 1.6239, "step": 2656 }, { "epoch": 0.9097367857907126, "grad_norm": 0.2639387547969818, "learning_rate": 4.503424657534247e-06, "loss": 1.4932, "step": 2657 }, { "epoch": 0.9100791782580783, "grad_norm": 0.2874784469604492, "learning_rate": 4.486301369863014e-06, "loss": 1.5183, "step": 2658 }, { "epoch": 0.910421570725444, "grad_norm": 0.3292796313762665, "learning_rate": 4.46917808219178e-06, "loss": 1.6677, "step": 2659 }, { "epoch": 0.9107639631928097, "grad_norm": 0.28071436285972595, "learning_rate": 4.452054794520548e-06, "loss": 1.5503, "step": 2660 }, { "epoch": 0.9111063556601755, "grad_norm": 0.3041638135910034, "learning_rate": 4.434931506849315e-06, "loss": 1.6579, "step": 2661 }, { "epoch": 0.9114487481275412, "grad_norm": 0.3554707467556, "learning_rate": 4.4178082191780825e-06, "loss": 1.5161, "step": 2662 }, { "epoch": 0.911791140594907, "grad_norm": 0.2891581058502197, "learning_rate": 4.40068493150685e-06, "loss": 1.5579, "step": 2663 }, { "epoch": 0.9121335330622726, "grad_norm": 0.32679662108421326, "learning_rate": 4.383561643835616e-06, "loss": 1.6872, "step": 2664 }, { "epoch": 0.9124759255296383, "grad_norm": 0.31913894414901733, "learning_rate": 4.366438356164384e-06, "loss": 1.4417, "step": 2665 }, { "epoch": 0.9128183179970041, "grad_norm": 0.29624658823013306, "learning_rate": 4.349315068493151e-06, "loss": 1.499, "step": 2666 }, { "epoch": 0.9131607104643698, "grad_norm": 0.2991006374359131, "learning_rate": 4.3321917808219185e-06, "loss": 1.6548, "step": 2667 }, { "epoch": 0.9135031029317355, "grad_norm": 0.2910924255847931, "learning_rate": 4.315068493150686e-06, "loss": 1.5578, "step": 2668 }, { "epoch": 0.9138454953991012, "grad_norm": 0.2706972360610962, "learning_rate": 4.297945205479452e-06, "loss": 1.5601, "step": 2669 }, { "epoch": 0.914187887866467, "grad_norm": 0.3036791682243347, "learning_rate": 4.280821917808219e-06, "loss": 1.6874, "step": 2670 }, { "epoch": 0.9145302803338327, "grad_norm": 0.28908905386924744, "learning_rate": 4.263698630136986e-06, "loss": 1.5863, "step": 2671 }, { "epoch": 0.9148726728011983, "grad_norm": 0.2897023558616638, "learning_rate": 4.246575342465754e-06, "loss": 1.5804, "step": 2672 }, { "epoch": 0.9152150652685641, "grad_norm": 0.2779468297958374, "learning_rate": 4.229452054794521e-06, "loss": 1.6421, "step": 2673 }, { "epoch": 0.9155574577359298, "grad_norm": 0.29811540246009827, "learning_rate": 4.2123287671232875e-06, "loss": 1.5428, "step": 2674 }, { "epoch": 0.9158998502032956, "grad_norm": 0.3079448938369751, "learning_rate": 4.195205479452055e-06, "loss": 1.551, "step": 2675 }, { "epoch": 0.9162422426706612, "grad_norm": 0.30714842677116394, "learning_rate": 4.178082191780822e-06, "loss": 1.5471, "step": 2676 }, { "epoch": 0.916584635138027, "grad_norm": 0.2794683575630188, "learning_rate": 4.16095890410959e-06, "loss": 1.6222, "step": 2677 }, { "epoch": 0.9169270276053927, "grad_norm": 0.34502196311950684, "learning_rate": 4.143835616438357e-06, "loss": 1.7206, "step": 2678 }, { "epoch": 0.9172694200727584, "grad_norm": 0.3126720190048218, "learning_rate": 4.1267123287671235e-06, "loss": 1.5952, "step": 2679 }, { "epoch": 0.9176118125401241, "grad_norm": 0.2757822573184967, "learning_rate": 4.10958904109589e-06, "loss": 1.6275, "step": 2680 }, { "epoch": 0.9179542050074898, "grad_norm": 0.2962033152580261, "learning_rate": 4.092465753424657e-06, "loss": 1.5045, "step": 2681 }, { "epoch": 0.9182965974748556, "grad_norm": 0.27148595452308655, "learning_rate": 4.075342465753425e-06, "loss": 1.507, "step": 2682 }, { "epoch": 0.9186389899422213, "grad_norm": 0.2989894151687622, "learning_rate": 4.058219178082192e-06, "loss": 1.647, "step": 2683 }, { "epoch": 0.918981382409587, "grad_norm": 0.2737317383289337, "learning_rate": 4.0410958904109595e-06, "loss": 1.681, "step": 2684 }, { "epoch": 0.9193237748769527, "grad_norm": 0.30468061566352844, "learning_rate": 4.023972602739726e-06, "loss": 1.5559, "step": 2685 }, { "epoch": 0.9196661673443184, "grad_norm": 0.28895851969718933, "learning_rate": 4.006849315068493e-06, "loss": 1.5048, "step": 2686 }, { "epoch": 0.9200085598116842, "grad_norm": 0.2986671030521393, "learning_rate": 3.989726027397261e-06, "loss": 1.5616, "step": 2687 }, { "epoch": 0.9203509522790498, "grad_norm": 0.2791857421398163, "learning_rate": 3.972602739726028e-06, "loss": 1.5803, "step": 2688 }, { "epoch": 0.9206933447464156, "grad_norm": 0.2784462571144104, "learning_rate": 3.955479452054795e-06, "loss": 1.6253, "step": 2689 }, { "epoch": 0.9210357372137813, "grad_norm": 0.28480806946754456, "learning_rate": 3.938356164383562e-06, "loss": 1.6751, "step": 2690 }, { "epoch": 0.921378129681147, "grad_norm": 0.27573519945144653, "learning_rate": 3.9212328767123285e-06, "loss": 1.5667, "step": 2691 }, { "epoch": 0.9217205221485127, "grad_norm": 0.2937520742416382, "learning_rate": 3.904109589041096e-06, "loss": 1.4971, "step": 2692 }, { "epoch": 0.9220629146158784, "grad_norm": 0.30335748195648193, "learning_rate": 3.886986301369863e-06, "loss": 1.5243, "step": 2693 }, { "epoch": 0.9224053070832442, "grad_norm": 0.27735376358032227, "learning_rate": 3.869863013698631e-06, "loss": 1.5574, "step": 2694 }, { "epoch": 0.9227476995506099, "grad_norm": 0.2742135226726532, "learning_rate": 3.852739726027397e-06, "loss": 1.5796, "step": 2695 }, { "epoch": 0.9230900920179756, "grad_norm": 0.26642343401908875, "learning_rate": 3.8356164383561645e-06, "loss": 1.5739, "step": 2696 }, { "epoch": 0.9234324844853413, "grad_norm": 0.28072589635849, "learning_rate": 3.818493150684932e-06, "loss": 1.5707, "step": 2697 }, { "epoch": 0.923774876952707, "grad_norm": 0.27897363901138306, "learning_rate": 3.8013698630136993e-06, "loss": 1.5514, "step": 2698 }, { "epoch": 0.9241172694200728, "grad_norm": 0.31917667388916016, "learning_rate": 3.7842465753424658e-06, "loss": 1.5647, "step": 2699 }, { "epoch": 0.9244596618874384, "grad_norm": 0.2878747880458832, "learning_rate": 3.7671232876712327e-06, "loss": 1.6358, "step": 2700 }, { "epoch": 0.9248020543548042, "grad_norm": 0.28960657119750977, "learning_rate": 3.75e-06, "loss": 1.5352, "step": 2701 }, { "epoch": 0.9251444468221699, "grad_norm": 0.2876768708229065, "learning_rate": 3.732876712328767e-06, "loss": 1.5542, "step": 2702 }, { "epoch": 0.9254868392895357, "grad_norm": 0.2883352041244507, "learning_rate": 3.7157534246575344e-06, "loss": 1.4856, "step": 2703 }, { "epoch": 0.9258292317569013, "grad_norm": 0.2788362205028534, "learning_rate": 3.6986301369863018e-06, "loss": 1.4758, "step": 2704 }, { "epoch": 0.926171624224267, "grad_norm": 0.2991061508655548, "learning_rate": 3.6815068493150687e-06, "loss": 1.494, "step": 2705 }, { "epoch": 0.9265140166916328, "grad_norm": 0.2919095456600189, "learning_rate": 3.664383561643836e-06, "loss": 1.531, "step": 2706 }, { "epoch": 0.9268564091589985, "grad_norm": 0.2725207507610321, "learning_rate": 3.647260273972603e-06, "loss": 1.5403, "step": 2707 }, { "epoch": 0.9271988016263643, "grad_norm": 0.39554736018180847, "learning_rate": 3.6301369863013696e-06, "loss": 1.6246, "step": 2708 }, { "epoch": 0.9275411940937299, "grad_norm": 0.28006988763809204, "learning_rate": 3.613013698630137e-06, "loss": 1.7447, "step": 2709 }, { "epoch": 0.9278835865610957, "grad_norm": 0.2866670489311218, "learning_rate": 3.5958904109589043e-06, "loss": 1.5929, "step": 2710 }, { "epoch": 0.9282259790284614, "grad_norm": 0.29478317499160767, "learning_rate": 3.5787671232876712e-06, "loss": 1.4693, "step": 2711 }, { "epoch": 0.928568371495827, "grad_norm": 0.29566189646720886, "learning_rate": 3.5616438356164386e-06, "loss": 1.5849, "step": 2712 }, { "epoch": 0.9289107639631928, "grad_norm": 0.3210238814353943, "learning_rate": 3.5445205479452056e-06, "loss": 1.5482, "step": 2713 }, { "epoch": 0.9292531564305585, "grad_norm": 0.2898651957511902, "learning_rate": 3.527397260273973e-06, "loss": 1.522, "step": 2714 }, { "epoch": 0.9295955488979243, "grad_norm": 0.2734745144844055, "learning_rate": 3.51027397260274e-06, "loss": 1.5003, "step": 2715 }, { "epoch": 0.92993794136529, "grad_norm": 0.2733107805252075, "learning_rate": 3.4931506849315072e-06, "loss": 1.4615, "step": 2716 }, { "epoch": 0.9302803338326557, "grad_norm": 0.29594680666923523, "learning_rate": 3.4760273972602746e-06, "loss": 1.4941, "step": 2717 }, { "epoch": 0.9306227263000214, "grad_norm": 0.2911747097969055, "learning_rate": 3.458904109589041e-06, "loss": 1.5382, "step": 2718 }, { "epoch": 0.9309651187673871, "grad_norm": 0.2846525013446808, "learning_rate": 3.441780821917808e-06, "loss": 1.6113, "step": 2719 }, { "epoch": 0.9313075112347529, "grad_norm": 0.29764261841773987, "learning_rate": 3.4246575342465754e-06, "loss": 1.6837, "step": 2720 }, { "epoch": 0.9316499037021185, "grad_norm": 0.3188917934894562, "learning_rate": 3.4075342465753424e-06, "loss": 1.5622, "step": 2721 }, { "epoch": 0.9319922961694843, "grad_norm": 0.29254958033561707, "learning_rate": 3.3904109589041098e-06, "loss": 1.5231, "step": 2722 }, { "epoch": 0.93233468863685, "grad_norm": 0.30587586760520935, "learning_rate": 3.3732876712328767e-06, "loss": 1.5818, "step": 2723 }, { "epoch": 0.9326770811042157, "grad_norm": 0.2797386050224304, "learning_rate": 3.356164383561644e-06, "loss": 1.645, "step": 2724 }, { "epoch": 0.9330194735715814, "grad_norm": 0.40642842650413513, "learning_rate": 3.3390410958904114e-06, "loss": 1.5107, "step": 2725 }, { "epoch": 0.9333618660389471, "grad_norm": 0.2928674519062042, "learning_rate": 3.3219178082191784e-06, "loss": 1.6704, "step": 2726 }, { "epoch": 0.9337042585063129, "grad_norm": 0.28450292348861694, "learning_rate": 3.3047945205479457e-06, "loss": 1.513, "step": 2727 }, { "epoch": 0.9340466509736786, "grad_norm": 0.29105401039123535, "learning_rate": 3.2876712328767123e-06, "loss": 1.5959, "step": 2728 }, { "epoch": 0.9343890434410443, "grad_norm": 0.295195996761322, "learning_rate": 3.2705479452054792e-06, "loss": 1.5528, "step": 2729 }, { "epoch": 0.93473143590841, "grad_norm": 0.293002724647522, "learning_rate": 3.2534246575342466e-06, "loss": 1.603, "step": 2730 }, { "epoch": 0.9350738283757757, "grad_norm": 0.3025301694869995, "learning_rate": 3.236301369863014e-06, "loss": 1.5214, "step": 2731 }, { "epoch": 0.9354162208431415, "grad_norm": 0.2909449636936188, "learning_rate": 3.219178082191781e-06, "loss": 1.5161, "step": 2732 }, { "epoch": 0.9357586133105071, "grad_norm": 0.3163394033908844, "learning_rate": 3.2020547945205483e-06, "loss": 1.6484, "step": 2733 }, { "epoch": 0.9361010057778729, "grad_norm": 0.2747657001018524, "learning_rate": 3.184931506849315e-06, "loss": 1.541, "step": 2734 }, { "epoch": 0.9364433982452386, "grad_norm": 0.3194411098957062, "learning_rate": 3.1678082191780826e-06, "loss": 1.5488, "step": 2735 }, { "epoch": 0.9367857907126044, "grad_norm": 0.30515071749687195, "learning_rate": 3.1506849315068495e-06, "loss": 1.5161, "step": 2736 }, { "epoch": 0.93712818317997, "grad_norm": 0.283924400806427, "learning_rate": 3.133561643835617e-06, "loss": 1.6478, "step": 2737 }, { "epoch": 0.9374705756473357, "grad_norm": 0.3201298117637634, "learning_rate": 3.116438356164384e-06, "loss": 1.4907, "step": 2738 }, { "epoch": 0.9378129681147015, "grad_norm": 0.3220786154270172, "learning_rate": 3.0993150684931508e-06, "loss": 1.6201, "step": 2739 }, { "epoch": 0.9381553605820672, "grad_norm": 0.2811064124107361, "learning_rate": 3.0821917808219177e-06, "loss": 1.6179, "step": 2740 }, { "epoch": 0.938497753049433, "grad_norm": 0.2867014706134796, "learning_rate": 3.065068493150685e-06, "loss": 1.5568, "step": 2741 }, { "epoch": 0.9388401455167986, "grad_norm": 0.2978500723838806, "learning_rate": 3.047945205479452e-06, "loss": 1.5472, "step": 2742 }, { "epoch": 0.9391825379841644, "grad_norm": 0.32562074065208435, "learning_rate": 3.0308219178082194e-06, "loss": 1.5389, "step": 2743 }, { "epoch": 0.9395249304515301, "grad_norm": 0.28943490982055664, "learning_rate": 3.0136986301369864e-06, "loss": 1.6057, "step": 2744 }, { "epoch": 0.9398673229188957, "grad_norm": 0.3262709379196167, "learning_rate": 2.9965753424657533e-06, "loss": 1.5283, "step": 2745 }, { "epoch": 0.9402097153862615, "grad_norm": 0.2950226664543152, "learning_rate": 2.9794520547945207e-06, "loss": 1.6074, "step": 2746 }, { "epoch": 0.9405521078536272, "grad_norm": 0.29990899562835693, "learning_rate": 2.9623287671232876e-06, "loss": 1.4949, "step": 2747 }, { "epoch": 0.940894500320993, "grad_norm": 0.2691921293735504, "learning_rate": 2.945205479452055e-06, "loss": 1.5248, "step": 2748 }, { "epoch": 0.9412368927883586, "grad_norm": 0.28886574506759644, "learning_rate": 2.9280821917808223e-06, "loss": 1.5096, "step": 2749 }, { "epoch": 0.9415792852557243, "grad_norm": 0.29517504572868347, "learning_rate": 2.910958904109589e-06, "loss": 1.5945, "step": 2750 }, { "epoch": 0.9419216777230901, "grad_norm": 0.28736740350723267, "learning_rate": 2.8938356164383562e-06, "loss": 1.549, "step": 2751 }, { "epoch": 0.9422640701904558, "grad_norm": 0.30540230870246887, "learning_rate": 2.8767123287671236e-06, "loss": 1.6283, "step": 2752 }, { "epoch": 0.9426064626578216, "grad_norm": 0.28587108850479126, "learning_rate": 2.8595890410958905e-06, "loss": 1.5372, "step": 2753 }, { "epoch": 0.9429488551251872, "grad_norm": 0.28337278962135315, "learning_rate": 2.842465753424658e-06, "loss": 1.5959, "step": 2754 }, { "epoch": 0.943291247592553, "grad_norm": 0.2953339219093323, "learning_rate": 2.8253424657534244e-06, "loss": 1.5303, "step": 2755 }, { "epoch": 0.9436336400599187, "grad_norm": 0.2890564799308777, "learning_rate": 2.808219178082192e-06, "loss": 1.5553, "step": 2756 }, { "epoch": 0.9439760325272843, "grad_norm": 0.2849705219268799, "learning_rate": 2.791095890410959e-06, "loss": 1.4342, "step": 2757 }, { "epoch": 0.9443184249946501, "grad_norm": 0.3175230324268341, "learning_rate": 2.773972602739726e-06, "loss": 1.6, "step": 2758 }, { "epoch": 0.9446608174620158, "grad_norm": 0.31956467032432556, "learning_rate": 2.7568493150684935e-06, "loss": 1.5943, "step": 2759 }, { "epoch": 0.9450032099293816, "grad_norm": 0.2843090891838074, "learning_rate": 2.7397260273972604e-06, "loss": 1.5647, "step": 2760 }, { "epoch": 0.9453456023967473, "grad_norm": 0.29133814573287964, "learning_rate": 2.7226027397260274e-06, "loss": 1.5711, "step": 2761 }, { "epoch": 0.945687994864113, "grad_norm": 0.30285727977752686, "learning_rate": 2.7054794520547947e-06, "loss": 1.5698, "step": 2762 }, { "epoch": 0.9460303873314787, "grad_norm": 0.3001445233821869, "learning_rate": 2.6883561643835617e-06, "loss": 1.4494, "step": 2763 }, { "epoch": 0.9463727797988444, "grad_norm": 0.300663024187088, "learning_rate": 2.671232876712329e-06, "loss": 1.5682, "step": 2764 }, { "epoch": 0.9467151722662102, "grad_norm": 0.2977568507194519, "learning_rate": 2.654109589041096e-06, "loss": 1.4378, "step": 2765 }, { "epoch": 0.9470575647335758, "grad_norm": 0.27587100863456726, "learning_rate": 2.636986301369863e-06, "loss": 1.4369, "step": 2766 }, { "epoch": 0.9473999572009416, "grad_norm": 0.2881658673286438, "learning_rate": 2.6198630136986303e-06, "loss": 1.5692, "step": 2767 }, { "epoch": 0.9477423496683073, "grad_norm": 0.30449020862579346, "learning_rate": 2.6027397260273973e-06, "loss": 1.6371, "step": 2768 }, { "epoch": 0.9480847421356731, "grad_norm": 0.3133305013179779, "learning_rate": 2.5856164383561646e-06, "loss": 1.5506, "step": 2769 }, { "epoch": 0.9484271346030387, "grad_norm": 0.27027425169944763, "learning_rate": 2.5684931506849316e-06, "loss": 1.5312, "step": 2770 }, { "epoch": 0.9487695270704044, "grad_norm": 0.29796963930130005, "learning_rate": 2.5513698630136985e-06, "loss": 1.6268, "step": 2771 }, { "epoch": 0.9491119195377702, "grad_norm": 0.2704123854637146, "learning_rate": 2.534246575342466e-06, "loss": 1.477, "step": 2772 }, { "epoch": 0.9494543120051359, "grad_norm": 0.27440589666366577, "learning_rate": 2.5171232876712333e-06, "loss": 1.5697, "step": 2773 }, { "epoch": 0.9497967044725016, "grad_norm": 0.2983265221118927, "learning_rate": 2.5e-06, "loss": 1.6128, "step": 2774 }, { "epoch": 0.9501390969398673, "grad_norm": 0.299761563539505, "learning_rate": 2.482876712328767e-06, "loss": 1.6386, "step": 2775 }, { "epoch": 0.9504814894072331, "grad_norm": 0.2937566339969635, "learning_rate": 2.465753424657534e-06, "loss": 1.5279, "step": 2776 }, { "epoch": 0.9508238818745988, "grad_norm": 0.2862399220466614, "learning_rate": 2.4486301369863015e-06, "loss": 1.607, "step": 2777 }, { "epoch": 0.9511662743419644, "grad_norm": 0.3410247564315796, "learning_rate": 2.431506849315069e-06, "loss": 1.5664, "step": 2778 }, { "epoch": 0.9515086668093302, "grad_norm": 0.3330427408218384, "learning_rate": 2.4143835616438358e-06, "loss": 1.6134, "step": 2779 }, { "epoch": 0.9518510592766959, "grad_norm": 0.3181917667388916, "learning_rate": 2.3972602739726027e-06, "loss": 1.5758, "step": 2780 }, { "epoch": 0.9521934517440617, "grad_norm": 0.2544102072715759, "learning_rate": 2.38013698630137e-06, "loss": 1.4977, "step": 2781 }, { "epoch": 0.9525358442114273, "grad_norm": 0.3175451159477234, "learning_rate": 2.363013698630137e-06, "loss": 1.6456, "step": 2782 }, { "epoch": 0.952878236678793, "grad_norm": 0.2784070670604706, "learning_rate": 2.3458904109589044e-06, "loss": 1.6126, "step": 2783 }, { "epoch": 0.9532206291461588, "grad_norm": 0.37170717120170593, "learning_rate": 2.3287671232876713e-06, "loss": 1.4909, "step": 2784 }, { "epoch": 0.9535630216135245, "grad_norm": 0.2885914742946625, "learning_rate": 2.3116438356164383e-06, "loss": 1.6111, "step": 2785 }, { "epoch": 0.9539054140808902, "grad_norm": 0.30529502034187317, "learning_rate": 2.2945205479452057e-06, "loss": 1.6084, "step": 2786 }, { "epoch": 0.9542478065482559, "grad_norm": 0.3106911778450012, "learning_rate": 2.2773972602739726e-06, "loss": 1.5279, "step": 2787 }, { "epoch": 0.9545901990156217, "grad_norm": 0.31103572249412537, "learning_rate": 2.26027397260274e-06, "loss": 1.5052, "step": 2788 }, { "epoch": 0.9549325914829874, "grad_norm": 0.2951202392578125, "learning_rate": 2.243150684931507e-06, "loss": 1.5817, "step": 2789 }, { "epoch": 0.955274983950353, "grad_norm": 0.30226075649261475, "learning_rate": 2.226027397260274e-06, "loss": 1.6843, "step": 2790 }, { "epoch": 0.9556173764177188, "grad_norm": 0.3215831518173218, "learning_rate": 2.2089041095890412e-06, "loss": 1.5012, "step": 2791 }, { "epoch": 0.9559597688850845, "grad_norm": 0.2964276671409607, "learning_rate": 2.191780821917808e-06, "loss": 1.5657, "step": 2792 }, { "epoch": 0.9563021613524503, "grad_norm": 0.2872772216796875, "learning_rate": 2.1746575342465755e-06, "loss": 1.5103, "step": 2793 }, { "epoch": 0.956644553819816, "grad_norm": 0.3311094045639038, "learning_rate": 2.157534246575343e-06, "loss": 1.5484, "step": 2794 }, { "epoch": 0.9569869462871817, "grad_norm": 0.29871195554733276, "learning_rate": 2.1404109589041094e-06, "loss": 1.6068, "step": 2795 }, { "epoch": 0.9573293387545474, "grad_norm": 0.3122525215148926, "learning_rate": 2.123287671232877e-06, "loss": 1.618, "step": 2796 }, { "epoch": 0.9576717312219131, "grad_norm": 0.3092149794101715, "learning_rate": 2.1061643835616437e-06, "loss": 1.6781, "step": 2797 }, { "epoch": 0.9580141236892789, "grad_norm": 0.2852477431297302, "learning_rate": 2.089041095890411e-06, "loss": 1.5648, "step": 2798 }, { "epoch": 0.9583565161566445, "grad_norm": 0.288800448179245, "learning_rate": 2.0719178082191785e-06, "loss": 1.6013, "step": 2799 }, { "epoch": 0.9586989086240103, "grad_norm": 0.2997048795223236, "learning_rate": 2.054794520547945e-06, "loss": 1.5813, "step": 2800 }, { "epoch": 0.959041301091376, "grad_norm": 0.3055747151374817, "learning_rate": 2.0376712328767124e-06, "loss": 1.5575, "step": 2801 }, { "epoch": 0.9593836935587418, "grad_norm": 0.2919672131538391, "learning_rate": 2.0205479452054797e-06, "loss": 1.4965, "step": 2802 }, { "epoch": 0.9597260860261074, "grad_norm": 0.30920371413230896, "learning_rate": 2.0034246575342467e-06, "loss": 1.5294, "step": 2803 }, { "epoch": 0.9600684784934731, "grad_norm": 0.31715744733810425, "learning_rate": 1.986301369863014e-06, "loss": 1.5913, "step": 2804 }, { "epoch": 0.9604108709608389, "grad_norm": 0.3065647780895233, "learning_rate": 1.969178082191781e-06, "loss": 1.5046, "step": 2805 }, { "epoch": 0.9607532634282046, "grad_norm": 0.28399038314819336, "learning_rate": 1.952054794520548e-06, "loss": 1.5913, "step": 2806 }, { "epoch": 0.9610956558955703, "grad_norm": 0.2781978249549866, "learning_rate": 1.9349315068493153e-06, "loss": 1.4084, "step": 2807 }, { "epoch": 0.961438048362936, "grad_norm": 0.2831386625766754, "learning_rate": 1.9178082191780823e-06, "loss": 1.6301, "step": 2808 }, { "epoch": 0.9617804408303018, "grad_norm": 0.27196457982063293, "learning_rate": 1.9006849315068496e-06, "loss": 1.6068, "step": 2809 }, { "epoch": 0.9621228332976675, "grad_norm": 0.28601542115211487, "learning_rate": 1.8835616438356164e-06, "loss": 1.6987, "step": 2810 }, { "epoch": 0.9624652257650331, "grad_norm": 0.31995490193367004, "learning_rate": 1.8664383561643835e-06, "loss": 1.558, "step": 2811 }, { "epoch": 0.9628076182323989, "grad_norm": 0.27627360820770264, "learning_rate": 1.8493150684931509e-06, "loss": 1.5833, "step": 2812 }, { "epoch": 0.9631500106997646, "grad_norm": 0.41150882840156555, "learning_rate": 1.832191780821918e-06, "loss": 1.5867, "step": 2813 }, { "epoch": 0.9634924031671304, "grad_norm": 0.29788970947265625, "learning_rate": 1.8150684931506848e-06, "loss": 1.6316, "step": 2814 }, { "epoch": 0.963834795634496, "grad_norm": 0.27854490280151367, "learning_rate": 1.7979452054794521e-06, "loss": 1.5287, "step": 2815 }, { "epoch": 0.9641771881018617, "grad_norm": 0.2778238356113434, "learning_rate": 1.7808219178082193e-06, "loss": 1.5871, "step": 2816 }, { "epoch": 0.9645195805692275, "grad_norm": 0.2734278738498688, "learning_rate": 1.7636986301369865e-06, "loss": 1.5789, "step": 2817 }, { "epoch": 0.9648619730365932, "grad_norm": 0.290572851896286, "learning_rate": 1.7465753424657536e-06, "loss": 1.6587, "step": 2818 }, { "epoch": 0.9652043655039589, "grad_norm": 0.27608001232147217, "learning_rate": 1.7294520547945206e-06, "loss": 1.4369, "step": 2819 }, { "epoch": 0.9655467579713246, "grad_norm": 0.3081127107143402, "learning_rate": 1.7123287671232877e-06, "loss": 1.7273, "step": 2820 }, { "epoch": 0.9658891504386904, "grad_norm": 0.39625829458236694, "learning_rate": 1.6952054794520549e-06, "loss": 1.6159, "step": 2821 }, { "epoch": 0.9662315429060561, "grad_norm": 0.2877281606197357, "learning_rate": 1.678082191780822e-06, "loss": 1.5522, "step": 2822 }, { "epoch": 0.9665739353734217, "grad_norm": 0.28351131081581116, "learning_rate": 1.6609589041095892e-06, "loss": 1.4727, "step": 2823 }, { "epoch": 0.9669163278407875, "grad_norm": 0.3078128397464752, "learning_rate": 1.6438356164383561e-06, "loss": 1.6248, "step": 2824 }, { "epoch": 0.9672587203081532, "grad_norm": 0.3224639892578125, "learning_rate": 1.6267123287671233e-06, "loss": 1.7006, "step": 2825 }, { "epoch": 0.967601112775519, "grad_norm": 0.30686017870903015, "learning_rate": 1.6095890410958904e-06, "loss": 1.607, "step": 2826 }, { "epoch": 0.9679435052428846, "grad_norm": 0.2671544551849365, "learning_rate": 1.5924657534246576e-06, "loss": 1.4287, "step": 2827 }, { "epoch": 0.9682858977102504, "grad_norm": 0.2887502610683441, "learning_rate": 1.5753424657534248e-06, "loss": 1.4763, "step": 2828 }, { "epoch": 0.9686282901776161, "grad_norm": 0.2882833182811737, "learning_rate": 1.558219178082192e-06, "loss": 1.6104, "step": 2829 }, { "epoch": 0.9689706826449818, "grad_norm": 0.30787554383277893, "learning_rate": 1.5410958904109589e-06, "loss": 1.5228, "step": 2830 }, { "epoch": 0.9693130751123475, "grad_norm": 0.2894248962402344, "learning_rate": 1.523972602739726e-06, "loss": 1.6363, "step": 2831 }, { "epoch": 0.9696554675797132, "grad_norm": 0.2889362573623657, "learning_rate": 1.5068493150684932e-06, "loss": 1.6068, "step": 2832 }, { "epoch": 0.969997860047079, "grad_norm": 0.29347530007362366, "learning_rate": 1.4897260273972603e-06, "loss": 1.5488, "step": 2833 }, { "epoch": 0.9703402525144447, "grad_norm": 0.296304315328598, "learning_rate": 1.4726027397260275e-06, "loss": 1.6095, "step": 2834 }, { "epoch": 0.9706826449818104, "grad_norm": 0.2944495677947998, "learning_rate": 1.4554794520547944e-06, "loss": 1.5626, "step": 2835 }, { "epoch": 0.9710250374491761, "grad_norm": 0.2979983389377594, "learning_rate": 1.4383561643835618e-06, "loss": 1.6336, "step": 2836 }, { "epoch": 0.9713674299165418, "grad_norm": 0.26393312215805054, "learning_rate": 1.421232876712329e-06, "loss": 1.547, "step": 2837 }, { "epoch": 0.9717098223839076, "grad_norm": 0.30240896344184875, "learning_rate": 1.404109589041096e-06, "loss": 1.6233, "step": 2838 }, { "epoch": 0.9720522148512732, "grad_norm": 0.29125770926475525, "learning_rate": 1.386986301369863e-06, "loss": 1.6343, "step": 2839 }, { "epoch": 0.972394607318639, "grad_norm": 0.3213525414466858, "learning_rate": 1.3698630136986302e-06, "loss": 1.498, "step": 2840 }, { "epoch": 0.9727369997860047, "grad_norm": 0.30886170268058777, "learning_rate": 1.3527397260273974e-06, "loss": 1.6084, "step": 2841 }, { "epoch": 0.9730793922533705, "grad_norm": 0.3134315311908722, "learning_rate": 1.3356164383561645e-06, "loss": 1.5984, "step": 2842 }, { "epoch": 0.9734217847207361, "grad_norm": 0.30619025230407715, "learning_rate": 1.3184931506849315e-06, "loss": 1.4443, "step": 2843 }, { "epoch": 0.9737641771881018, "grad_norm": 0.30619996786117554, "learning_rate": 1.3013698630136986e-06, "loss": 1.682, "step": 2844 }, { "epoch": 0.9741065696554676, "grad_norm": 0.2710875868797302, "learning_rate": 1.2842465753424658e-06, "loss": 1.6205, "step": 2845 }, { "epoch": 0.9744489621228333, "grad_norm": 0.2847137451171875, "learning_rate": 1.267123287671233e-06, "loss": 1.6093, "step": 2846 }, { "epoch": 0.974791354590199, "grad_norm": 0.288016676902771, "learning_rate": 1.25e-06, "loss": 1.6216, "step": 2847 }, { "epoch": 0.9751337470575647, "grad_norm": 0.2996850907802582, "learning_rate": 1.232876712328767e-06, "loss": 1.6109, "step": 2848 }, { "epoch": 0.9754761395249304, "grad_norm": 0.2736581861972809, "learning_rate": 1.2157534246575344e-06, "loss": 1.5127, "step": 2849 }, { "epoch": 0.9758185319922962, "grad_norm": 0.28846049308776855, "learning_rate": 1.1986301369863014e-06, "loss": 1.6142, "step": 2850 }, { "epoch": 0.9761609244596618, "grad_norm": 0.2850041687488556, "learning_rate": 1.1815068493150685e-06, "loss": 1.6379, "step": 2851 }, { "epoch": 0.9765033169270276, "grad_norm": 0.3020048141479492, "learning_rate": 1.1643835616438357e-06, "loss": 1.5938, "step": 2852 }, { "epoch": 0.9768457093943933, "grad_norm": 0.3152177035808563, "learning_rate": 1.1472602739726028e-06, "loss": 1.7127, "step": 2853 }, { "epoch": 0.9771881018617591, "grad_norm": 0.28462374210357666, "learning_rate": 1.13013698630137e-06, "loss": 1.504, "step": 2854 }, { "epoch": 0.9775304943291248, "grad_norm": 0.29239895939826965, "learning_rate": 1.113013698630137e-06, "loss": 1.6047, "step": 2855 }, { "epoch": 0.9778728867964904, "grad_norm": 0.2964165508747101, "learning_rate": 1.095890410958904e-06, "loss": 1.5538, "step": 2856 }, { "epoch": 0.9782152792638562, "grad_norm": 0.2737404406070709, "learning_rate": 1.0787671232876715e-06, "loss": 1.4698, "step": 2857 }, { "epoch": 0.9785576717312219, "grad_norm": 0.28189435601234436, "learning_rate": 1.0616438356164384e-06, "loss": 1.5314, "step": 2858 }, { "epoch": 0.9789000641985877, "grad_norm": 0.3094734847545624, "learning_rate": 1.0445205479452056e-06, "loss": 1.5666, "step": 2859 }, { "epoch": 0.9792424566659533, "grad_norm": 0.29508137702941895, "learning_rate": 1.0273972602739725e-06, "loss": 1.5654, "step": 2860 }, { "epoch": 0.9795848491333191, "grad_norm": 0.2806869149208069, "learning_rate": 1.0102739726027399e-06, "loss": 1.5997, "step": 2861 }, { "epoch": 0.9799272416006848, "grad_norm": 0.3201107978820801, "learning_rate": 9.93150684931507e-07, "loss": 1.5848, "step": 2862 }, { "epoch": 0.9802696340680505, "grad_norm": 0.29138365387916565, "learning_rate": 9.76027397260274e-07, "loss": 1.5457, "step": 2863 }, { "epoch": 0.9806120265354162, "grad_norm": 0.28376883268356323, "learning_rate": 9.589041095890411e-07, "loss": 1.6157, "step": 2864 }, { "epoch": 0.9809544190027819, "grad_norm": 0.3326626420021057, "learning_rate": 9.417808219178082e-07, "loss": 1.5945, "step": 2865 }, { "epoch": 0.9812968114701477, "grad_norm": 0.3152466118335724, "learning_rate": 9.246575342465754e-07, "loss": 1.5418, "step": 2866 }, { "epoch": 0.9816392039375134, "grad_norm": 0.3179073929786682, "learning_rate": 9.075342465753424e-07, "loss": 1.5315, "step": 2867 }, { "epoch": 0.9819815964048791, "grad_norm": 0.31360816955566406, "learning_rate": 8.904109589041097e-07, "loss": 1.5132, "step": 2868 }, { "epoch": 0.9823239888722448, "grad_norm": 0.3142441511154175, "learning_rate": 8.732876712328768e-07, "loss": 1.5734, "step": 2869 }, { "epoch": 0.9826663813396105, "grad_norm": 0.2786741256713867, "learning_rate": 8.561643835616439e-07, "loss": 1.5612, "step": 2870 }, { "epoch": 0.9830087738069763, "grad_norm": 0.3078691065311432, "learning_rate": 8.39041095890411e-07, "loss": 1.5676, "step": 2871 }, { "epoch": 0.9833511662743419, "grad_norm": 0.2907184660434723, "learning_rate": 8.219178082191781e-07, "loss": 1.5662, "step": 2872 }, { "epoch": 0.9836935587417077, "grad_norm": 0.34051617980003357, "learning_rate": 8.047945205479452e-07, "loss": 1.5431, "step": 2873 }, { "epoch": 0.9840359512090734, "grad_norm": 0.29864999651908875, "learning_rate": 7.876712328767124e-07, "loss": 1.6706, "step": 2874 }, { "epoch": 0.9843783436764392, "grad_norm": 0.289304256439209, "learning_rate": 7.705479452054794e-07, "loss": 1.5752, "step": 2875 }, { "epoch": 0.9847207361438048, "grad_norm": 0.3054288923740387, "learning_rate": 7.534246575342466e-07, "loss": 1.5427, "step": 2876 }, { "epoch": 0.9850631286111705, "grad_norm": 0.27923107147216797, "learning_rate": 7.363013698630137e-07, "loss": 1.5649, "step": 2877 }, { "epoch": 0.9854055210785363, "grad_norm": 0.3726564049720764, "learning_rate": 7.191780821917809e-07, "loss": 1.5623, "step": 2878 }, { "epoch": 0.985747913545902, "grad_norm": 0.34024250507354736, "learning_rate": 7.02054794520548e-07, "loss": 1.5664, "step": 2879 }, { "epoch": 0.9860903060132677, "grad_norm": 0.3235965073108673, "learning_rate": 6.849315068493151e-07, "loss": 1.3812, "step": 2880 }, { "epoch": 0.9864326984806334, "grad_norm": 0.3224627375602722, "learning_rate": 6.678082191780823e-07, "loss": 1.5567, "step": 2881 }, { "epoch": 0.9867750909479991, "grad_norm": 0.339074969291687, "learning_rate": 6.506849315068493e-07, "loss": 1.5848, "step": 2882 }, { "epoch": 0.9871174834153649, "grad_norm": 0.2804218530654907, "learning_rate": 6.335616438356165e-07, "loss": 1.5462, "step": 2883 }, { "epoch": 0.9874598758827305, "grad_norm": 0.28601133823394775, "learning_rate": 6.164383561643835e-07, "loss": 1.5537, "step": 2884 }, { "epoch": 0.9878022683500963, "grad_norm": 0.30150896310806274, "learning_rate": 5.993150684931507e-07, "loss": 1.6188, "step": 2885 }, { "epoch": 0.988144660817462, "grad_norm": 0.2948518991470337, "learning_rate": 5.821917808219178e-07, "loss": 1.5208, "step": 2886 }, { "epoch": 0.9884870532848278, "grad_norm": 0.27431830763816833, "learning_rate": 5.65068493150685e-07, "loss": 1.5488, "step": 2887 }, { "epoch": 0.9888294457521934, "grad_norm": 0.3042480945587158, "learning_rate": 5.47945205479452e-07, "loss": 1.4398, "step": 2888 }, { "epoch": 0.9891718382195591, "grad_norm": 0.27831271290779114, "learning_rate": 5.308219178082192e-07, "loss": 1.4978, "step": 2889 }, { "epoch": 0.9895142306869249, "grad_norm": 0.2754769027233124, "learning_rate": 5.136986301369863e-07, "loss": 1.562, "step": 2890 }, { "epoch": 0.9898566231542906, "grad_norm": 0.31028273701667786, "learning_rate": 4.965753424657535e-07, "loss": 1.5123, "step": 2891 }, { "epoch": 0.9901990156216564, "grad_norm": 0.2829802334308624, "learning_rate": 4.794520547945206e-07, "loss": 1.5438, "step": 2892 }, { "epoch": 0.990541408089022, "grad_norm": 0.29211699962615967, "learning_rate": 4.623287671232877e-07, "loss": 1.6345, "step": 2893 }, { "epoch": 0.9908838005563878, "grad_norm": 0.29141947627067566, "learning_rate": 4.452054794520548e-07, "loss": 1.6275, "step": 2894 }, { "epoch": 0.9912261930237535, "grad_norm": 0.2724537253379822, "learning_rate": 4.2808219178082193e-07, "loss": 1.5524, "step": 2895 }, { "epoch": 0.9915685854911191, "grad_norm": 0.2882165312767029, "learning_rate": 4.1095890410958903e-07, "loss": 1.5367, "step": 2896 }, { "epoch": 0.9919109779584849, "grad_norm": 0.4410623610019684, "learning_rate": 3.938356164383562e-07, "loss": 1.4347, "step": 2897 }, { "epoch": 0.9922533704258506, "grad_norm": 0.28598830103874207, "learning_rate": 3.767123287671233e-07, "loss": 1.5862, "step": 2898 }, { "epoch": 0.9925957628932164, "grad_norm": 0.29110944271087646, "learning_rate": 3.5958904109589045e-07, "loss": 1.5844, "step": 2899 }, { "epoch": 0.992938155360582, "grad_norm": 0.299927681684494, "learning_rate": 3.4246575342465755e-07, "loss": 1.5862, "step": 2900 }, { "epoch": 0.9932805478279478, "grad_norm": 0.2893484830856323, "learning_rate": 3.2534246575342466e-07, "loss": 1.4934, "step": 2901 }, { "epoch": 0.9936229402953135, "grad_norm": 0.2884754240512848, "learning_rate": 3.0821917808219176e-07, "loss": 1.4821, "step": 2902 }, { "epoch": 0.9939653327626792, "grad_norm": 0.28259897232055664, "learning_rate": 2.910958904109589e-07, "loss": 1.5506, "step": 2903 }, { "epoch": 0.994307725230045, "grad_norm": 0.2988041639328003, "learning_rate": 2.73972602739726e-07, "loss": 1.5451, "step": 2904 }, { "epoch": 0.9946501176974106, "grad_norm": 0.2989497482776642, "learning_rate": 2.568493150684931e-07, "loss": 1.576, "step": 2905 }, { "epoch": 0.9949925101647764, "grad_norm": 0.3193722069263458, "learning_rate": 2.397260273972603e-07, "loss": 1.6118, "step": 2906 }, { "epoch": 0.9953349026321421, "grad_norm": 0.37605801224708557, "learning_rate": 2.226027397260274e-07, "loss": 1.6128, "step": 2907 }, { "epoch": 0.9956772950995079, "grad_norm": 0.2884008288383484, "learning_rate": 2.0547945205479452e-07, "loss": 1.5184, "step": 2908 }, { "epoch": 0.9960196875668735, "grad_norm": 0.29533830285072327, "learning_rate": 1.8835616438356165e-07, "loss": 1.7174, "step": 2909 }, { "epoch": 0.9963620800342392, "grad_norm": 0.27518898248672485, "learning_rate": 1.7123287671232878e-07, "loss": 1.6293, "step": 2910 }, { "epoch": 0.996704472501605, "grad_norm": 0.2642136812210083, "learning_rate": 1.5410958904109588e-07, "loss": 1.6004, "step": 2911 }, { "epoch": 0.9970468649689707, "grad_norm": 0.2958361506462097, "learning_rate": 1.36986301369863e-07, "loss": 1.5168, "step": 2912 }, { "epoch": 0.9973892574363364, "grad_norm": 0.3094807267189026, "learning_rate": 1.1986301369863014e-07, "loss": 1.5179, "step": 2913 }, { "epoch": 0.9977316499037021, "grad_norm": 0.2904055118560791, "learning_rate": 1.0273972602739726e-07, "loss": 1.5105, "step": 2914 }, { "epoch": 0.9980740423710678, "grad_norm": 0.3297709822654724, "learning_rate": 8.561643835616439e-08, "loss": 1.6464, "step": 2915 }, { "epoch": 0.9984164348384336, "grad_norm": 0.31224361062049866, "learning_rate": 6.84931506849315e-08, "loss": 1.6104, "step": 2916 }, { "epoch": 0.9987588273057992, "grad_norm": 0.30012333393096924, "learning_rate": 5.136986301369863e-08, "loss": 1.686, "step": 2917 }, { "epoch": 0.999101219773165, "grad_norm": 0.30467861890792847, "learning_rate": 3.424657534246575e-08, "loss": 1.5349, "step": 2918 }, { "epoch": 0.9994436122405307, "grad_norm": 0.2731279730796814, "learning_rate": 1.7123287671232876e-08, "loss": 1.6384, "step": 2919 }, { "epoch": 0.9997860047078965, "grad_norm": 0.2979719042778015, "learning_rate": 0.0, "loss": 1.6224, "step": 2920 } ], "logging_steps": 1, "max_steps": 2920, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.50394165039479e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }