{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.09586961732044419,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00031956539106814733,
      "grad_norm": 5.807275295257568,
      "learning_rate": 0.0,
      "loss": 5.0454,
      "step": 1
    },
    {
      "epoch": 0.0006391307821362947,
      "grad_norm": 6.20149564743042,
      "learning_rate": 6.369426751592357e-07,
      "loss": 5.1424,
      "step": 2
    },
    {
      "epoch": 0.0012782615642725893,
      "grad_norm": 5.7567291259765625,
      "learning_rate": 1.910828025477707e-06,
      "loss": 5.0835,
      "step": 4
    },
    {
      "epoch": 0.001917392346408884,
      "grad_norm": 5.017141819000244,
      "learning_rate": 3.1847133757961785e-06,
      "loss": 5.0733,
      "step": 6
    },
    {
      "epoch": 0.0025565231285451786,
      "grad_norm": 3.2059810161590576,
      "learning_rate": 4.45859872611465e-06,
      "loss": 5.0714,
      "step": 8
    },
    {
      "epoch": 0.003195653910681473,
      "grad_norm": 6.303244113922119,
      "learning_rate": 5.732484076433121e-06,
      "loss": 4.9915,
      "step": 10
    },
    {
      "epoch": 0.003834784692817768,
      "grad_norm": 4.852840423583984,
      "learning_rate": 7.006369426751593e-06,
      "loss": 4.9307,
      "step": 12
    },
    {
      "epoch": 0.004473915474954062,
      "grad_norm": 3.78067946434021,
      "learning_rate": 8.280254777070064e-06,
      "loss": 4.8924,
      "step": 14
    },
    {
      "epoch": 0.005113046257090357,
      "grad_norm": 3.5641331672668457,
      "learning_rate": 9.554140127388536e-06,
      "loss": 4.8244,
      "step": 16
    },
    {
      "epoch": 0.005752177039226652,
      "grad_norm": 2.191957712173462,
      "learning_rate": 1.0828025477707008e-05,
      "loss": 4.6179,
      "step": 18
    },
    {
      "epoch": 0.006391307821362946,
      "grad_norm": 2.0675458908081055,
      "learning_rate": 1.2101910828025478e-05,
      "loss": 4.5827,
      "step": 20
    },
    {
      "epoch": 0.007030438603499241,
      "grad_norm": 1.6559146642684937,
      "learning_rate": 1.337579617834395e-05,
      "loss": 4.4544,
      "step": 22
    },
    {
      "epoch": 0.007669569385635536,
      "grad_norm": 1.3731284141540527,
      "learning_rate": 1.464968152866242e-05,
      "loss": 4.3748,
      "step": 24
    },
    {
      "epoch": 0.00830870016777183,
      "grad_norm": 1.3962030410766602,
      "learning_rate": 1.592356687898089e-05,
      "loss": 4.3269,
      "step": 26
    },
    {
      "epoch": 0.008947830949908125,
      "grad_norm": 1.2659896612167358,
      "learning_rate": 1.7197452229299362e-05,
      "loss": 4.2133,
      "step": 28
    },
    {
      "epoch": 0.00958696173204442,
      "grad_norm": 0.9881806373596191,
      "learning_rate": 1.8471337579617834e-05,
      "loss": 4.0961,
      "step": 30
    },
    {
      "epoch": 0.010226092514180714,
      "grad_norm": 0.9945515394210815,
      "learning_rate": 1.974522292993631e-05,
      "loss": 4.0158,
      "step": 32
    },
    {
      "epoch": 0.01086522329631701,
      "grad_norm": 0.9396588802337646,
      "learning_rate": 2.1019108280254778e-05,
      "loss": 3.8763,
      "step": 34
    },
    {
      "epoch": 0.011504354078453304,
      "grad_norm": 1.0665779113769531,
      "learning_rate": 2.229299363057325e-05,
      "loss": 3.8635,
      "step": 36
    },
    {
      "epoch": 0.012143484860589597,
      "grad_norm": 1.077245831489563,
      "learning_rate": 2.356687898089172e-05,
      "loss": 3.802,
      "step": 38
    },
    {
      "epoch": 0.012782615642725892,
      "grad_norm": 0.8040191531181335,
      "learning_rate": 2.4840764331210193e-05,
      "loss": 3.7284,
      "step": 40
    },
    {
      "epoch": 0.013421746424862187,
      "grad_norm": 1.4325759410858154,
      "learning_rate": 2.6114649681528662e-05,
      "loss": 3.665,
      "step": 42
    },
    {
      "epoch": 0.014060877206998482,
      "grad_norm": 1.3450332880020142,
      "learning_rate": 2.7388535031847134e-05,
      "loss": 3.6242,
      "step": 44
    },
    {
      "epoch": 0.014700007989134777,
      "grad_norm": 0.8203895688056946,
      "learning_rate": 2.8662420382165606e-05,
      "loss": 3.5576,
      "step": 46
    },
    {
      "epoch": 0.015339138771271072,
      "grad_norm": 1.1661335229873657,
      "learning_rate": 2.9936305732484078e-05,
      "loss": 3.522,
      "step": 48
    },
    {
      "epoch": 0.015978269553407365,
      "grad_norm": 1.0148671865463257,
      "learning_rate": 3.121019108280255e-05,
      "loss": 3.4594,
      "step": 50
    },
    {
      "epoch": 0.01661740033554366,
      "grad_norm": 0.6624857187271118,
      "learning_rate": 3.248407643312102e-05,
      "loss": 3.465,
      "step": 52
    },
    {
      "epoch": 0.017256531117679955,
      "grad_norm": 0.943125307559967,
      "learning_rate": 3.375796178343949e-05,
      "loss": 3.4021,
      "step": 54
    },
    {
      "epoch": 0.01789566189981625,
      "grad_norm": 0.9854550957679749,
      "learning_rate": 3.503184713375796e-05,
      "loss": 3.3361,
      "step": 56
    },
    {
      "epoch": 0.018534792681952544,
      "grad_norm": 1.2242411375045776,
      "learning_rate": 3.630573248407643e-05,
      "loss": 3.3283,
      "step": 58
    },
    {
      "epoch": 0.01917392346408884,
      "grad_norm": 0.9556372761726379,
      "learning_rate": 3.7579617834394906e-05,
      "loss": 3.2914,
      "step": 60
    },
    {
      "epoch": 0.019813054246225134,
      "grad_norm": 1.3133809566497803,
      "learning_rate": 3.885350318471338e-05,
      "loss": 3.3126,
      "step": 62
    },
    {
      "epoch": 0.02045218502836143,
      "grad_norm": 0.9322234392166138,
      "learning_rate": 4.012738853503185e-05,
      "loss": 3.2443,
      "step": 64
    },
    {
      "epoch": 0.021091315810497724,
      "grad_norm": 1.4383481740951538,
      "learning_rate": 4.1401273885350325e-05,
      "loss": 3.2428,
      "step": 66
    },
    {
      "epoch": 0.02173044659263402,
      "grad_norm": 1.0156841278076172,
      "learning_rate": 4.267515923566879e-05,
      "loss": 3.1735,
      "step": 68
    },
    {
      "epoch": 0.022369577374770314,
      "grad_norm": 1.1754450798034668,
      "learning_rate": 4.394904458598726e-05,
      "loss": 3.1788,
      "step": 70
    },
    {
      "epoch": 0.02300870815690661,
      "grad_norm": 1.0960084199905396,
      "learning_rate": 4.522292993630574e-05,
      "loss": 3.1963,
      "step": 72
    },
    {
      "epoch": 0.023647838939042903,
      "grad_norm": 1.054401159286499,
      "learning_rate": 4.6496815286624206e-05,
      "loss": 3.1604,
      "step": 74
    },
    {
      "epoch": 0.024286969721179195,
      "grad_norm": 1.1957581043243408,
      "learning_rate": 4.777070063694268e-05,
      "loss": 3.1648,
      "step": 76
    },
    {
      "epoch": 0.02492610050331549,
      "grad_norm": 0.7756203413009644,
      "learning_rate": 4.904458598726115e-05,
      "loss": 3.1066,
      "step": 78
    },
    {
      "epoch": 0.025565231285451784,
      "grad_norm": 1.0459190607070923,
      "learning_rate": 5.031847133757962e-05,
      "loss": 3.1571,
      "step": 80
    },
    {
      "epoch": 0.02620436206758808,
      "grad_norm": 0.9746761322021484,
      "learning_rate": 5.159235668789809e-05,
      "loss": 3.1026,
      "step": 82
    },
    {
      "epoch": 0.026843492849724374,
      "grad_norm": 1.0770882368087769,
      "learning_rate": 5.286624203821656e-05,
      "loss": 3.1125,
      "step": 84
    },
    {
      "epoch": 0.02748262363186067,
      "grad_norm": 0.9542138576507568,
      "learning_rate": 5.414012738853504e-05,
      "loss": 3.059,
      "step": 86
    },
    {
      "epoch": 0.028121754413996964,
      "grad_norm": 1.3454134464263916,
      "learning_rate": 5.5414012738853505e-05,
      "loss": 3.0645,
      "step": 88
    },
    {
      "epoch": 0.02876088519613326,
      "grad_norm": 1.0354089736938477,
      "learning_rate": 5.6687898089171974e-05,
      "loss": 3.04,
      "step": 90
    },
    {
      "epoch": 0.029400015978269554,
      "grad_norm": 1.1339548826217651,
      "learning_rate": 5.796178343949045e-05,
      "loss": 3.0625,
      "step": 92
    },
    {
      "epoch": 0.03003914676040585,
      "grad_norm": 1.200062870979309,
      "learning_rate": 5.923566878980892e-05,
      "loss": 3.057,
      "step": 94
    },
    {
      "epoch": 0.030678277542542143,
      "grad_norm": 1.395698070526123,
      "learning_rate": 6.0509554140127386e-05,
      "loss": 3.0341,
      "step": 96
    },
    {
      "epoch": 0.031317408324678435,
      "grad_norm": 0.9392653703689575,
      "learning_rate": 6.178343949044585e-05,
      "loss": 3.0087,
      "step": 98
    },
    {
      "epoch": 0.03195653910681473,
      "grad_norm": 1.1301568746566772,
      "learning_rate": 6.305732484076433e-05,
      "loss": 3.0294,
      "step": 100
    },
    {
      "epoch": 0.032595669888951025,
      "grad_norm": 0.9571443796157837,
      "learning_rate": 6.43312101910828e-05,
      "loss": 3.0522,
      "step": 102
    },
    {
      "epoch": 0.03323480067108732,
      "grad_norm": 0.9494081735610962,
      "learning_rate": 6.560509554140127e-05,
      "loss": 3.0012,
      "step": 104
    },
    {
      "epoch": 0.033873931453223614,
      "grad_norm": 1.3672889471054077,
      "learning_rate": 6.687898089171974e-05,
      "loss": 3.0188,
      "step": 106
    },
    {
      "epoch": 0.03451306223535991,
      "grad_norm": 1.2122056484222412,
      "learning_rate": 6.815286624203822e-05,
      "loss": 2.9497,
      "step": 108
    },
    {
      "epoch": 0.035152193017496204,
      "grad_norm": 1.2184698581695557,
      "learning_rate": 6.942675159235669e-05,
      "loss": 2.9739,
      "step": 110
    },
    {
      "epoch": 0.0357913237996325,
      "grad_norm": 1.09404456615448,
      "learning_rate": 7.070063694267515e-05,
      "loss": 3.0241,
      "step": 112
    },
    {
      "epoch": 0.036430454581768794,
      "grad_norm": 1.1653715372085571,
      "learning_rate": 7.197452229299363e-05,
      "loss": 2.9606,
      "step": 114
    },
    {
      "epoch": 0.03706958536390509,
      "grad_norm": 1.050194501876831,
      "learning_rate": 7.32484076433121e-05,
      "loss": 2.9582,
      "step": 116
    },
    {
      "epoch": 0.037708716146041384,
      "grad_norm": 1.1262322664260864,
      "learning_rate": 7.452229299363057e-05,
      "loss": 2.9462,
      "step": 118
    },
    {
      "epoch": 0.03834784692817768,
      "grad_norm": 1.1232227087020874,
      "learning_rate": 7.579617834394906e-05,
      "loss": 2.9784,
      "step": 120
    },
    {
      "epoch": 0.03898697771031397,
      "grad_norm": 0.9088072776794434,
      "learning_rate": 7.707006369426753e-05,
      "loss": 2.944,
      "step": 122
    },
    {
      "epoch": 0.03962610849245027,
      "grad_norm": 0.8985419869422913,
      "learning_rate": 7.834394904458599e-05,
      "loss": 2.9003,
      "step": 124
    },
    {
      "epoch": 0.04026523927458656,
      "grad_norm": 1.2419854402542114,
      "learning_rate": 7.961783439490447e-05,
      "loss": 2.9753,
      "step": 126
    },
    {
      "epoch": 0.04090437005672286,
      "grad_norm": 1.4533154964447021,
      "learning_rate": 8.089171974522294e-05,
      "loss": 2.9069,
      "step": 128
    },
    {
      "epoch": 0.04154350083885915,
      "grad_norm": 1.475258231163025,
      "learning_rate": 8.21656050955414e-05,
      "loss": 2.9402,
      "step": 130
    },
    {
      "epoch": 0.04218263162099545,
      "grad_norm": 1.0348827838897705,
      "learning_rate": 8.343949044585988e-05,
      "loss": 2.9295,
      "step": 132
    },
    {
      "epoch": 0.04282176240313174,
      "grad_norm": 0.9143719673156738,
      "learning_rate": 8.471337579617836e-05,
      "loss": 2.9408,
      "step": 134
    },
    {
      "epoch": 0.04346089318526804,
      "grad_norm": 1.1310492753982544,
      "learning_rate": 8.598726114649682e-05,
      "loss": 2.875,
      "step": 136
    },
    {
      "epoch": 0.04410002396740433,
      "grad_norm": 1.0483386516571045,
      "learning_rate": 8.726114649681529e-05,
      "loss": 2.9142,
      "step": 138
    },
    {
      "epoch": 0.04473915474954063,
      "grad_norm": 0.921519935131073,
      "learning_rate": 8.853503184713377e-05,
      "loss": 2.9188,
      "step": 140
    },
    {
      "epoch": 0.04537828553167692,
      "grad_norm": 1.3271907567977905,
      "learning_rate": 8.980891719745223e-05,
      "loss": 2.9075,
      "step": 142
    },
    {
      "epoch": 0.04601741631381322,
      "grad_norm": 1.7488983869552612,
      "learning_rate": 9.10828025477707e-05,
      "loss": 2.9201,
      "step": 144
    },
    {
      "epoch": 0.04665654709594951,
      "grad_norm": 1.4263213872909546,
      "learning_rate": 9.235668789808918e-05,
      "loss": 2.9045,
      "step": 146
    },
    {
      "epoch": 0.04729567787808581,
      "grad_norm": 0.8777288794517517,
      "learning_rate": 9.363057324840766e-05,
      "loss": 2.8959,
      "step": 148
    },
    {
      "epoch": 0.047934808660222095,
      "grad_norm": 1.3402196168899536,
      "learning_rate": 9.490445859872612e-05,
      "loss": 2.8893,
      "step": 150
    },
    {
      "epoch": 0.04857393944235839,
      "grad_norm": 1.0943351984024048,
      "learning_rate": 9.617834394904459e-05,
      "loss": 2.9137,
      "step": 152
    },
    {
      "epoch": 0.049213070224494684,
      "grad_norm": 1.0603907108306885,
      "learning_rate": 9.745222929936307e-05,
      "loss": 2.8677,
      "step": 154
    },
    {
      "epoch": 0.04985220100663098,
      "grad_norm": 1.010772705078125,
      "learning_rate": 9.872611464968153e-05,
      "loss": 2.8374,
      "step": 156
    },
    {
      "epoch": 0.050491331788767274,
      "grad_norm": 1.2628934383392334,
      "learning_rate": 0.0001,
      "loss": 2.9009,
      "step": 158
    },
    {
      "epoch": 0.05113046257090357,
      "grad_norm": 1.146183729171753,
      "learning_rate": 9.999988833687822e-05,
      "loss": 2.8633,
      "step": 160
    },
    {
      "epoch": 0.051769593353039864,
      "grad_norm": 0.8704808354377747,
      "learning_rate": 9.99995533480116e-05,
      "loss": 2.8464,
      "step": 162
    },
    {
      "epoch": 0.05240872413517616,
      "grad_norm": 1.044418454170227,
      "learning_rate": 9.999899503489641e-05,
      "loss": 2.8695,
      "step": 164
    },
    {
      "epoch": 0.053047854917312454,
      "grad_norm": 0.833791196346283,
      "learning_rate": 9.999821340002636e-05,
      "loss": 2.8605,
      "step": 166
    },
    {
      "epoch": 0.05368698569944875,
      "grad_norm": 0.922815203666687,
      "learning_rate": 9.99972084468926e-05,
      "loss": 2.8737,
      "step": 168
    },
    {
      "epoch": 0.05432611648158504,
      "grad_norm": 0.9120809435844421,
      "learning_rate": 9.999598017998384e-05,
      "loss": 2.8753,
      "step": 170
    },
    {
      "epoch": 0.05496524726372134,
      "grad_norm": 1.0272431373596191,
      "learning_rate": 9.999452860478611e-05,
      "loss": 2.8907,
      "step": 172
    },
    {
      "epoch": 0.05560437804585763,
      "grad_norm": 0.7777165174484253,
      "learning_rate": 9.999285372778295e-05,
      "loss": 2.8517,
      "step": 174
    },
    {
      "epoch": 0.05624350882799393,
      "grad_norm": 0.7110999822616577,
      "learning_rate": 9.999095555645523e-05,
      "loss": 2.8211,
      "step": 176
    },
    {
      "epoch": 0.05688263961013022,
      "grad_norm": 0.7857067584991455,
      "learning_rate": 9.998883409928117e-05,
      "loss": 2.8463,
      "step": 178
    },
    {
      "epoch": 0.05752177039226652,
      "grad_norm": 0.8582798838615417,
      "learning_rate": 9.998648936573629e-05,
      "loss": 2.8197,
      "step": 180
    },
    {
      "epoch": 0.05816090117440281,
      "grad_norm": 0.9790541529655457,
      "learning_rate": 9.998392136629345e-05,
      "loss": 2.8193,
      "step": 182
    },
    {
      "epoch": 0.05880003195653911,
      "grad_norm": 1.1599719524383545,
      "learning_rate": 9.998113011242264e-05,
      "loss": 2.8206,
      "step": 184
    },
    {
      "epoch": 0.0594391627386754,
      "grad_norm": 0.8326631188392639,
      "learning_rate": 9.99781156165911e-05,
      "loss": 2.8349,
      "step": 186
    },
    {
      "epoch": 0.0600782935208117,
      "grad_norm": 0.8876377940177917,
      "learning_rate": 9.997487789226312e-05,
      "loss": 2.8225,
      "step": 188
    },
    {
      "epoch": 0.06071742430294799,
      "grad_norm": 0.9899202585220337,
      "learning_rate": 9.997141695390009e-05,
      "loss": 2.7875,
      "step": 190
    },
    {
      "epoch": 0.06135655508508429,
      "grad_norm": 1.0686557292938232,
      "learning_rate": 9.996773281696037e-05,
      "loss": 2.8024,
      "step": 192
    },
    {
      "epoch": 0.06199568586722058,
      "grad_norm": 0.8899752497673035,
      "learning_rate": 9.996382549789926e-05,
      "loss": 2.8225,
      "step": 194
    },
    {
      "epoch": 0.06263481664935687,
      "grad_norm": 0.7781797647476196,
      "learning_rate": 9.995969501416891e-05,
      "loss": 2.8046,
      "step": 196
    },
    {
      "epoch": 0.06327394743149317,
      "grad_norm": 0.6428512930870056,
      "learning_rate": 9.995534138421818e-05,
      "loss": 2.7693,
      "step": 198
    },
    {
      "epoch": 0.06391307821362946,
      "grad_norm": 0.7047809958457947,
      "learning_rate": 9.995076462749273e-05,
      "loss": 2.766,
      "step": 200
    },
    {
      "epoch": 0.06455220899576576,
      "grad_norm": 0.6256312131881714,
      "learning_rate": 9.99459647644347e-05,
      "loss": 2.8071,
      "step": 202
    },
    {
      "epoch": 0.06519133977790205,
      "grad_norm": 0.699400007724762,
      "learning_rate": 9.994094181648283e-05,
      "loss": 2.8347,
      "step": 204
    },
    {
      "epoch": 0.06583047056003835,
      "grad_norm": 0.7256817817687988,
      "learning_rate": 9.993569580607225e-05,
      "loss": 2.8074,
      "step": 206
    },
    {
      "epoch": 0.06646960134217464,
      "grad_norm": 0.573846161365509,
      "learning_rate": 9.993022675663437e-05,
      "loss": 2.7413,
      "step": 208
    },
    {
      "epoch": 0.06710873212431094,
      "grad_norm": 0.7314406037330627,
      "learning_rate": 9.992453469259685e-05,
      "loss": 2.7983,
      "step": 210
    },
    {
      "epoch": 0.06774786290644723,
      "grad_norm": 0.7307546734809875,
      "learning_rate": 9.991861963938342e-05,
      "loss": 2.8026,
      "step": 212
    },
    {
      "epoch": 0.06838699368858353,
      "grad_norm": 0.6367102861404419,
      "learning_rate": 9.991248162341384e-05,
      "loss": 2.7424,
      "step": 214
    },
    {
      "epoch": 0.06902612447071982,
      "grad_norm": 0.8630378246307373,
      "learning_rate": 9.99061206721037e-05,
      "loss": 2.7395,
      "step": 216
    },
    {
      "epoch": 0.06966525525285612,
      "grad_norm": 0.7586290240287781,
      "learning_rate": 9.989953681386433e-05,
      "loss": 2.7624,
      "step": 218
    },
    {
      "epoch": 0.07030438603499241,
      "grad_norm": 0.7091168761253357,
      "learning_rate": 9.989273007810271e-05,
      "loss": 2.7719,
      "step": 220
    },
    {
      "epoch": 0.07094351681712871,
      "grad_norm": 0.684183657169342,
      "learning_rate": 9.98857004952213e-05,
      "loss": 2.7806,
      "step": 222
    },
    {
      "epoch": 0.071582647599265,
      "grad_norm": 0.920498788356781,
      "learning_rate": 9.987844809661791e-05,
      "loss": 2.7626,
      "step": 224
    },
    {
      "epoch": 0.0722217783814013,
      "grad_norm": 0.730060875415802,
      "learning_rate": 9.987097291468552e-05,
      "loss": 2.8107,
      "step": 226
    },
    {
      "epoch": 0.07286090916353759,
      "grad_norm": 0.8606828451156616,
      "learning_rate": 9.986327498281227e-05,
      "loss": 2.7814,
      "step": 228
    },
    {
      "epoch": 0.07350003994567389,
      "grad_norm": 0.8068298101425171,
      "learning_rate": 9.985535433538113e-05,
      "loss": 2.7775,
      "step": 230
    },
    {
      "epoch": 0.07413917072781018,
      "grad_norm": 0.6887542009353638,
      "learning_rate": 9.984721100776989e-05,
      "loss": 2.784,
      "step": 232
    },
    {
      "epoch": 0.07477830150994648,
      "grad_norm": 0.84773850440979,
      "learning_rate": 9.98388450363509e-05,
      "loss": 2.7333,
      "step": 234
    },
    {
      "epoch": 0.07541743229208277,
      "grad_norm": 0.7914923429489136,
      "learning_rate": 9.9830256458491e-05,
      "loss": 2.7363,
      "step": 236
    },
    {
      "epoch": 0.07605656307421906,
      "grad_norm": 0.8284217715263367,
      "learning_rate": 9.982144531255127e-05,
      "loss": 2.7389,
      "step": 238
    },
    {
      "epoch": 0.07669569385635536,
      "grad_norm": 0.7706480622291565,
      "learning_rate": 9.981241163788694e-05,
      "loss": 2.7377,
      "step": 240
    },
    {
      "epoch": 0.07733482463849164,
      "grad_norm": 0.6147120594978333,
      "learning_rate": 9.980315547484711e-05,
      "loss": 2.7862,
      "step": 242
    },
    {
      "epoch": 0.07797395542062795,
      "grad_norm": 0.6364494562149048,
      "learning_rate": 9.979367686477469e-05,
      "loss": 2.762,
      "step": 244
    },
    {
      "epoch": 0.07861308620276423,
      "grad_norm": 0.6944818496704102,
      "learning_rate": 9.978397585000611e-05,
      "loss": 2.7624,
      "step": 246
    },
    {
      "epoch": 0.07925221698490054,
      "grad_norm": 1.2648204565048218,
      "learning_rate": 9.977405247387119e-05,
      "loss": 2.7544,
      "step": 248
    },
    {
      "epoch": 0.07989134776703682,
      "grad_norm": 1.0054659843444824,
      "learning_rate": 9.976390678069295e-05,
      "loss": 2.7523,
      "step": 250
    },
    {
      "epoch": 0.08053047854917313,
      "grad_norm": 0.715492308139801,
      "learning_rate": 9.975353881578738e-05,
      "loss": 2.7341,
      "step": 252
    },
    {
      "epoch": 0.08116960933130941,
      "grad_norm": 0.7963582277297974,
      "learning_rate": 9.974294862546325e-05,
      "loss": 2.7484,
      "step": 254
    },
    {
      "epoch": 0.08180874011344572,
      "grad_norm": 0.7069251537322998,
      "learning_rate": 9.97321362570219e-05,
      "loss": 2.7719,
      "step": 256
    },
    {
      "epoch": 0.082447870895582,
      "grad_norm": 0.5716209411621094,
      "learning_rate": 9.972110175875706e-05,
      "loss": 2.8079,
      "step": 258
    },
    {
      "epoch": 0.0830870016777183,
      "grad_norm": 0.65562903881073,
      "learning_rate": 9.970984517995456e-05,
      "loss": 2.7642,
      "step": 260
    },
    {
      "epoch": 0.0837261324598546,
      "grad_norm": 0.647085964679718,
      "learning_rate": 9.969836657089225e-05,
      "loss": 2.7139,
      "step": 262
    },
    {
      "epoch": 0.0843652632419909,
      "grad_norm": 0.6401609778404236,
      "learning_rate": 9.968666598283955e-05,
      "loss": 2.7278,
      "step": 264
    },
    {
      "epoch": 0.08500439402412718,
      "grad_norm": 0.5514021515846252,
      "learning_rate": 9.967474346805746e-05,
      "loss": 2.7332,
      "step": 266
    },
    {
      "epoch": 0.08564352480626349,
      "grad_norm": 0.5908826589584351,
      "learning_rate": 9.96625990797982e-05,
      "loss": 2.741,
      "step": 268
    },
    {
      "epoch": 0.08628265558839977,
      "grad_norm": 0.5510653853416443,
      "learning_rate": 9.965023287230497e-05,
      "loss": 2.7025,
      "step": 270
    },
    {
      "epoch": 0.08692178637053607,
      "grad_norm": 0.5656317472457886,
      "learning_rate": 9.963764490081176e-05,
      "loss": 2.7184,
      "step": 272
    },
    {
      "epoch": 0.08756091715267236,
      "grad_norm": 0.5132441520690918,
      "learning_rate": 9.962483522154302e-05,
      "loss": 2.7632,
      "step": 274
    },
    {
      "epoch": 0.08820004793480866,
      "grad_norm": 0.6730588674545288,
      "learning_rate": 9.961180389171352e-05,
      "loss": 2.7705,
      "step": 276
    },
    {
      "epoch": 0.08883917871694495,
      "grad_norm": 0.5657472610473633,
      "learning_rate": 9.959855096952804e-05,
      "loss": 2.7191,
      "step": 278
    },
    {
      "epoch": 0.08947830949908125,
      "grad_norm": 0.8265955448150635,
      "learning_rate": 9.958507651418106e-05,
      "loss": 2.7718,
      "step": 280
    },
    {
      "epoch": 0.09011744028121754,
      "grad_norm": 0.8996996879577637,
      "learning_rate": 9.957138058585658e-05,
      "loss": 2.7124,
      "step": 282
    },
    {
      "epoch": 0.09075657106335384,
      "grad_norm": 0.6458889842033386,
      "learning_rate": 9.955746324572781e-05,
      "loss": 2.7403,
      "step": 284
    },
    {
      "epoch": 0.09139570184549013,
      "grad_norm": 0.7175470590591431,
      "learning_rate": 9.954332455595689e-05,
      "loss": 2.7188,
      "step": 286
    },
    {
      "epoch": 0.09203483262762643,
      "grad_norm": 0.6640183329582214,
      "learning_rate": 9.952896457969463e-05,
      "loss": 2.7223,
      "step": 288
    },
    {
      "epoch": 0.09267396340976272,
      "grad_norm": 0.6551202535629272,
      "learning_rate": 9.951438338108022e-05,
      "loss": 2.7189,
      "step": 290
    },
    {
      "epoch": 0.09331309419189902,
      "grad_norm": 0.6980673670768738,
      "learning_rate": 9.949958102524093e-05,
      "loss": 2.7183,
      "step": 292
    },
    {
      "epoch": 0.09395222497403531,
      "grad_norm": 0.5926324129104614,
      "learning_rate": 9.948455757829187e-05,
      "loss": 2.7476,
      "step": 294
    },
    {
      "epoch": 0.09459135575617161,
      "grad_norm": 0.5434746742248535,
      "learning_rate": 9.946931310733565e-05,
      "loss": 2.7368,
      "step": 296
    },
    {
      "epoch": 0.0952304865383079,
      "grad_norm": 0.6466372609138489,
      "learning_rate": 9.945384768046206e-05,
      "loss": 2.7307,
      "step": 298
    },
    {
      "epoch": 0.09586961732044419,
      "grad_norm": 0.6376985311508179,
      "learning_rate": 9.943816136674782e-05,
      "loss": 2.7239,
      "step": 300
    }
  ],
  "logging_steps": 2,
  "max_steps": 3130,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.377550336196608e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}