{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.12380706732009286, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00041269022440030954, "grad_norm": 5.357641696929932, "learning_rate": 0.0, "loss": 6.1189, "step": 1 }, { "epoch": 0.0008253804488006191, "grad_norm": 5.467103004455566, "learning_rate": 8.19672131147541e-07, "loss": 6.1624, "step": 2 }, { "epoch": 0.0016507608976012382, "grad_norm": 5.270254611968994, "learning_rate": 2.459016393442623e-06, "loss": 6.1301, "step": 4 }, { "epoch": 0.002476141346401857, "grad_norm": 4.018209457397461, "learning_rate": 4.098360655737704e-06, "loss": 6.1305, "step": 6 }, { "epoch": 0.0033015217952024763, "grad_norm": 3.973541259765625, "learning_rate": 5.737704918032787e-06, "loss": 6.0095, "step": 8 }, { "epoch": 0.0041269022440030955, "grad_norm": 3.5741689205169678, "learning_rate": 7.3770491803278695e-06, "loss": 5.9256, "step": 10 }, { "epoch": 0.004952282692803714, "grad_norm": 4.449726581573486, "learning_rate": 9.016393442622952e-06, "loss": 5.8059, "step": 12 }, { "epoch": 0.005777663141604333, "grad_norm": 4.23346471786499, "learning_rate": 1.0655737704918032e-05, "loss": 5.6952, "step": 14 }, { "epoch": 0.006603043590404953, "grad_norm": 2.9237682819366455, "learning_rate": 1.2295081967213116e-05, "loss": 5.5323, "step": 16 }, { "epoch": 0.007428424039205571, "grad_norm": 2.1714608669281006, "learning_rate": 1.3934426229508196e-05, "loss": 5.3692, "step": 18 }, { "epoch": 0.008253804488006191, "grad_norm": 1.9534616470336914, "learning_rate": 1.557377049180328e-05, "loss": 5.1891, "step": 20 }, { "epoch": 0.00907918493680681, "grad_norm": 1.7847157716751099, "learning_rate": 1.721311475409836e-05, "loss": 5.0308, "step": 22 }, { "epoch": 0.009904565385607429, "grad_norm": 1.329651117324829, "learning_rate": 1.8852459016393442e-05, "loss": 4.8984, "step": 24 }, { "epoch": 0.010729945834408047, "grad_norm": 1.3483397960662842, "learning_rate": 2.0491803278688525e-05, "loss": 4.7523, "step": 26 }, { "epoch": 0.011555326283208666, "grad_norm": 1.213932991027832, "learning_rate": 2.2131147540983607e-05, "loss": 4.6048, "step": 28 }, { "epoch": 0.012380706732009285, "grad_norm": 1.2783870697021484, "learning_rate": 2.377049180327869e-05, "loss": 4.5203, "step": 30 }, { "epoch": 0.013206087180809905, "grad_norm": 1.4986013174057007, "learning_rate": 2.540983606557377e-05, "loss": 4.3679, "step": 32 }, { "epoch": 0.014031467629610524, "grad_norm": 0.9324406981468201, "learning_rate": 2.7049180327868856e-05, "loss": 4.2537, "step": 34 }, { "epoch": 0.014856848078411143, "grad_norm": 2.258493185043335, "learning_rate": 2.8688524590163935e-05, "loss": 4.1994, "step": 36 }, { "epoch": 0.015682228527211763, "grad_norm": 2.1433424949645996, "learning_rate": 3.0327868852459017e-05, "loss": 4.1349, "step": 38 }, { "epoch": 0.016507608976012382, "grad_norm": 2.0129520893096924, "learning_rate": 3.19672131147541e-05, "loss": 4.0318, "step": 40 }, { "epoch": 0.017332989424813, "grad_norm": 1.2179840803146362, "learning_rate": 3.360655737704918e-05, "loss": 3.9541, "step": 42 }, { "epoch": 0.01815836987361362, "grad_norm": 1.133366346359253, "learning_rate": 3.524590163934427e-05, "loss": 3.8931, "step": 44 }, { "epoch": 0.01898375032241424, "grad_norm": 1.4476598501205444, "learning_rate": 3.6885245901639346e-05, "loss": 3.8386, "step": 46 }, { "epoch": 0.019809130771214857, "grad_norm": 1.8356342315673828, "learning_rate": 3.8524590163934424e-05, "loss": 3.7593, "step": 48 }, { "epoch": 0.020634511220015476, "grad_norm": 1.1986051797866821, "learning_rate": 4.016393442622951e-05, "loss": 3.7367, "step": 50 }, { "epoch": 0.021459891668816095, "grad_norm": 1.4108922481536865, "learning_rate": 4.1803278688524595e-05, "loss": 3.6536, "step": 52 }, { "epoch": 0.022285272117616713, "grad_norm": 1.194887638092041, "learning_rate": 4.3442622950819674e-05, "loss": 3.6377, "step": 54 }, { "epoch": 0.023110652566417332, "grad_norm": 1.5970392227172852, "learning_rate": 4.508196721311476e-05, "loss": 3.592, "step": 56 }, { "epoch": 0.02393603301521795, "grad_norm": 1.7871198654174805, "learning_rate": 4.672131147540984e-05, "loss": 3.5467, "step": 58 }, { "epoch": 0.02476141346401857, "grad_norm": 2.0405406951904297, "learning_rate": 4.836065573770492e-05, "loss": 3.5071, "step": 60 }, { "epoch": 0.025586793912819192, "grad_norm": 1.6245758533477783, "learning_rate": 5e-05, "loss": 3.4754, "step": 62 }, { "epoch": 0.02641217436161981, "grad_norm": 1.3766052722930908, "learning_rate": 5.163934426229509e-05, "loss": 3.4631, "step": 64 }, { "epoch": 0.02723755481042043, "grad_norm": 1.3058711290359497, "learning_rate": 5.327868852459017e-05, "loss": 3.4197, "step": 66 }, { "epoch": 0.028062935259221048, "grad_norm": 1.545015573501587, "learning_rate": 5.491803278688525e-05, "loss": 3.4313, "step": 68 }, { "epoch": 0.028888315708021667, "grad_norm": 1.439721703529358, "learning_rate": 5.6557377049180324e-05, "loss": 3.3894, "step": 70 }, { "epoch": 0.029713696156822286, "grad_norm": 1.6934937238693237, "learning_rate": 5.819672131147541e-05, "loss": 3.3193, "step": 72 }, { "epoch": 0.030539076605622904, "grad_norm": 1.3454101085662842, "learning_rate": 5.9836065573770495e-05, "loss": 3.3252, "step": 74 }, { "epoch": 0.03136445705442353, "grad_norm": 1.610787272453308, "learning_rate": 6.147540983606557e-05, "loss": 3.2966, "step": 76 }, { "epoch": 0.03218983750322414, "grad_norm": 2.0271148681640625, "learning_rate": 6.311475409836067e-05, "loss": 3.2892, "step": 78 }, { "epoch": 0.033015217952024764, "grad_norm": 2.165980100631714, "learning_rate": 6.475409836065574e-05, "loss": 3.2792, "step": 80 }, { "epoch": 0.03384059840082538, "grad_norm": 1.7957913875579834, "learning_rate": 6.639344262295082e-05, "loss": 3.2543, "step": 82 }, { "epoch": 0.034665978849626, "grad_norm": 1.3472362756729126, "learning_rate": 6.80327868852459e-05, "loss": 3.251, "step": 84 }, { "epoch": 0.03549135929842662, "grad_norm": 1.3264447450637817, "learning_rate": 6.967213114754098e-05, "loss": 3.2185, "step": 86 }, { "epoch": 0.03631673974722724, "grad_norm": 1.5266629457473755, "learning_rate": 7.131147540983607e-05, "loss": 3.2058, "step": 88 }, { "epoch": 0.037142120196027854, "grad_norm": 1.36456298828125, "learning_rate": 7.295081967213115e-05, "loss": 3.1753, "step": 90 }, { "epoch": 0.03796750064482848, "grad_norm": 1.3469734191894531, "learning_rate": 7.459016393442624e-05, "loss": 3.1905, "step": 92 }, { "epoch": 0.03879288109362909, "grad_norm": 1.2221981287002563, "learning_rate": 7.622950819672131e-05, "loss": 3.1526, "step": 94 }, { "epoch": 0.039618261542429714, "grad_norm": 1.3852319717407227, "learning_rate": 7.78688524590164e-05, "loss": 3.156, "step": 96 }, { "epoch": 0.04044364199123033, "grad_norm": 1.3925163745880127, "learning_rate": 7.950819672131148e-05, "loss": 3.149, "step": 98 }, { "epoch": 0.04126902244003095, "grad_norm": 1.4574236869812012, "learning_rate": 8.114754098360656e-05, "loss": 3.1252, "step": 100 }, { "epoch": 0.042094402888831574, "grad_norm": 1.1478595733642578, "learning_rate": 8.278688524590165e-05, "loss": 3.1093, "step": 102 }, { "epoch": 0.04291978333763219, "grad_norm": 1.4830694198608398, "learning_rate": 8.442622950819673e-05, "loss": 3.0921, "step": 104 }, { "epoch": 0.04374516378643281, "grad_norm": 1.4275153875350952, "learning_rate": 8.606557377049181e-05, "loss": 3.0861, "step": 106 }, { "epoch": 0.04457054423523343, "grad_norm": 2.0077579021453857, "learning_rate": 8.770491803278689e-05, "loss": 3.0694, "step": 108 }, { "epoch": 0.04539592468403405, "grad_norm": 1.533471703529358, "learning_rate": 8.934426229508197e-05, "loss": 3.0526, "step": 110 }, { "epoch": 0.046221305132834664, "grad_norm": 1.394168496131897, "learning_rate": 9.098360655737706e-05, "loss": 3.0655, "step": 112 }, { "epoch": 0.047046685581635286, "grad_norm": 1.6954408884048462, "learning_rate": 9.262295081967214e-05, "loss": 3.0338, "step": 114 }, { "epoch": 0.0478720660304359, "grad_norm": 1.4712835550308228, "learning_rate": 9.426229508196722e-05, "loss": 3.0511, "step": 116 }, { "epoch": 0.048697446479236524, "grad_norm": 1.473305344581604, "learning_rate": 9.59016393442623e-05, "loss": 3.031, "step": 118 }, { "epoch": 0.04952282692803714, "grad_norm": 1.5957138538360596, "learning_rate": 9.754098360655737e-05, "loss": 3.003, "step": 120 }, { "epoch": 0.05034820737683776, "grad_norm": 1.7283776998519897, "learning_rate": 9.918032786885247e-05, "loss": 3.0025, "step": 122 }, { "epoch": 0.051173587825638384, "grad_norm": 1.286211609840393, "learning_rate": 9.999995343827644e-05, "loss": 3.0046, "step": 124 }, { "epoch": 0.051998968274439, "grad_norm": 1.612631916999817, "learning_rate": 9.99995809450083e-05, "loss": 2.9935, "step": 126 }, { "epoch": 0.05282434872323962, "grad_norm": 1.7912741899490356, "learning_rate": 9.9998835961247e-05, "loss": 3.0016, "step": 128 }, { "epoch": 0.053649729172040236, "grad_norm": 1.7926831245422363, "learning_rate": 9.999771849254263e-05, "loss": 2.9718, "step": 130 }, { "epoch": 0.05447510962084086, "grad_norm": 1.4038861989974976, "learning_rate": 9.999622854722017e-05, "loss": 2.9792, "step": 132 }, { "epoch": 0.055300490069641474, "grad_norm": 1.7067828178405762, "learning_rate": 9.99943661363795e-05, "loss": 2.968, "step": 134 }, { "epoch": 0.056125870518442096, "grad_norm": 1.5349268913269043, "learning_rate": 9.999213127389536e-05, "loss": 2.9373, "step": 136 }, { "epoch": 0.05695125096724271, "grad_norm": 1.2763527631759644, "learning_rate": 9.99895239764172e-05, "loss": 2.9384, "step": 138 }, { "epoch": 0.057776631416043334, "grad_norm": 1.3789610862731934, "learning_rate": 9.998654426336905e-05, "loss": 2.9416, "step": 140 }, { "epoch": 0.05860201186484395, "grad_norm": 1.6175661087036133, "learning_rate": 9.998319215694936e-05, "loss": 2.9323, "step": 142 }, { "epoch": 0.05942739231364457, "grad_norm": 1.5398344993591309, "learning_rate": 9.997946768213095e-05, "loss": 2.9334, "step": 144 }, { "epoch": 0.060252772762445186, "grad_norm": 1.6788642406463623, "learning_rate": 9.997537086666063e-05, "loss": 2.9218, "step": 146 }, { "epoch": 0.06107815321124581, "grad_norm": 1.4843031167984009, "learning_rate": 9.997090174105919e-05, "loss": 2.9366, "step": 148 }, { "epoch": 0.06190353366004643, "grad_norm": 1.3358060121536255, "learning_rate": 9.996606033862102e-05, "loss": 2.9279, "step": 150 }, { "epoch": 0.06272891410884705, "grad_norm": 1.9375636577606201, "learning_rate": 9.996084669541397e-05, "loss": 2.9202, "step": 152 }, { "epoch": 0.06355429455764766, "grad_norm": 1.3849859237670898, "learning_rate": 9.9955260850279e-05, "loss": 2.895, "step": 154 }, { "epoch": 0.06437967500644828, "grad_norm": 1.3628286123275757, "learning_rate": 9.994930284482993e-05, "loss": 2.8983, "step": 156 }, { "epoch": 0.0652050554552489, "grad_norm": 1.2561815977096558, "learning_rate": 9.994297272345319e-05, "loss": 2.9089, "step": 158 }, { "epoch": 0.06603043590404953, "grad_norm": 1.3705800771713257, "learning_rate": 9.993627053330732e-05, "loss": 2.878, "step": 160 }, { "epoch": 0.06685581635285014, "grad_norm": 1.2955900430679321, "learning_rate": 9.99291963243228e-05, "loss": 2.8591, "step": 162 }, { "epoch": 0.06768119680165076, "grad_norm": 1.231101155281067, "learning_rate": 9.992175014920161e-05, "loss": 2.8616, "step": 164 }, { "epoch": 0.06850657725045138, "grad_norm": 1.1412620544433594, "learning_rate": 9.991393206341677e-05, "loss": 2.8353, "step": 166 }, { "epoch": 0.069331957699252, "grad_norm": 1.102623701095581, "learning_rate": 9.990574212521205e-05, "loss": 2.8262, "step": 168 }, { "epoch": 0.07015733814805261, "grad_norm": 1.0235016345977783, "learning_rate": 9.98971803956014e-05, "loss": 2.8581, "step": 170 }, { "epoch": 0.07098271859685323, "grad_norm": 1.475123643875122, "learning_rate": 9.988824693836864e-05, "loss": 2.8709, "step": 172 }, { "epoch": 0.07180809904565386, "grad_norm": 0.8155277371406555, "learning_rate": 9.98789418200669e-05, "loss": 2.8426, "step": 174 }, { "epoch": 0.07263347949445448, "grad_norm": 1.3113749027252197, "learning_rate": 9.98692651100181e-05, "loss": 2.8017, "step": 176 }, { "epoch": 0.0734588599432551, "grad_norm": 1.2450861930847168, "learning_rate": 9.985921688031252e-05, "loss": 2.8317, "step": 178 }, { "epoch": 0.07428424039205571, "grad_norm": 1.304402470588684, "learning_rate": 9.984879720580816e-05, "loss": 2.8157, "step": 180 }, { "epoch": 0.07510962084085633, "grad_norm": 1.1851410865783691, "learning_rate": 9.983800616413026e-05, "loss": 2.8245, "step": 182 }, { "epoch": 0.07593500128965695, "grad_norm": 1.2967396974563599, "learning_rate": 9.982684383567071e-05, "loss": 2.8363, "step": 184 }, { "epoch": 0.07676038173845758, "grad_norm": 1.2011407613754272, "learning_rate": 9.981531030358746e-05, "loss": 2.8142, "step": 186 }, { "epoch": 0.07758576218725818, "grad_norm": 1.0165106058120728, "learning_rate": 9.980340565380382e-05, "loss": 2.7913, "step": 188 }, { "epoch": 0.0784111426360588, "grad_norm": 1.3044579029083252, "learning_rate": 9.979112997500792e-05, "loss": 2.7805, "step": 190 }, { "epoch": 0.07923652308485943, "grad_norm": 1.1849685907363892, "learning_rate": 9.9778483358652e-05, "loss": 2.7707, "step": 192 }, { "epoch": 0.08006190353366005, "grad_norm": 0.9122027158737183, "learning_rate": 9.976546589895175e-05, "loss": 2.7777, "step": 194 }, { "epoch": 0.08088728398246066, "grad_norm": 1.0830117464065552, "learning_rate": 9.975207769288556e-05, "loss": 2.8048, "step": 196 }, { "epoch": 0.08171266443126128, "grad_norm": 1.1544275283813477, "learning_rate": 9.973831884019387e-05, "loss": 2.7761, "step": 198 }, { "epoch": 0.0825380448800619, "grad_norm": 0.8355935215950012, "learning_rate": 9.972418944337835e-05, "loss": 2.7593, "step": 200 }, { "epoch": 0.08336342532886253, "grad_norm": 1.203262209892273, "learning_rate": 9.970968960770124e-05, "loss": 2.7695, "step": 202 }, { "epoch": 0.08418880577766315, "grad_norm": 1.23800790309906, "learning_rate": 9.969481944118443e-05, "loss": 2.7576, "step": 204 }, { "epoch": 0.08501418622646376, "grad_norm": 0.8839966058731079, "learning_rate": 9.96795790546088e-05, "loss": 2.7442, "step": 206 }, { "epoch": 0.08583956667526438, "grad_norm": 0.9399611949920654, "learning_rate": 9.966396856151326e-05, "loss": 2.7402, "step": 208 }, { "epoch": 0.086664947124065, "grad_norm": 1.1721992492675781, "learning_rate": 9.964798807819397e-05, "loss": 2.7378, "step": 210 }, { "epoch": 0.08749032757286562, "grad_norm": 0.9647835493087769, "learning_rate": 9.963163772370352e-05, "loss": 2.7256, "step": 212 }, { "epoch": 0.08831570802166623, "grad_norm": 0.9155466556549072, "learning_rate": 9.961491761984996e-05, "loss": 2.7255, "step": 214 }, { "epoch": 0.08914108847046685, "grad_norm": 0.9373721480369568, "learning_rate": 9.959782789119592e-05, "loss": 2.7544, "step": 216 }, { "epoch": 0.08996646891926748, "grad_norm": 0.9547314643859863, "learning_rate": 9.958036866505772e-05, "loss": 2.7333, "step": 218 }, { "epoch": 0.0907918493680681, "grad_norm": 1.0028138160705566, "learning_rate": 9.956254007150432e-05, "loss": 2.7232, "step": 220 }, { "epoch": 0.0916172298168687, "grad_norm": 1.2652791738510132, "learning_rate": 9.954434224335649e-05, "loss": 2.7268, "step": 222 }, { "epoch": 0.09244261026566933, "grad_norm": 1.1313235759735107, "learning_rate": 9.952577531618574e-05, "loss": 2.7417, "step": 224 }, { "epoch": 0.09326799071446995, "grad_norm": 0.7514833211898804, "learning_rate": 9.950683942831328e-05, "loss": 2.6898, "step": 226 }, { "epoch": 0.09409337116327057, "grad_norm": 0.9731917381286621, "learning_rate": 9.948753472080907e-05, "loss": 2.686, "step": 228 }, { "epoch": 0.0949187516120712, "grad_norm": 0.8640966415405273, "learning_rate": 9.946786133749071e-05, "loss": 2.7168, "step": 230 }, { "epoch": 0.0957441320608718, "grad_norm": 0.9116567969322205, "learning_rate": 9.944781942492242e-05, "loss": 2.7123, "step": 232 }, { "epoch": 0.09656951250967243, "grad_norm": 1.0034291744232178, "learning_rate": 9.942740913241386e-05, "loss": 2.7146, "step": 234 }, { "epoch": 0.09739489295847305, "grad_norm": 0.8208848237991333, "learning_rate": 9.94066306120191e-05, "loss": 2.6773, "step": 236 }, { "epoch": 0.09822027340727367, "grad_norm": 0.8781367540359497, "learning_rate": 9.938548401853547e-05, "loss": 2.719, "step": 238 }, { "epoch": 0.09904565385607428, "grad_norm": 0.7302896976470947, "learning_rate": 9.93639695095024e-05, "loss": 2.7011, "step": 240 }, { "epoch": 0.0998710343048749, "grad_norm": 0.705086350440979, "learning_rate": 9.934208724520024e-05, "loss": 2.6648, "step": 242 }, { "epoch": 0.10069641475367552, "grad_norm": 0.8350553512573242, "learning_rate": 9.931983738864904e-05, "loss": 2.687, "step": 244 }, { "epoch": 0.10152179520247614, "grad_norm": 0.6524394154548645, "learning_rate": 9.92972201056074e-05, "loss": 2.7015, "step": 246 }, { "epoch": 0.10234717565127677, "grad_norm": 0.6503209471702576, "learning_rate": 9.927423556457121e-05, "loss": 2.6148, "step": 248 }, { "epoch": 0.10317255610007738, "grad_norm": 0.7506954073905945, "learning_rate": 9.925088393677236e-05, "loss": 2.6914, "step": 250 }, { "epoch": 0.103997936548878, "grad_norm": 1.1561987400054932, "learning_rate": 9.922716539617746e-05, "loss": 2.6659, "step": 252 }, { "epoch": 0.10482331699767862, "grad_norm": 1.0000964403152466, "learning_rate": 9.920308011948665e-05, "loss": 2.6626, "step": 254 }, { "epoch": 0.10564869744647924, "grad_norm": 0.8899397850036621, "learning_rate": 9.917862828613214e-05, "loss": 2.6666, "step": 256 }, { "epoch": 0.10647407789527985, "grad_norm": 1.1503660678863525, "learning_rate": 9.915381007827698e-05, "loss": 2.6395, "step": 258 }, { "epoch": 0.10729945834408047, "grad_norm": 0.8070819973945618, "learning_rate": 9.912862568081364e-05, "loss": 2.6531, "step": 260 }, { "epoch": 0.1081248387928811, "grad_norm": 0.8623407483100891, "learning_rate": 9.910307528136266e-05, "loss": 2.6588, "step": 262 }, { "epoch": 0.10895021924168172, "grad_norm": 0.9573660492897034, "learning_rate": 9.907715907027129e-05, "loss": 2.6823, "step": 264 }, { "epoch": 0.10977559969048233, "grad_norm": 1.0500940084457397, "learning_rate": 9.905087724061195e-05, "loss": 2.6545, "step": 266 }, { "epoch": 0.11060098013928295, "grad_norm": 1.0520515441894531, "learning_rate": 9.902422998818094e-05, "loss": 2.6371, "step": 268 }, { "epoch": 0.11142636058808357, "grad_norm": 0.9879215955734253, "learning_rate": 9.899721751149688e-05, "loss": 2.6474, "step": 270 }, { "epoch": 0.11225174103688419, "grad_norm": 0.8972532749176025, "learning_rate": 9.896984001179925e-05, "loss": 2.6271, "step": 272 }, { "epoch": 0.11307712148568481, "grad_norm": 0.6369883418083191, "learning_rate": 9.894209769304696e-05, "loss": 2.6054, "step": 274 }, { "epoch": 0.11390250193448542, "grad_norm": 0.6478956937789917, "learning_rate": 9.891399076191674e-05, "loss": 2.6168, "step": 276 }, { "epoch": 0.11472788238328605, "grad_norm": 0.8620642423629761, "learning_rate": 9.888551942780162e-05, "loss": 2.6313, "step": 278 }, { "epoch": 0.11555326283208667, "grad_norm": 0.740717887878418, "learning_rate": 9.885668390280941e-05, "loss": 2.6307, "step": 280 }, { "epoch": 0.11637864328088729, "grad_norm": 0.7513862252235413, "learning_rate": 9.882748440176109e-05, "loss": 2.625, "step": 282 }, { "epoch": 0.1172040237296879, "grad_norm": 0.8409993052482605, "learning_rate": 9.879792114218921e-05, "loss": 2.6034, "step": 284 }, { "epoch": 0.11802940417848852, "grad_norm": 0.8200739622116089, "learning_rate": 9.876799434433628e-05, "loss": 2.599, "step": 286 }, { "epoch": 0.11885478462728914, "grad_norm": 0.9191763401031494, "learning_rate": 9.873770423115314e-05, "loss": 2.6168, "step": 288 }, { "epoch": 0.11968016507608976, "grad_norm": 0.7739763855934143, "learning_rate": 9.870705102829723e-05, "loss": 2.6279, "step": 290 }, { "epoch": 0.12050554552489037, "grad_norm": 0.6580247282981873, "learning_rate": 9.867603496413103e-05, "loss": 2.599, "step": 292 }, { "epoch": 0.121330925973691, "grad_norm": 0.7197789549827576, "learning_rate": 9.864465626972023e-05, "loss": 2.5948, "step": 294 }, { "epoch": 0.12215630642249162, "grad_norm": 0.9027787446975708, "learning_rate": 9.861291517883213e-05, "loss": 2.6058, "step": 296 }, { "epoch": 0.12298168687129224, "grad_norm": 1.048640489578247, "learning_rate": 9.858081192793378e-05, "loss": 2.6128, "step": 298 }, { "epoch": 0.12380706732009286, "grad_norm": 0.827551543712616, "learning_rate": 9.85483467561903e-05, "loss": 2.6058, "step": 300 } ], "logging_steps": 2, "max_steps": 2424, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.377550336196608e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }