{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3435688211294825, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005726147018824708, "grad_norm": 7.761023998260498, "learning_rate": 0.0, "loss": 6.0592, "step": 1 }, { "epoch": 0.0011452294037649416, "grad_norm": 7.8541951179504395, "learning_rate": 5.714285714285715e-07, "loss": 6.0156, "step": 2 }, { "epoch": 0.0022904588075298832, "grad_norm": 7.347611904144287, "learning_rate": 1.7142857142857145e-06, "loss": 6.0103, "step": 4 }, { "epoch": 0.003435688211294825, "grad_norm": 5.382428169250488, "learning_rate": 2.8571428571428573e-06, "loss": 5.9221, "step": 6 }, { "epoch": 0.0045809176150597665, "grad_norm": 5.063406467437744, "learning_rate": 4.000000000000001e-06, "loss": 6.0365, "step": 8 }, { "epoch": 0.005726147018824708, "grad_norm": 9.779157638549805, "learning_rate": 5.142857142857143e-06, "loss": 6.0336, "step": 10 }, { "epoch": 0.00687137642258965, "grad_norm": 7.555446147918701, "learning_rate": 6.285714285714287e-06, "loss": 6.0328, "step": 12 }, { "epoch": 0.008016605826354592, "grad_norm": 6.790043354034424, "learning_rate": 7.428571428571429e-06, "loss": 5.7848, "step": 14 }, { "epoch": 0.009161835230119533, "grad_norm": 4.4132208824157715, "learning_rate": 8.571428571428573e-06, "loss": 5.8207, "step": 16 }, { "epoch": 0.010307064633884476, "grad_norm": 4.064995765686035, "learning_rate": 9.714285714285715e-06, "loss": 5.6497, "step": 18 }, { "epoch": 0.011452294037649417, "grad_norm": 3.357184410095215, "learning_rate": 1.0857142857142858e-05, "loss": 5.7758, "step": 20 }, { "epoch": 0.012597523441414358, "grad_norm": 2.742230176925659, "learning_rate": 1.2e-05, "loss": 5.6173, "step": 22 }, { "epoch": 0.0137427528451793, "grad_norm": 2.491459369659424, "learning_rate": 1.3142857142857143e-05, "loss": 5.6681, "step": 24 }, { "epoch": 0.014887982248944241, "grad_norm": 2.7569029331207275, "learning_rate": 1.4285714285714285e-05, "loss": 5.6393, "step": 26 }, { "epoch": 0.016033211652709184, "grad_norm": 2.208378791809082, "learning_rate": 1.5428571428571428e-05, "loss": 5.5768, "step": 28 }, { "epoch": 0.017178441056474127, "grad_norm": 3.2770133018493652, "learning_rate": 1.657142857142857e-05, "loss": 5.484, "step": 30 }, { "epoch": 0.018323670460239066, "grad_norm": 3.177299976348877, "learning_rate": 1.7714285714285713e-05, "loss": 5.528, "step": 32 }, { "epoch": 0.01946889986400401, "grad_norm": 2.1981537342071533, "learning_rate": 1.885714285714286e-05, "loss": 5.6327, "step": 34 }, { "epoch": 0.02061412926776895, "grad_norm": 3.265881061553955, "learning_rate": 2e-05, "loss": 5.6288, "step": 36 }, { "epoch": 0.02175935867153389, "grad_norm": 3.6059298515319824, "learning_rate": 2.1142857142857144e-05, "loss": 5.4789, "step": 38 }, { "epoch": 0.022904588075298833, "grad_norm": 2.4080026149749756, "learning_rate": 2.2285714285714287e-05, "loss": 5.4046, "step": 40 }, { "epoch": 0.024049817479063776, "grad_norm": 2.142902135848999, "learning_rate": 2.342857142857143e-05, "loss": 5.4738, "step": 42 }, { "epoch": 0.025195046882828715, "grad_norm": 2.4021224975585938, "learning_rate": 2.4571428571428572e-05, "loss": 5.4649, "step": 44 }, { "epoch": 0.026340276286593658, "grad_norm": 2.172009229660034, "learning_rate": 2.5714285714285714e-05, "loss": 5.4302, "step": 46 }, { "epoch": 0.0274855056903586, "grad_norm": 2.9737730026245117, "learning_rate": 2.6857142857142857e-05, "loss": 5.3045, "step": 48 }, { "epoch": 0.028630735094123543, "grad_norm": 3.0378615856170654, "learning_rate": 2.8000000000000003e-05, "loss": 5.2185, "step": 50 }, { "epoch": 0.029775964497888482, "grad_norm": 3.4448676109313965, "learning_rate": 2.9142857142857146e-05, "loss": 5.1838, "step": 52 }, { "epoch": 0.030921193901653425, "grad_norm": 2.469245672225952, "learning_rate": 3.0285714285714288e-05, "loss": 5.1637, "step": 54 }, { "epoch": 0.03206642330541837, "grad_norm": 3.58486008644104, "learning_rate": 3.142857142857143e-05, "loss": 5.2063, "step": 56 }, { "epoch": 0.03321165270918331, "grad_norm": 3.0815446376800537, "learning_rate": 3.257142857142857e-05, "loss": 5.2317, "step": 58 }, { "epoch": 0.03435688211294825, "grad_norm": 3.6842119693756104, "learning_rate": 3.3714285714285716e-05, "loss": 5.2695, "step": 60 }, { "epoch": 0.03550211151671319, "grad_norm": 2.9440791606903076, "learning_rate": 3.485714285714286e-05, "loss": 5.2686, "step": 62 }, { "epoch": 0.03664734092047813, "grad_norm": 3.9632568359375, "learning_rate": 3.6e-05, "loss": 5.1262, "step": 64 }, { "epoch": 0.037792570324243074, "grad_norm": 4.045065402984619, "learning_rate": 3.7142857142857143e-05, "loss": 5.1546, "step": 66 }, { "epoch": 0.03893779972800802, "grad_norm": 3.5707085132598877, "learning_rate": 3.8285714285714286e-05, "loss": 5.0036, "step": 68 }, { "epoch": 0.04008302913177296, "grad_norm": 3.014404535293579, "learning_rate": 3.942857142857143e-05, "loss": 5.026, "step": 70 }, { "epoch": 0.0412282585355379, "grad_norm": 2.708796977996826, "learning_rate": 4.057142857142857e-05, "loss": 4.9442, "step": 72 }, { "epoch": 0.04237348793930284, "grad_norm": 2.5384011268615723, "learning_rate": 4.1714285714285714e-05, "loss": 5.0223, "step": 74 }, { "epoch": 0.04351871734306778, "grad_norm": 3.006281852722168, "learning_rate": 4.2857142857142856e-05, "loss": 4.9827, "step": 76 }, { "epoch": 0.044663946746832724, "grad_norm": 2.5772130489349365, "learning_rate": 4.4000000000000006e-05, "loss": 4.9675, "step": 78 }, { "epoch": 0.045809176150597666, "grad_norm": 3.456017255783081, "learning_rate": 4.514285714285714e-05, "loss": 5.0341, "step": 80 }, { "epoch": 0.04695440555436261, "grad_norm": 3.3163113594055176, "learning_rate": 4.628571428571429e-05, "loss": 4.9867, "step": 82 }, { "epoch": 0.04809963495812755, "grad_norm": 3.7568469047546387, "learning_rate": 4.742857142857143e-05, "loss": 4.8652, "step": 84 }, { "epoch": 0.049244864361892494, "grad_norm": 4.19318151473999, "learning_rate": 4.8571428571428576e-05, "loss": 5.0602, "step": 86 }, { "epoch": 0.05039009376565743, "grad_norm": 5.1034064292907715, "learning_rate": 4.971428571428572e-05, "loss": 4.9757, "step": 88 }, { "epoch": 0.05153532316942237, "grad_norm": 4.0827484130859375, "learning_rate": 5.085714285714286e-05, "loss": 4.8486, "step": 90 }, { "epoch": 0.052680552573187316, "grad_norm": 4.6189446449279785, "learning_rate": 5.2000000000000004e-05, "loss": 4.9595, "step": 92 }, { "epoch": 0.05382578197695226, "grad_norm": 3.988513469696045, "learning_rate": 5.314285714285715e-05, "loss": 4.9035, "step": 94 }, { "epoch": 0.0549710113807172, "grad_norm": 3.857276678085327, "learning_rate": 5.428571428571428e-05, "loss": 4.8277, "step": 96 }, { "epoch": 0.056116240784482144, "grad_norm": 3.5372354984283447, "learning_rate": 5.542857142857143e-05, "loss": 4.7718, "step": 98 }, { "epoch": 0.057261470188247086, "grad_norm": 3.3853676319122314, "learning_rate": 5.6571428571428574e-05, "loss": 4.8098, "step": 100 }, { "epoch": 0.05840669959201202, "grad_norm": 2.1142077445983887, "learning_rate": 5.771428571428572e-05, "loss": 4.7975, "step": 102 }, { "epoch": 0.059551928995776965, "grad_norm": 3.2275538444519043, "learning_rate": 5.885714285714285e-05, "loss": 4.8509, "step": 104 }, { "epoch": 0.06069715839954191, "grad_norm": 3.5413126945495605, "learning_rate": 6e-05, "loss": 4.6069, "step": 106 }, { "epoch": 0.06184238780330685, "grad_norm": 2.755648374557495, "learning_rate": 6.114285714285714e-05, "loss": 4.6951, "step": 108 }, { "epoch": 0.06298761720707179, "grad_norm": 2.980039596557617, "learning_rate": 6.22857142857143e-05, "loss": 4.7012, "step": 110 }, { "epoch": 0.06413284661083674, "grad_norm": 4.890020370483398, "learning_rate": 6.342857142857143e-05, "loss": 4.8008, "step": 112 }, { "epoch": 0.06527807601460167, "grad_norm": 4.35846471786499, "learning_rate": 6.457142857142856e-05, "loss": 4.8587, "step": 114 }, { "epoch": 0.06642330541836662, "grad_norm": 3.6171813011169434, "learning_rate": 6.571428571428571e-05, "loss": 4.7473, "step": 116 }, { "epoch": 0.06756853482213156, "grad_norm": 2.4927010536193848, "learning_rate": 6.685714285714286e-05, "loss": 4.7113, "step": 118 }, { "epoch": 0.0687137642258965, "grad_norm": 3.3327009677886963, "learning_rate": 6.800000000000001e-05, "loss": 4.6105, "step": 120 }, { "epoch": 0.06985899362966144, "grad_norm": 3.1123206615448, "learning_rate": 6.914285714285715e-05, "loss": 4.5968, "step": 122 }, { "epoch": 0.07100422303342638, "grad_norm": 2.6985421180725098, "learning_rate": 7.028571428571428e-05, "loss": 4.6323, "step": 124 }, { "epoch": 0.07214945243719133, "grad_norm": 2.058084011077881, "learning_rate": 7.142857142857143e-05, "loss": 4.5721, "step": 126 }, { "epoch": 0.07329468184095626, "grad_norm": 2.144658327102661, "learning_rate": 7.257142857142858e-05, "loss": 4.6125, "step": 128 }, { "epoch": 0.07443991124472121, "grad_norm": 2.477219820022583, "learning_rate": 7.371428571428572e-05, "loss": 4.4727, "step": 130 }, { "epoch": 0.07558514064848615, "grad_norm": 3.8517298698425293, "learning_rate": 7.485714285714285e-05, "loss": 4.5696, "step": 132 }, { "epoch": 0.0767303700522511, "grad_norm": 3.0253565311431885, "learning_rate": 7.6e-05, "loss": 4.4838, "step": 134 }, { "epoch": 0.07787559945601603, "grad_norm": 3.397569179534912, "learning_rate": 7.714285714285715e-05, "loss": 4.6431, "step": 136 }, { "epoch": 0.07902082885978097, "grad_norm": 2.435197114944458, "learning_rate": 7.828571428571429e-05, "loss": 4.4681, "step": 138 }, { "epoch": 0.08016605826354592, "grad_norm": 2.6476476192474365, "learning_rate": 7.942857142857143e-05, "loss": 4.4462, "step": 140 }, { "epoch": 0.08131128766731086, "grad_norm": 2.1929690837860107, "learning_rate": 8.057142857142857e-05, "loss": 4.5136, "step": 142 }, { "epoch": 0.0824565170710758, "grad_norm": 2.4533395767211914, "learning_rate": 8.171428571428572e-05, "loss": 4.5572, "step": 144 }, { "epoch": 0.08360174647484074, "grad_norm": 2.601806879043579, "learning_rate": 8.285714285714287e-05, "loss": 4.4121, "step": 146 }, { "epoch": 0.08474697587860568, "grad_norm": 3.233973741531372, "learning_rate": 8.4e-05, "loss": 4.4599, "step": 148 }, { "epoch": 0.08589220528237063, "grad_norm": 2.6353538036346436, "learning_rate": 8.514285714285714e-05, "loss": 4.4533, "step": 150 }, { "epoch": 0.08703743468613556, "grad_norm": 2.8465511798858643, "learning_rate": 8.62857142857143e-05, "loss": 4.5246, "step": 152 }, { "epoch": 0.08818266408990051, "grad_norm": 2.8642711639404297, "learning_rate": 8.742857142857144e-05, "loss": 4.4659, "step": 154 }, { "epoch": 0.08932789349366545, "grad_norm": 2.793112277984619, "learning_rate": 8.857142857142857e-05, "loss": 4.5107, "step": 156 }, { "epoch": 0.0904731228974304, "grad_norm": 3.43472957611084, "learning_rate": 8.971428571428571e-05, "loss": 4.4079, "step": 158 }, { "epoch": 0.09161835230119533, "grad_norm": 2.9260294437408447, "learning_rate": 9.085714285714286e-05, "loss": 4.4047, "step": 160 }, { "epoch": 0.09276358170496027, "grad_norm": 2.6336724758148193, "learning_rate": 9.200000000000001e-05, "loss": 4.4777, "step": 162 }, { "epoch": 0.09390881110872522, "grad_norm": 2.8348231315612793, "learning_rate": 9.314285714285715e-05, "loss": 4.3445, "step": 164 }, { "epoch": 0.09505404051249015, "grad_norm": 4.271595478057861, "learning_rate": 9.428571428571429e-05, "loss": 4.4234, "step": 166 }, { "epoch": 0.0961992699162551, "grad_norm": 3.4789109230041504, "learning_rate": 9.542857142857143e-05, "loss": 4.2872, "step": 168 }, { "epoch": 0.09734449932002004, "grad_norm": 2.57273530960083, "learning_rate": 9.657142857142858e-05, "loss": 4.4177, "step": 170 }, { "epoch": 0.09848972872378499, "grad_norm": 2.185086250305176, "learning_rate": 9.771428571428572e-05, "loss": 4.3568, "step": 172 }, { "epoch": 0.09963495812754992, "grad_norm": 2.771744966506958, "learning_rate": 9.885714285714286e-05, "loss": 4.3392, "step": 174 }, { "epoch": 0.10078018753131486, "grad_norm": 1.950353741645813, "learning_rate": 0.0001, "loss": 4.1931, "step": 176 }, { "epoch": 0.10192541693507981, "grad_norm": 2.4709694385528564, "learning_rate": 9.999991040472416e-05, "loss": 4.2936, "step": 178 }, { "epoch": 0.10307064633884475, "grad_norm": 2.140997886657715, "learning_rate": 9.999964161921776e-05, "loss": 4.1653, "step": 180 }, { "epoch": 0.1042158757426097, "grad_norm": 2.491321563720703, "learning_rate": 9.999919364444403e-05, "loss": 4.3202, "step": 182 }, { "epoch": 0.10536110514637463, "grad_norm": 2.5410189628601074, "learning_rate": 9.999856648200845e-05, "loss": 4.2657, "step": 184 }, { "epoch": 0.10650633455013958, "grad_norm": 2.1820590496063232, "learning_rate": 9.999776013415866e-05, "loss": 4.2282, "step": 186 }, { "epoch": 0.10765156395390452, "grad_norm": 1.7251808643341064, "learning_rate": 9.999677460378444e-05, "loss": 4.3421, "step": 188 }, { "epoch": 0.10879679335766945, "grad_norm": 2.002145290374756, "learning_rate": 9.999560989441779e-05, "loss": 4.1361, "step": 190 }, { "epoch": 0.1099420227614344, "grad_norm": 1.9663431644439697, "learning_rate": 9.999426601023274e-05, "loss": 4.201, "step": 192 }, { "epoch": 0.11108725216519934, "grad_norm": 2.1406776905059814, "learning_rate": 9.999274295604558e-05, "loss": 4.1086, "step": 194 }, { "epoch": 0.11223248156896429, "grad_norm": 3.3888607025146484, "learning_rate": 9.999104073731458e-05, "loss": 4.2723, "step": 196 }, { "epoch": 0.11337771097272922, "grad_norm": 2.371840715408325, "learning_rate": 9.998915936014024e-05, "loss": 4.1893, "step": 198 }, { "epoch": 0.11452294037649417, "grad_norm": 2.0502302646636963, "learning_rate": 9.998709883126502e-05, "loss": 4.1395, "step": 200 }, { "epoch": 0.11566816978025911, "grad_norm": 1.6674678325653076, "learning_rate": 9.998485915807347e-05, "loss": 4.071, "step": 202 }, { "epoch": 0.11681339918402404, "grad_norm": 1.7829004526138306, "learning_rate": 9.998244034859219e-05, "loss": 4.1107, "step": 204 }, { "epoch": 0.117958628587789, "grad_norm": 1.763493299484253, "learning_rate": 9.997984241148967e-05, "loss": 4.1142, "step": 206 }, { "epoch": 0.11910385799155393, "grad_norm": 2.069258213043213, "learning_rate": 9.997706535607649e-05, "loss": 4.047, "step": 208 }, { "epoch": 0.12024908739531888, "grad_norm": 2.4262139797210693, "learning_rate": 9.997410919230505e-05, "loss": 4.0396, "step": 210 }, { "epoch": 0.12139431679908382, "grad_norm": 1.820494532585144, "learning_rate": 9.997097393076971e-05, "loss": 4.1548, "step": 212 }, { "epoch": 0.12253954620284876, "grad_norm": 2.1332643032073975, "learning_rate": 9.996765958270664e-05, "loss": 4.1384, "step": 214 }, { "epoch": 0.1236847756066137, "grad_norm": 2.1329920291900635, "learning_rate": 9.996416615999384e-05, "loss": 4.0315, "step": 216 }, { "epoch": 0.12483000501037864, "grad_norm": 2.29955792427063, "learning_rate": 9.996049367515108e-05, "loss": 4.0963, "step": 218 }, { "epoch": 0.12597523441414357, "grad_norm": 2.225827693939209, "learning_rate": 9.995664214133983e-05, "loss": 4.1247, "step": 220 }, { "epoch": 0.12712046381790854, "grad_norm": 1.794838786125183, "learning_rate": 9.99526115723633e-05, "loss": 4.0449, "step": 222 }, { "epoch": 0.12826569322167347, "grad_norm": 1.7548491954803467, "learning_rate": 9.994840198266626e-05, "loss": 3.927, "step": 224 }, { "epoch": 0.1294109226254384, "grad_norm": 1.487001895904541, "learning_rate": 9.994401338733508e-05, "loss": 3.9714, "step": 226 }, { "epoch": 0.13055615202920334, "grad_norm": 1.9811242818832397, "learning_rate": 9.993944580209768e-05, "loss": 4.0094, "step": 228 }, { "epoch": 0.13170138143296828, "grad_norm": 1.4257248640060425, "learning_rate": 9.99346992433234e-05, "loss": 4.0213, "step": 230 }, { "epoch": 0.13284661083673324, "grad_norm": 1.545812726020813, "learning_rate": 9.992977372802302e-05, "loss": 4.0076, "step": 232 }, { "epoch": 0.13399184024049818, "grad_norm": 1.8193179368972778, "learning_rate": 9.992466927384865e-05, "loss": 4.0536, "step": 234 }, { "epoch": 0.1351370696442631, "grad_norm": 2.329951763153076, "learning_rate": 9.991938589909369e-05, "loss": 3.9284, "step": 236 }, { "epoch": 0.13628229904802805, "grad_norm": 1.928336501121521, "learning_rate": 9.991392362269276e-05, "loss": 3.9462, "step": 238 }, { "epoch": 0.137427528451793, "grad_norm": 1.4073456525802612, "learning_rate": 9.990828246422164e-05, "loss": 3.9525, "step": 240 }, { "epoch": 0.13857275785555795, "grad_norm": 1.6663973331451416, "learning_rate": 9.990246244389713e-05, "loss": 3.9685, "step": 242 }, { "epoch": 0.13971798725932288, "grad_norm": 1.8091737031936646, "learning_rate": 9.989646358257715e-05, "loss": 3.9284, "step": 244 }, { "epoch": 0.14086321666308782, "grad_norm": 1.5511283874511719, "learning_rate": 9.989028590176044e-05, "loss": 3.9289, "step": 246 }, { "epoch": 0.14200844606685276, "grad_norm": 1.5394625663757324, "learning_rate": 9.988392942358664e-05, "loss": 3.9849, "step": 248 }, { "epoch": 0.14315367547061772, "grad_norm": 1.680882453918457, "learning_rate": 9.98773941708362e-05, "loss": 3.9452, "step": 250 }, { "epoch": 0.14429890487438266, "grad_norm": 1.6341670751571655, "learning_rate": 9.98706801669302e-05, "loss": 3.8317, "step": 252 }, { "epoch": 0.1454441342781476, "grad_norm": 1.9933757781982422, "learning_rate": 9.986378743593036e-05, "loss": 3.9665, "step": 254 }, { "epoch": 0.14658936368191253, "grad_norm": 2.2253994941711426, "learning_rate": 9.985671600253894e-05, "loss": 3.9239, "step": 256 }, { "epoch": 0.14773459308567746, "grad_norm": 2.2543365955352783, "learning_rate": 9.984946589209862e-05, "loss": 3.8639, "step": 258 }, { "epoch": 0.14887982248944243, "grad_norm": 1.8106629848480225, "learning_rate": 9.984203713059241e-05, "loss": 3.9178, "step": 260 }, { "epoch": 0.15002505189320736, "grad_norm": 1.638542652130127, "learning_rate": 9.983442974464362e-05, "loss": 3.9169, "step": 262 }, { "epoch": 0.1511702812969723, "grad_norm": 1.3521384000778198, "learning_rate": 9.982664376151564e-05, "loss": 3.8682, "step": 264 }, { "epoch": 0.15231551070073723, "grad_norm": 1.6458699703216553, "learning_rate": 9.981867920911201e-05, "loss": 3.9566, "step": 266 }, { "epoch": 0.1534607401045022, "grad_norm": 1.7851066589355469, "learning_rate": 9.981053611597615e-05, "loss": 3.9085, "step": 268 }, { "epoch": 0.15460596950826713, "grad_norm": 1.6740517616271973, "learning_rate": 9.980221451129137e-05, "loss": 3.8899, "step": 270 }, { "epoch": 0.15575119891203207, "grad_norm": 1.117129921913147, "learning_rate": 9.979371442488073e-05, "loss": 3.7544, "step": 272 }, { "epoch": 0.156896428315797, "grad_norm": 1.5676058530807495, "learning_rate": 9.978503588720694e-05, "loss": 3.7753, "step": 274 }, { "epoch": 0.15804165771956194, "grad_norm": 1.6609163284301758, "learning_rate": 9.977617892937223e-05, "loss": 3.8463, "step": 276 }, { "epoch": 0.1591868871233269, "grad_norm": 1.7229987382888794, "learning_rate": 9.976714358311828e-05, "loss": 3.8446, "step": 278 }, { "epoch": 0.16033211652709184, "grad_norm": 1.6770962476730347, "learning_rate": 9.975792988082603e-05, "loss": 3.8684, "step": 280 }, { "epoch": 0.16147734593085677, "grad_norm": 1.215281367301941, "learning_rate": 9.974853785551568e-05, "loss": 3.7788, "step": 282 }, { "epoch": 0.1626225753346217, "grad_norm": 1.208257794380188, "learning_rate": 9.973896754084646e-05, "loss": 3.8338, "step": 284 }, { "epoch": 0.16376780473838665, "grad_norm": 1.4068255424499512, "learning_rate": 9.972921897111658e-05, "loss": 3.8583, "step": 286 }, { "epoch": 0.1649130341421516, "grad_norm": 1.4898021221160889, "learning_rate": 9.971929218126306e-05, "loss": 3.8051, "step": 288 }, { "epoch": 0.16605826354591655, "grad_norm": 1.6303211450576782, "learning_rate": 9.970918720686164e-05, "loss": 3.8598, "step": 290 }, { "epoch": 0.16720349294968148, "grad_norm": 1.6599496603012085, "learning_rate": 9.969890408412665e-05, "loss": 3.7214, "step": 292 }, { "epoch": 0.16834872235344642, "grad_norm": 1.1958950757980347, "learning_rate": 9.968844284991086e-05, "loss": 3.7042, "step": 294 }, { "epoch": 0.16949395175721135, "grad_norm": 1.3099420070648193, "learning_rate": 9.967780354170533e-05, "loss": 3.7405, "step": 296 }, { "epoch": 0.17063918116097632, "grad_norm": 1.5054072141647339, "learning_rate": 9.966698619763936e-05, "loss": 3.7827, "step": 298 }, { "epoch": 0.17178441056474125, "grad_norm": 1.444757103919983, "learning_rate": 9.965599085648025e-05, "loss": 3.7361, "step": 300 }, { "epoch": 0.1729296399685062, "grad_norm": 0.9423370361328125, "learning_rate": 9.964481755763322e-05, "loss": 3.7063, "step": 302 }, { "epoch": 0.17407486937227112, "grad_norm": 1.044169306755066, "learning_rate": 9.963346634114128e-05, "loss": 3.7999, "step": 304 }, { "epoch": 0.1752200987760361, "grad_norm": 1.578296184539795, "learning_rate": 9.962193724768503e-05, "loss": 3.7448, "step": 306 }, { "epoch": 0.17636532817980102, "grad_norm": 1.4953491687774658, "learning_rate": 9.961023031858258e-05, "loss": 3.7625, "step": 308 }, { "epoch": 0.17751055758356596, "grad_norm": 1.295817494392395, "learning_rate": 9.959834559578934e-05, "loss": 3.7042, "step": 310 }, { "epoch": 0.1786557869873309, "grad_norm": 1.4001609086990356, "learning_rate": 9.95862831218979e-05, "loss": 3.7272, "step": 312 }, { "epoch": 0.17980101639109583, "grad_norm": 1.8881722688674927, "learning_rate": 9.95740429401379e-05, "loss": 3.6904, "step": 314 }, { "epoch": 0.1809462457948608, "grad_norm": 1.919791340827942, "learning_rate": 9.956162509437584e-05, "loss": 3.7071, "step": 316 }, { "epoch": 0.18209147519862573, "grad_norm": 1.758253574371338, "learning_rate": 9.954902962911494e-05, "loss": 3.7906, "step": 318 }, { "epoch": 0.18323670460239067, "grad_norm": 1.480323314666748, "learning_rate": 9.953625658949494e-05, "loss": 3.7697, "step": 320 }, { "epoch": 0.1843819340061556, "grad_norm": 1.5573948621749878, "learning_rate": 9.952330602129202e-05, "loss": 3.752, "step": 322 }, { "epoch": 0.18552716340992054, "grad_norm": 1.3204878568649292, "learning_rate": 9.951017797091858e-05, "loss": 3.6479, "step": 324 }, { "epoch": 0.1866723928136855, "grad_norm": 1.5514147281646729, "learning_rate": 9.949687248542303e-05, "loss": 3.7199, "step": 326 }, { "epoch": 0.18781762221745044, "grad_norm": 1.2910770177841187, "learning_rate": 9.948338961248977e-05, "loss": 3.7427, "step": 328 }, { "epoch": 0.18896285162121537, "grad_norm": 1.1663178205490112, "learning_rate": 9.946972940043882e-05, "loss": 3.6616, "step": 330 }, { "epoch": 0.1901080810249803, "grad_norm": 1.3439650535583496, "learning_rate": 9.945589189822584e-05, "loss": 3.7385, "step": 332 }, { "epoch": 0.19125331042874527, "grad_norm": 1.1256877183914185, "learning_rate": 9.94418771554418e-05, "loss": 3.6056, "step": 334 }, { "epoch": 0.1923985398325102, "grad_norm": 1.1813896894454956, "learning_rate": 9.942768522231289e-05, "loss": 3.5544, "step": 336 }, { "epoch": 0.19354376923627514, "grad_norm": 1.2541157007217407, "learning_rate": 9.941331614970031e-05, "loss": 3.6401, "step": 338 }, { "epoch": 0.19468899864004008, "grad_norm": 1.237069010734558, "learning_rate": 9.939876998910012e-05, "loss": 3.7564, "step": 340 }, { "epoch": 0.19583422804380501, "grad_norm": 1.1157530546188354, "learning_rate": 9.938404679264301e-05, "loss": 3.6164, "step": 342 }, { "epoch": 0.19697945744756998, "grad_norm": 1.149465560913086, "learning_rate": 9.936914661309412e-05, "loss": 3.6968, "step": 344 }, { "epoch": 0.1981246868513349, "grad_norm": 0.9530683755874634, "learning_rate": 9.93540695038529e-05, "loss": 3.6194, "step": 346 }, { "epoch": 0.19926991625509985, "grad_norm": 1.1686296463012695, "learning_rate": 9.933881551895281e-05, "loss": 3.7604, "step": 348 }, { "epoch": 0.20041514565886479, "grad_norm": 1.2699095010757446, "learning_rate": 9.93233847130613e-05, "loss": 3.6371, "step": 350 }, { "epoch": 0.20156037506262972, "grad_norm": 1.1345208883285522, "learning_rate": 9.930777714147945e-05, "loss": 3.6146, "step": 352 }, { "epoch": 0.20270560446639468, "grad_norm": 1.3319895267486572, "learning_rate": 9.929199286014185e-05, "loss": 3.6443, "step": 354 }, { "epoch": 0.20385083387015962, "grad_norm": 1.6053088903427124, "learning_rate": 9.927603192561637e-05, "loss": 3.6277, "step": 356 }, { "epoch": 0.20499606327392456, "grad_norm": 1.2149386405944824, "learning_rate": 9.925989439510398e-05, "loss": 3.5555, "step": 358 }, { "epoch": 0.2061412926776895, "grad_norm": 1.0859287977218628, "learning_rate": 9.924358032643855e-05, "loss": 3.6253, "step": 360 }, { "epoch": 0.20728652208145446, "grad_norm": 0.9613994359970093, "learning_rate": 9.922708977808663e-05, "loss": 3.5826, "step": 362 }, { "epoch": 0.2084317514852194, "grad_norm": 1.0509222745895386, "learning_rate": 9.921042280914721e-05, "loss": 3.6263, "step": 364 }, { "epoch": 0.20957698088898433, "grad_norm": 1.3777049779891968, "learning_rate": 9.919357947935156e-05, "loss": 3.6187, "step": 366 }, { "epoch": 0.21072221029274926, "grad_norm": 1.3364644050598145, "learning_rate": 9.9176559849063e-05, "loss": 3.5946, "step": 368 }, { "epoch": 0.2118674396965142, "grad_norm": 1.4562104940414429, "learning_rate": 9.915936397927665e-05, "loss": 3.6099, "step": 370 }, { "epoch": 0.21301266910027916, "grad_norm": 1.066383719444275, "learning_rate": 9.91419919316193e-05, "loss": 3.5395, "step": 372 }, { "epoch": 0.2141578985040441, "grad_norm": 1.6498733758926392, "learning_rate": 9.912444376834903e-05, "loss": 3.6083, "step": 374 }, { "epoch": 0.21530312790780903, "grad_norm": 0.9828553795814514, "learning_rate": 9.910671955235518e-05, "loss": 3.5409, "step": 376 }, { "epoch": 0.21644835731157397, "grad_norm": 1.178269624710083, "learning_rate": 9.908881934715798e-05, "loss": 3.6018, "step": 378 }, { "epoch": 0.2175935867153389, "grad_norm": 1.3328818082809448, "learning_rate": 9.907074321690838e-05, "loss": 3.5718, "step": 380 }, { "epoch": 0.21873881611910387, "grad_norm": 1.1077896356582642, "learning_rate": 9.905249122638783e-05, "loss": 3.581, "step": 382 }, { "epoch": 0.2198840455228688, "grad_norm": 1.220638394355774, "learning_rate": 9.903406344100798e-05, "loss": 3.5813, "step": 384 }, { "epoch": 0.22102927492663374, "grad_norm": 1.5574766397476196, "learning_rate": 9.901545992681057e-05, "loss": 3.5785, "step": 386 }, { "epoch": 0.22217450433039868, "grad_norm": 1.013902187347412, "learning_rate": 9.899668075046706e-05, "loss": 3.6156, "step": 388 }, { "epoch": 0.2233197337341636, "grad_norm": 1.197936773300171, "learning_rate": 9.897772597927848e-05, "loss": 3.5428, "step": 390 }, { "epoch": 0.22446496313792857, "grad_norm": 0.9838180541992188, "learning_rate": 9.895859568117512e-05, "loss": 3.534, "step": 392 }, { "epoch": 0.2256101925416935, "grad_norm": 1.0316840410232544, "learning_rate": 9.893928992471639e-05, "loss": 3.5691, "step": 394 }, { "epoch": 0.22675542194545845, "grad_norm": 0.9378739595413208, "learning_rate": 9.891980877909045e-05, "loss": 3.5368, "step": 396 }, { "epoch": 0.22790065134922338, "grad_norm": 1.4947346448898315, "learning_rate": 9.890015231411404e-05, "loss": 3.5709, "step": 398 }, { "epoch": 0.22904588075298835, "grad_norm": 0.9118148684501648, "learning_rate": 9.888032060023225e-05, "loss": 3.527, "step": 400 }, { "epoch": 0.23019111015675328, "grad_norm": 1.2407753467559814, "learning_rate": 9.886031370851816e-05, "loss": 3.5301, "step": 402 }, { "epoch": 0.23133633956051822, "grad_norm": 1.7163093090057373, "learning_rate": 9.88401317106727e-05, "loss": 3.5828, "step": 404 }, { "epoch": 0.23248156896428315, "grad_norm": 1.0757009983062744, "learning_rate": 9.881977467902434e-05, "loss": 3.4831, "step": 406 }, { "epoch": 0.2336267983680481, "grad_norm": 0.9473862648010254, "learning_rate": 9.879924268652885e-05, "loss": 3.5196, "step": 408 }, { "epoch": 0.23477202777181305, "grad_norm": 1.199771761894226, "learning_rate": 9.877853580676897e-05, "loss": 3.574, "step": 410 }, { "epoch": 0.235917257175578, "grad_norm": 0.9006698131561279, "learning_rate": 9.875765411395428e-05, "loss": 3.5348, "step": 412 }, { "epoch": 0.23706248657934292, "grad_norm": 1.1242282390594482, "learning_rate": 9.873659768292081e-05, "loss": 3.5249, "step": 414 }, { "epoch": 0.23820771598310786, "grad_norm": 1.0675747394561768, "learning_rate": 9.871536658913082e-05, "loss": 3.5086, "step": 416 }, { "epoch": 0.2393529453868728, "grad_norm": 0.8544116616249084, "learning_rate": 9.869396090867255e-05, "loss": 3.546, "step": 418 }, { "epoch": 0.24049817479063776, "grad_norm": 1.3136742115020752, "learning_rate": 9.867238071825992e-05, "loss": 3.4937, "step": 420 }, { "epoch": 0.2416434041944027, "grad_norm": 1.3740772008895874, "learning_rate": 9.865062609523223e-05, "loss": 3.4303, "step": 422 }, { "epoch": 0.24278863359816763, "grad_norm": 1.342213749885559, "learning_rate": 9.862869711755397e-05, "loss": 3.4982, "step": 424 }, { "epoch": 0.24393386300193257, "grad_norm": 1.0677942037582397, "learning_rate": 9.860659386381443e-05, "loss": 3.4288, "step": 426 }, { "epoch": 0.24507909240569753, "grad_norm": 0.9615838527679443, "learning_rate": 9.858431641322749e-05, "loss": 3.4787, "step": 428 }, { "epoch": 0.24622432180946247, "grad_norm": 1.0572890043258667, "learning_rate": 9.856186484563134e-05, "loss": 3.5314, "step": 430 }, { "epoch": 0.2473695512132274, "grad_norm": 1.158275842666626, "learning_rate": 9.853923924148815e-05, "loss": 3.5504, "step": 432 }, { "epoch": 0.24851478061699234, "grad_norm": 1.171581745147705, "learning_rate": 9.851643968188383e-05, "loss": 3.5478, "step": 434 }, { "epoch": 0.24966001002075727, "grad_norm": 1.0333714485168457, "learning_rate": 9.849346624852764e-05, "loss": 3.5497, "step": 436 }, { "epoch": 0.2508052394245222, "grad_norm": 0.9459155797958374, "learning_rate": 9.847031902375207e-05, "loss": 3.5074, "step": 438 }, { "epoch": 0.25195046882828714, "grad_norm": 1.0424790382385254, "learning_rate": 9.84469980905124e-05, "loss": 3.4961, "step": 440 }, { "epoch": 0.25309569823205214, "grad_norm": 1.0463571548461914, "learning_rate": 9.842350353238642e-05, "loss": 3.4405, "step": 442 }, { "epoch": 0.25424092763581707, "grad_norm": 1.000319242477417, "learning_rate": 9.839983543357421e-05, "loss": 3.4595, "step": 444 }, { "epoch": 0.255386157039582, "grad_norm": 1.2526150941848755, "learning_rate": 9.837599387889773e-05, "loss": 3.5012, "step": 446 }, { "epoch": 0.25653138644334694, "grad_norm": 1.3148843050003052, "learning_rate": 9.835197895380065e-05, "loss": 3.4767, "step": 448 }, { "epoch": 0.2576766158471119, "grad_norm": 1.3939634561538696, "learning_rate": 9.83277907443479e-05, "loss": 3.3783, "step": 450 }, { "epoch": 0.2588218452508768, "grad_norm": 1.0367929935455322, "learning_rate": 9.830342933722545e-05, "loss": 3.4289, "step": 452 }, { "epoch": 0.25996707465464175, "grad_norm": 0.9439120888710022, "learning_rate": 9.827889481974e-05, "loss": 3.4728, "step": 454 }, { "epoch": 0.2611123040584067, "grad_norm": 1.2146074771881104, "learning_rate": 9.82541872798186e-05, "loss": 3.4257, "step": 456 }, { "epoch": 0.2622575334621716, "grad_norm": 1.0530729293823242, "learning_rate": 9.822930680600841e-05, "loss": 3.4681, "step": 458 }, { "epoch": 0.26340276286593656, "grad_norm": 1.1026678085327148, "learning_rate": 9.820425348747637e-05, "loss": 3.4298, "step": 460 }, { "epoch": 0.26454799226970155, "grad_norm": 1.2520779371261597, "learning_rate": 9.817902741400879e-05, "loss": 3.4191, "step": 462 }, { "epoch": 0.2656932216734665, "grad_norm": 1.1041593551635742, "learning_rate": 9.815362867601121e-05, "loss": 3.466, "step": 464 }, { "epoch": 0.2668384510772314, "grad_norm": 0.881693422794342, "learning_rate": 9.812805736450786e-05, "loss": 3.4929, "step": 466 }, { "epoch": 0.26798368048099636, "grad_norm": 1.3125033378601074, "learning_rate": 9.810231357114152e-05, "loss": 3.4592, "step": 468 }, { "epoch": 0.2691289098847613, "grad_norm": 1.2968268394470215, "learning_rate": 9.807639738817307e-05, "loss": 3.4851, "step": 470 }, { "epoch": 0.2702741392885262, "grad_norm": 0.9855544567108154, "learning_rate": 9.805030890848119e-05, "loss": 3.4487, "step": 472 }, { "epoch": 0.27141936869229116, "grad_norm": 1.3063323497772217, "learning_rate": 9.802404822556209e-05, "loss": 3.4961, "step": 474 }, { "epoch": 0.2725645980960561, "grad_norm": 1.0567957162857056, "learning_rate": 9.79976154335291e-05, "loss": 3.3975, "step": 476 }, { "epoch": 0.27370982749982103, "grad_norm": 0.9473979473114014, "learning_rate": 9.797101062711231e-05, "loss": 3.4573, "step": 478 }, { "epoch": 0.274855056903586, "grad_norm": 1.2931294441223145, "learning_rate": 9.794423390165837e-05, "loss": 3.3732, "step": 480 }, { "epoch": 0.27600028630735096, "grad_norm": 1.233302116394043, "learning_rate": 9.791728535312998e-05, "loss": 3.419, "step": 482 }, { "epoch": 0.2771455157111159, "grad_norm": 0.9638918042182922, "learning_rate": 9.789016507810564e-05, "loss": 3.4119, "step": 484 }, { "epoch": 0.27829074511488083, "grad_norm": 1.105643391609192, "learning_rate": 9.786287317377929e-05, "loss": 3.3909, "step": 486 }, { "epoch": 0.27943597451864577, "grad_norm": 0.9666796922683716, "learning_rate": 9.783540973795998e-05, "loss": 3.4194, "step": 488 }, { "epoch": 0.2805812039224107, "grad_norm": 1.3533586263656616, "learning_rate": 9.780777486907146e-05, "loss": 3.3789, "step": 490 }, { "epoch": 0.28172643332617564, "grad_norm": 1.1253416538238525, "learning_rate": 9.777996866615186e-05, "loss": 3.4385, "step": 492 }, { "epoch": 0.2828716627299406, "grad_norm": 0.7198868989944458, "learning_rate": 9.775199122885339e-05, "loss": 3.4038, "step": 494 }, { "epoch": 0.2840168921337055, "grad_norm": 0.9696770310401917, "learning_rate": 9.772384265744188e-05, "loss": 3.4576, "step": 496 }, { "epoch": 0.28516212153747045, "grad_norm": 1.321269154548645, "learning_rate": 9.76955230527965e-05, "loss": 3.4348, "step": 498 }, { "epoch": 0.28630735094123544, "grad_norm": 1.3119802474975586, "learning_rate": 9.766703251640934e-05, "loss": 3.3848, "step": 500 }, { "epoch": 0.2874525803450004, "grad_norm": 1.0199967622756958, "learning_rate": 9.763837115038513e-05, "loss": 3.4108, "step": 502 }, { "epoch": 0.2885978097487653, "grad_norm": 0.9925194382667542, "learning_rate": 9.760953905744075e-05, "loss": 3.31, "step": 504 }, { "epoch": 0.28974303915253025, "grad_norm": 0.9447107315063477, "learning_rate": 9.758053634090502e-05, "loss": 3.3598, "step": 506 }, { "epoch": 0.2908882685562952, "grad_norm": 1.052873134613037, "learning_rate": 9.755136310471817e-05, "loss": 3.3704, "step": 508 }, { "epoch": 0.2920334979600601, "grad_norm": 1.061514139175415, "learning_rate": 9.752201945343156e-05, "loss": 3.3642, "step": 510 }, { "epoch": 0.29317872736382505, "grad_norm": 0.8627074956893921, "learning_rate": 9.74925054922073e-05, "loss": 3.367, "step": 512 }, { "epoch": 0.29432395676759, "grad_norm": 1.0214530229568481, "learning_rate": 9.746282132681785e-05, "loss": 3.3266, "step": 514 }, { "epoch": 0.2954691861713549, "grad_norm": 1.1223275661468506, "learning_rate": 9.743296706364565e-05, "loss": 3.4194, "step": 516 }, { "epoch": 0.2966144155751199, "grad_norm": 0.9849138259887695, "learning_rate": 9.740294280968273e-05, "loss": 3.3664, "step": 518 }, { "epoch": 0.29775964497888485, "grad_norm": 0.7025099396705627, "learning_rate": 9.737274867253034e-05, "loss": 3.3772, "step": 520 }, { "epoch": 0.2989048743826498, "grad_norm": 0.936536967754364, "learning_rate": 9.734238476039858e-05, "loss": 3.3196, "step": 522 }, { "epoch": 0.3000501037864147, "grad_norm": 1.113277792930603, "learning_rate": 9.731185118210598e-05, "loss": 3.4606, "step": 524 }, { "epoch": 0.30119533319017966, "grad_norm": 1.0153186321258545, "learning_rate": 9.728114804707909e-05, "loss": 3.4079, "step": 526 }, { "epoch": 0.3023405625939446, "grad_norm": 1.1675206422805786, "learning_rate": 9.725027546535215e-05, "loss": 3.4111, "step": 528 }, { "epoch": 0.30348579199770953, "grad_norm": 0.9518959522247314, "learning_rate": 9.721923354756665e-05, "loss": 3.3905, "step": 530 }, { "epoch": 0.30463102140147447, "grad_norm": 0.9693425297737122, "learning_rate": 9.718802240497098e-05, "loss": 3.4364, "step": 532 }, { "epoch": 0.3057762508052394, "grad_norm": 1.1249076128005981, "learning_rate": 9.715664214941997e-05, "loss": 3.3373, "step": 534 }, { "epoch": 0.3069214802090044, "grad_norm": 0.8406875133514404, "learning_rate": 9.712509289337453e-05, "loss": 3.321, "step": 536 }, { "epoch": 0.30806670961276933, "grad_norm": 0.9538395404815674, "learning_rate": 9.709337474990121e-05, "loss": 3.4007, "step": 538 }, { "epoch": 0.30921193901653427, "grad_norm": 0.8003599047660828, "learning_rate": 9.706148783267187e-05, "loss": 3.3798, "step": 540 }, { "epoch": 0.3103571684202992, "grad_norm": 0.8605026602745056, "learning_rate": 9.702943225596316e-05, "loss": 3.2908, "step": 542 }, { "epoch": 0.31150239782406414, "grad_norm": 0.7349815964698792, "learning_rate": 9.699720813465625e-05, "loss": 3.408, "step": 544 }, { "epoch": 0.3126476272278291, "grad_norm": 1.1622780561447144, "learning_rate": 9.696481558423628e-05, "loss": 3.3212, "step": 546 }, { "epoch": 0.313792856631594, "grad_norm": 0.9829496145248413, "learning_rate": 9.693225472079204e-05, "loss": 3.4067, "step": 548 }, { "epoch": 0.31493808603535894, "grad_norm": 1.1378313302993774, "learning_rate": 9.689952566101548e-05, "loss": 3.3556, "step": 550 }, { "epoch": 0.3160833154391239, "grad_norm": 0.9355561137199402, "learning_rate": 9.686662852220142e-05, "loss": 3.3281, "step": 552 }, { "epoch": 0.3172285448428888, "grad_norm": 0.9328277111053467, "learning_rate": 9.683356342224694e-05, "loss": 3.313, "step": 554 }, { "epoch": 0.3183737742466538, "grad_norm": 1.277377724647522, "learning_rate": 9.680033047965114e-05, "loss": 3.3499, "step": 556 }, { "epoch": 0.31951900365041874, "grad_norm": 1.0239235162734985, "learning_rate": 9.67669298135146e-05, "loss": 3.3936, "step": 558 }, { "epoch": 0.3206642330541837, "grad_norm": 0.6908963322639465, "learning_rate": 9.673336154353899e-05, "loss": 3.3584, "step": 560 }, { "epoch": 0.3218094624579486, "grad_norm": 0.8835290670394897, "learning_rate": 9.669962579002664e-05, "loss": 3.3728, "step": 562 }, { "epoch": 0.32295469186171355, "grad_norm": 1.0561710596084595, "learning_rate": 9.666572267388013e-05, "loss": 3.3579, "step": 564 }, { "epoch": 0.3240999212654785, "grad_norm": 0.8400120735168457, "learning_rate": 9.663165231660181e-05, "loss": 3.3224, "step": 566 }, { "epoch": 0.3252451506692434, "grad_norm": 0.8960584998130798, "learning_rate": 9.659741484029341e-05, "loss": 3.3434, "step": 568 }, { "epoch": 0.32639038007300836, "grad_norm": 0.9615944027900696, "learning_rate": 9.656301036765558e-05, "loss": 3.2587, "step": 570 }, { "epoch": 0.3275356094767733, "grad_norm": 0.983391523361206, "learning_rate": 9.652843902198743e-05, "loss": 3.2396, "step": 572 }, { "epoch": 0.3286808388805383, "grad_norm": 0.7758197784423828, "learning_rate": 9.649370092718615e-05, "loss": 3.2948, "step": 574 }, { "epoch": 0.3298260682843032, "grad_norm": 0.9714862704277039, "learning_rate": 9.64587962077465e-05, "loss": 3.3381, "step": 576 }, { "epoch": 0.33097129768806816, "grad_norm": 0.8628116846084595, "learning_rate": 9.64237249887604e-05, "loss": 3.294, "step": 578 }, { "epoch": 0.3321165270918331, "grad_norm": 0.9794777035713196, "learning_rate": 9.638848739591646e-05, "loss": 3.3119, "step": 580 }, { "epoch": 0.333261756495598, "grad_norm": 0.8179820775985718, "learning_rate": 9.635308355549957e-05, "loss": 3.3009, "step": 582 }, { "epoch": 0.33440698589936296, "grad_norm": 0.8732323050498962, "learning_rate": 9.63175135943904e-05, "loss": 3.3207, "step": 584 }, { "epoch": 0.3355522153031279, "grad_norm": 1.0355788469314575, "learning_rate": 9.628177764006497e-05, "loss": 3.2889, "step": 586 }, { "epoch": 0.33669744470689283, "grad_norm": 0.8974720239639282, "learning_rate": 9.624587582059417e-05, "loss": 3.3089, "step": 588 }, { "epoch": 0.33784267411065777, "grad_norm": 0.7800531387329102, "learning_rate": 9.620980826464335e-05, "loss": 3.2999, "step": 590 }, { "epoch": 0.3389879035144227, "grad_norm": 0.7294676899909973, "learning_rate": 9.617357510147182e-05, "loss": 3.3634, "step": 592 }, { "epoch": 0.3401331329181877, "grad_norm": 0.7799131274223328, "learning_rate": 9.613717646093239e-05, "loss": 3.308, "step": 594 }, { "epoch": 0.34127836232195263, "grad_norm": 0.9899328947067261, "learning_rate": 9.610061247347091e-05, "loss": 3.3191, "step": 596 }, { "epoch": 0.34242359172571757, "grad_norm": 1.0520347356796265, "learning_rate": 9.606388327012579e-05, "loss": 3.389, "step": 598 }, { "epoch": 0.3435688211294825, "grad_norm": 0.9768466353416443, "learning_rate": 9.602698898252756e-05, "loss": 3.2905, "step": 600 } ], "logging_steps": 2, "max_steps": 3494, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.755100672393216e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }