{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.17178441056474125,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005726147018824708,
      "grad_norm": 7.761023998260498,
      "learning_rate": 0.0,
      "loss": 6.0592,
      "step": 1
    },
    {
      "epoch": 0.0011452294037649416,
      "grad_norm": 7.8541951179504395,
      "learning_rate": 5.714285714285715e-07,
      "loss": 6.0156,
      "step": 2
    },
    {
      "epoch": 0.0022904588075298832,
      "grad_norm": 7.347611904144287,
      "learning_rate": 1.7142857142857145e-06,
      "loss": 6.0103,
      "step": 4
    },
    {
      "epoch": 0.003435688211294825,
      "grad_norm": 5.382428169250488,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 5.9221,
      "step": 6
    },
    {
      "epoch": 0.0045809176150597665,
      "grad_norm": 5.063406467437744,
      "learning_rate": 4.000000000000001e-06,
      "loss": 6.0365,
      "step": 8
    },
    {
      "epoch": 0.005726147018824708,
      "grad_norm": 9.779157638549805,
      "learning_rate": 5.142857142857143e-06,
      "loss": 6.0336,
      "step": 10
    },
    {
      "epoch": 0.00687137642258965,
      "grad_norm": 7.555446147918701,
      "learning_rate": 6.285714285714287e-06,
      "loss": 6.0328,
      "step": 12
    },
    {
      "epoch": 0.008016605826354592,
      "grad_norm": 6.790043354034424,
      "learning_rate": 7.428571428571429e-06,
      "loss": 5.7848,
      "step": 14
    },
    {
      "epoch": 0.009161835230119533,
      "grad_norm": 4.4132208824157715,
      "learning_rate": 8.571428571428573e-06,
      "loss": 5.8207,
      "step": 16
    },
    {
      "epoch": 0.010307064633884476,
      "grad_norm": 4.064995765686035,
      "learning_rate": 9.714285714285715e-06,
      "loss": 5.6497,
      "step": 18
    },
    {
      "epoch": 0.011452294037649417,
      "grad_norm": 3.357184410095215,
      "learning_rate": 1.0857142857142858e-05,
      "loss": 5.7758,
      "step": 20
    },
    {
      "epoch": 0.012597523441414358,
      "grad_norm": 2.742230176925659,
      "learning_rate": 1.2e-05,
      "loss": 5.6173,
      "step": 22
    },
    {
      "epoch": 0.0137427528451793,
      "grad_norm": 2.491459369659424,
      "learning_rate": 1.3142857142857143e-05,
      "loss": 5.6681,
      "step": 24
    },
    {
      "epoch": 0.014887982248944241,
      "grad_norm": 2.7569029331207275,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 5.6393,
      "step": 26
    },
    {
      "epoch": 0.016033211652709184,
      "grad_norm": 2.208378791809082,
      "learning_rate": 1.5428571428571428e-05,
      "loss": 5.5768,
      "step": 28
    },
    {
      "epoch": 0.017178441056474127,
      "grad_norm": 3.2770133018493652,
      "learning_rate": 1.657142857142857e-05,
      "loss": 5.484,
      "step": 30
    },
    {
      "epoch": 0.018323670460239066,
      "grad_norm": 3.177299976348877,
      "learning_rate": 1.7714285714285713e-05,
      "loss": 5.528,
      "step": 32
    },
    {
      "epoch": 0.01946889986400401,
      "grad_norm": 2.1981537342071533,
      "learning_rate": 1.885714285714286e-05,
      "loss": 5.6327,
      "step": 34
    },
    {
      "epoch": 0.02061412926776895,
      "grad_norm": 3.265881061553955,
      "learning_rate": 2e-05,
      "loss": 5.6288,
      "step": 36
    },
    {
      "epoch": 0.02175935867153389,
      "grad_norm": 3.6059298515319824,
      "learning_rate": 2.1142857142857144e-05,
      "loss": 5.4789,
      "step": 38
    },
    {
      "epoch": 0.022904588075298833,
      "grad_norm": 2.4080026149749756,
      "learning_rate": 2.2285714285714287e-05,
      "loss": 5.4046,
      "step": 40
    },
    {
      "epoch": 0.024049817479063776,
      "grad_norm": 2.142902135848999,
      "learning_rate": 2.342857142857143e-05,
      "loss": 5.4738,
      "step": 42
    },
    {
      "epoch": 0.025195046882828715,
      "grad_norm": 2.4021224975585938,
      "learning_rate": 2.4571428571428572e-05,
      "loss": 5.4649,
      "step": 44
    },
    {
      "epoch": 0.026340276286593658,
      "grad_norm": 2.172009229660034,
      "learning_rate": 2.5714285714285714e-05,
      "loss": 5.4302,
      "step": 46
    },
    {
      "epoch": 0.0274855056903586,
      "grad_norm": 2.9737730026245117,
      "learning_rate": 2.6857142857142857e-05,
      "loss": 5.3045,
      "step": 48
    },
    {
      "epoch": 0.028630735094123543,
      "grad_norm": 3.0378615856170654,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 5.2185,
      "step": 50
    },
    {
      "epoch": 0.029775964497888482,
      "grad_norm": 3.4448676109313965,
      "learning_rate": 2.9142857142857146e-05,
      "loss": 5.1838,
      "step": 52
    },
    {
      "epoch": 0.030921193901653425,
      "grad_norm": 2.469245672225952,
      "learning_rate": 3.0285714285714288e-05,
      "loss": 5.1637,
      "step": 54
    },
    {
      "epoch": 0.03206642330541837,
      "grad_norm": 3.58486008644104,
      "learning_rate": 3.142857142857143e-05,
      "loss": 5.2063,
      "step": 56
    },
    {
      "epoch": 0.03321165270918331,
      "grad_norm": 3.0815446376800537,
      "learning_rate": 3.257142857142857e-05,
      "loss": 5.2317,
      "step": 58
    },
    {
      "epoch": 0.03435688211294825,
      "grad_norm": 3.6842119693756104,
      "learning_rate": 3.3714285714285716e-05,
      "loss": 5.2695,
      "step": 60
    },
    {
      "epoch": 0.03550211151671319,
      "grad_norm": 2.9440791606903076,
      "learning_rate": 3.485714285714286e-05,
      "loss": 5.2686,
      "step": 62
    },
    {
      "epoch": 0.03664734092047813,
      "grad_norm": 3.9632568359375,
      "learning_rate": 3.6e-05,
      "loss": 5.1262,
      "step": 64
    },
    {
      "epoch": 0.037792570324243074,
      "grad_norm": 4.045065402984619,
      "learning_rate": 3.7142857142857143e-05,
      "loss": 5.1546,
      "step": 66
    },
    {
      "epoch": 0.03893779972800802,
      "grad_norm": 3.5707085132598877,
      "learning_rate": 3.8285714285714286e-05,
      "loss": 5.0036,
      "step": 68
    },
    {
      "epoch": 0.04008302913177296,
      "grad_norm": 3.014404535293579,
      "learning_rate": 3.942857142857143e-05,
      "loss": 5.026,
      "step": 70
    },
    {
      "epoch": 0.0412282585355379,
      "grad_norm": 2.708796977996826,
      "learning_rate": 4.057142857142857e-05,
      "loss": 4.9442,
      "step": 72
    },
    {
      "epoch": 0.04237348793930284,
      "grad_norm": 2.5384011268615723,
      "learning_rate": 4.1714285714285714e-05,
      "loss": 5.0223,
      "step": 74
    },
    {
      "epoch": 0.04351871734306778,
      "grad_norm": 3.006281852722168,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 4.9827,
      "step": 76
    },
    {
      "epoch": 0.044663946746832724,
      "grad_norm": 2.5772130489349365,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 4.9675,
      "step": 78
    },
    {
      "epoch": 0.045809176150597666,
      "grad_norm": 3.456017255783081,
      "learning_rate": 4.514285714285714e-05,
      "loss": 5.0341,
      "step": 80
    },
    {
      "epoch": 0.04695440555436261,
      "grad_norm": 3.3163113594055176,
      "learning_rate": 4.628571428571429e-05,
      "loss": 4.9867,
      "step": 82
    },
    {
      "epoch": 0.04809963495812755,
      "grad_norm": 3.7568469047546387,
      "learning_rate": 4.742857142857143e-05,
      "loss": 4.8652,
      "step": 84
    },
    {
      "epoch": 0.049244864361892494,
      "grad_norm": 4.19318151473999,
      "learning_rate": 4.8571428571428576e-05,
      "loss": 5.0602,
      "step": 86
    },
    {
      "epoch": 0.05039009376565743,
      "grad_norm": 5.1034064292907715,
      "learning_rate": 4.971428571428572e-05,
      "loss": 4.9757,
      "step": 88
    },
    {
      "epoch": 0.05153532316942237,
      "grad_norm": 4.0827484130859375,
      "learning_rate": 5.085714285714286e-05,
      "loss": 4.8486,
      "step": 90
    },
    {
      "epoch": 0.052680552573187316,
      "grad_norm": 4.6189446449279785,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 4.9595,
      "step": 92
    },
    {
      "epoch": 0.05382578197695226,
      "grad_norm": 3.988513469696045,
      "learning_rate": 5.314285714285715e-05,
      "loss": 4.9035,
      "step": 94
    },
    {
      "epoch": 0.0549710113807172,
      "grad_norm": 3.857276678085327,
      "learning_rate": 5.428571428571428e-05,
      "loss": 4.8277,
      "step": 96
    },
    {
      "epoch": 0.056116240784482144,
      "grad_norm": 3.5372354984283447,
      "learning_rate": 5.542857142857143e-05,
      "loss": 4.7718,
      "step": 98
    },
    {
      "epoch": 0.057261470188247086,
      "grad_norm": 3.3853676319122314,
      "learning_rate": 5.6571428571428574e-05,
      "loss": 4.8098,
      "step": 100
    },
    {
      "epoch": 0.05840669959201202,
      "grad_norm": 2.1142077445983887,
      "learning_rate": 5.771428571428572e-05,
      "loss": 4.7975,
      "step": 102
    },
    {
      "epoch": 0.059551928995776965,
      "grad_norm": 3.2275538444519043,
      "learning_rate": 5.885714285714285e-05,
      "loss": 4.8509,
      "step": 104
    },
    {
      "epoch": 0.06069715839954191,
      "grad_norm": 3.5413126945495605,
      "learning_rate": 6e-05,
      "loss": 4.6069,
      "step": 106
    },
    {
      "epoch": 0.06184238780330685,
      "grad_norm": 2.755648374557495,
      "learning_rate": 6.114285714285714e-05,
      "loss": 4.6951,
      "step": 108
    },
    {
      "epoch": 0.06298761720707179,
      "grad_norm": 2.980039596557617,
      "learning_rate": 6.22857142857143e-05,
      "loss": 4.7012,
      "step": 110
    },
    {
      "epoch": 0.06413284661083674,
      "grad_norm": 4.890020370483398,
      "learning_rate": 6.342857142857143e-05,
      "loss": 4.8008,
      "step": 112
    },
    {
      "epoch": 0.06527807601460167,
      "grad_norm": 4.35846471786499,
      "learning_rate": 6.457142857142856e-05,
      "loss": 4.8587,
      "step": 114
    },
    {
      "epoch": 0.06642330541836662,
      "grad_norm": 3.6171813011169434,
      "learning_rate": 6.571428571428571e-05,
      "loss": 4.7473,
      "step": 116
    },
    {
      "epoch": 0.06756853482213156,
      "grad_norm": 2.4927010536193848,
      "learning_rate": 6.685714285714286e-05,
      "loss": 4.7113,
      "step": 118
    },
    {
      "epoch": 0.0687137642258965,
      "grad_norm": 3.3327009677886963,
      "learning_rate": 6.800000000000001e-05,
      "loss": 4.6105,
      "step": 120
    },
    {
      "epoch": 0.06985899362966144,
      "grad_norm": 3.1123206615448,
      "learning_rate": 6.914285714285715e-05,
      "loss": 4.5968,
      "step": 122
    },
    {
      "epoch": 0.07100422303342638,
      "grad_norm": 2.6985421180725098,
      "learning_rate": 7.028571428571428e-05,
      "loss": 4.6323,
      "step": 124
    },
    {
      "epoch": 0.07214945243719133,
      "grad_norm": 2.058084011077881,
      "learning_rate": 7.142857142857143e-05,
      "loss": 4.5721,
      "step": 126
    },
    {
      "epoch": 0.07329468184095626,
      "grad_norm": 2.144658327102661,
      "learning_rate": 7.257142857142858e-05,
      "loss": 4.6125,
      "step": 128
    },
    {
      "epoch": 0.07443991124472121,
      "grad_norm": 2.477219820022583,
      "learning_rate": 7.371428571428572e-05,
      "loss": 4.4727,
      "step": 130
    },
    {
      "epoch": 0.07558514064848615,
      "grad_norm": 3.8517298698425293,
      "learning_rate": 7.485714285714285e-05,
      "loss": 4.5696,
      "step": 132
    },
    {
      "epoch": 0.0767303700522511,
      "grad_norm": 3.0253565311431885,
      "learning_rate": 7.6e-05,
      "loss": 4.4838,
      "step": 134
    },
    {
      "epoch": 0.07787559945601603,
      "grad_norm": 3.397569179534912,
      "learning_rate": 7.714285714285715e-05,
      "loss": 4.6431,
      "step": 136
    },
    {
      "epoch": 0.07902082885978097,
      "grad_norm": 2.435197114944458,
      "learning_rate": 7.828571428571429e-05,
      "loss": 4.4681,
      "step": 138
    },
    {
      "epoch": 0.08016605826354592,
      "grad_norm": 2.6476476192474365,
      "learning_rate": 7.942857142857143e-05,
      "loss": 4.4462,
      "step": 140
    },
    {
      "epoch": 0.08131128766731086,
      "grad_norm": 2.1929690837860107,
      "learning_rate": 8.057142857142857e-05,
      "loss": 4.5136,
      "step": 142
    },
    {
      "epoch": 0.0824565170710758,
      "grad_norm": 2.4533395767211914,
      "learning_rate": 8.171428571428572e-05,
      "loss": 4.5572,
      "step": 144
    },
    {
      "epoch": 0.08360174647484074,
      "grad_norm": 2.601806879043579,
      "learning_rate": 8.285714285714287e-05,
      "loss": 4.4121,
      "step": 146
    },
    {
      "epoch": 0.08474697587860568,
      "grad_norm": 3.233973741531372,
      "learning_rate": 8.4e-05,
      "loss": 4.4599,
      "step": 148
    },
    {
      "epoch": 0.08589220528237063,
      "grad_norm": 2.6353538036346436,
      "learning_rate": 8.514285714285714e-05,
      "loss": 4.4533,
      "step": 150
    },
    {
      "epoch": 0.08703743468613556,
      "grad_norm": 2.8465511798858643,
      "learning_rate": 8.62857142857143e-05,
      "loss": 4.5246,
      "step": 152
    },
    {
      "epoch": 0.08818266408990051,
      "grad_norm": 2.8642711639404297,
      "learning_rate": 8.742857142857144e-05,
      "loss": 4.4659,
      "step": 154
    },
    {
      "epoch": 0.08932789349366545,
      "grad_norm": 2.793112277984619,
      "learning_rate": 8.857142857142857e-05,
      "loss": 4.5107,
      "step": 156
    },
    {
      "epoch": 0.0904731228974304,
      "grad_norm": 3.43472957611084,
      "learning_rate": 8.971428571428571e-05,
      "loss": 4.4079,
      "step": 158
    },
    {
      "epoch": 0.09161835230119533,
      "grad_norm": 2.9260294437408447,
      "learning_rate": 9.085714285714286e-05,
      "loss": 4.4047,
      "step": 160
    },
    {
      "epoch": 0.09276358170496027,
      "grad_norm": 2.6336724758148193,
      "learning_rate": 9.200000000000001e-05,
      "loss": 4.4777,
      "step": 162
    },
    {
      "epoch": 0.09390881110872522,
      "grad_norm": 2.8348231315612793,
      "learning_rate": 9.314285714285715e-05,
      "loss": 4.3445,
      "step": 164
    },
    {
      "epoch": 0.09505404051249015,
      "grad_norm": 4.271595478057861,
      "learning_rate": 9.428571428571429e-05,
      "loss": 4.4234,
      "step": 166
    },
    {
      "epoch": 0.0961992699162551,
      "grad_norm": 3.4789109230041504,
      "learning_rate": 9.542857142857143e-05,
      "loss": 4.2872,
      "step": 168
    },
    {
      "epoch": 0.09734449932002004,
      "grad_norm": 2.57273530960083,
      "learning_rate": 9.657142857142858e-05,
      "loss": 4.4177,
      "step": 170
    },
    {
      "epoch": 0.09848972872378499,
      "grad_norm": 2.185086250305176,
      "learning_rate": 9.771428571428572e-05,
      "loss": 4.3568,
      "step": 172
    },
    {
      "epoch": 0.09963495812754992,
      "grad_norm": 2.771744966506958,
      "learning_rate": 9.885714285714286e-05,
      "loss": 4.3392,
      "step": 174
    },
    {
      "epoch": 0.10078018753131486,
      "grad_norm": 1.950353741645813,
      "learning_rate": 0.0001,
      "loss": 4.1931,
      "step": 176
    },
    {
      "epoch": 0.10192541693507981,
      "grad_norm": 2.4709694385528564,
      "learning_rate": 9.999991040472416e-05,
      "loss": 4.2936,
      "step": 178
    },
    {
      "epoch": 0.10307064633884475,
      "grad_norm": 2.140997886657715,
      "learning_rate": 9.999964161921776e-05,
      "loss": 4.1653,
      "step": 180
    },
    {
      "epoch": 0.1042158757426097,
      "grad_norm": 2.491321563720703,
      "learning_rate": 9.999919364444403e-05,
      "loss": 4.3202,
      "step": 182
    },
    {
      "epoch": 0.10536110514637463,
      "grad_norm": 2.5410189628601074,
      "learning_rate": 9.999856648200845e-05,
      "loss": 4.2657,
      "step": 184
    },
    {
      "epoch": 0.10650633455013958,
      "grad_norm": 2.1820590496063232,
      "learning_rate": 9.999776013415866e-05,
      "loss": 4.2282,
      "step": 186
    },
    {
      "epoch": 0.10765156395390452,
      "grad_norm": 1.7251808643341064,
      "learning_rate": 9.999677460378444e-05,
      "loss": 4.3421,
      "step": 188
    },
    {
      "epoch": 0.10879679335766945,
      "grad_norm": 2.002145290374756,
      "learning_rate": 9.999560989441779e-05,
      "loss": 4.1361,
      "step": 190
    },
    {
      "epoch": 0.1099420227614344,
      "grad_norm": 1.9663431644439697,
      "learning_rate": 9.999426601023274e-05,
      "loss": 4.201,
      "step": 192
    },
    {
      "epoch": 0.11108725216519934,
      "grad_norm": 2.1406776905059814,
      "learning_rate": 9.999274295604558e-05,
      "loss": 4.1086,
      "step": 194
    },
    {
      "epoch": 0.11223248156896429,
      "grad_norm": 3.3888607025146484,
      "learning_rate": 9.999104073731458e-05,
      "loss": 4.2723,
      "step": 196
    },
    {
      "epoch": 0.11337771097272922,
      "grad_norm": 2.371840715408325,
      "learning_rate": 9.998915936014024e-05,
      "loss": 4.1893,
      "step": 198
    },
    {
      "epoch": 0.11452294037649417,
      "grad_norm": 2.0502302646636963,
      "learning_rate": 9.998709883126502e-05,
      "loss": 4.1395,
      "step": 200
    },
    {
      "epoch": 0.11566816978025911,
      "grad_norm": 1.6674678325653076,
      "learning_rate": 9.998485915807347e-05,
      "loss": 4.071,
      "step": 202
    },
    {
      "epoch": 0.11681339918402404,
      "grad_norm": 1.7829004526138306,
      "learning_rate": 9.998244034859219e-05,
      "loss": 4.1107,
      "step": 204
    },
    {
      "epoch": 0.117958628587789,
      "grad_norm": 1.763493299484253,
      "learning_rate": 9.997984241148967e-05,
      "loss": 4.1142,
      "step": 206
    },
    {
      "epoch": 0.11910385799155393,
      "grad_norm": 2.069258213043213,
      "learning_rate": 9.997706535607649e-05,
      "loss": 4.047,
      "step": 208
    },
    {
      "epoch": 0.12024908739531888,
      "grad_norm": 2.4262139797210693,
      "learning_rate": 9.997410919230505e-05,
      "loss": 4.0396,
      "step": 210
    },
    {
      "epoch": 0.12139431679908382,
      "grad_norm": 1.820494532585144,
      "learning_rate": 9.997097393076971e-05,
      "loss": 4.1548,
      "step": 212
    },
    {
      "epoch": 0.12253954620284876,
      "grad_norm": 2.1332643032073975,
      "learning_rate": 9.996765958270664e-05,
      "loss": 4.1384,
      "step": 214
    },
    {
      "epoch": 0.1236847756066137,
      "grad_norm": 2.1329920291900635,
      "learning_rate": 9.996416615999384e-05,
      "loss": 4.0315,
      "step": 216
    },
    {
      "epoch": 0.12483000501037864,
      "grad_norm": 2.29955792427063,
      "learning_rate": 9.996049367515108e-05,
      "loss": 4.0963,
      "step": 218
    },
    {
      "epoch": 0.12597523441414357,
      "grad_norm": 2.225827693939209,
      "learning_rate": 9.995664214133983e-05,
      "loss": 4.1247,
      "step": 220
    },
    {
      "epoch": 0.12712046381790854,
      "grad_norm": 1.794838786125183,
      "learning_rate": 9.99526115723633e-05,
      "loss": 4.0449,
      "step": 222
    },
    {
      "epoch": 0.12826569322167347,
      "grad_norm": 1.7548491954803467,
      "learning_rate": 9.994840198266626e-05,
      "loss": 3.927,
      "step": 224
    },
    {
      "epoch": 0.1294109226254384,
      "grad_norm": 1.487001895904541,
      "learning_rate": 9.994401338733508e-05,
      "loss": 3.9714,
      "step": 226
    },
    {
      "epoch": 0.13055615202920334,
      "grad_norm": 1.9811242818832397,
      "learning_rate": 9.993944580209768e-05,
      "loss": 4.0094,
      "step": 228
    },
    {
      "epoch": 0.13170138143296828,
      "grad_norm": 1.4257248640060425,
      "learning_rate": 9.99346992433234e-05,
      "loss": 4.0213,
      "step": 230
    },
    {
      "epoch": 0.13284661083673324,
      "grad_norm": 1.545812726020813,
      "learning_rate": 9.992977372802302e-05,
      "loss": 4.0076,
      "step": 232
    },
    {
      "epoch": 0.13399184024049818,
      "grad_norm": 1.8193179368972778,
      "learning_rate": 9.992466927384865e-05,
      "loss": 4.0536,
      "step": 234
    },
    {
      "epoch": 0.1351370696442631,
      "grad_norm": 2.329951763153076,
      "learning_rate": 9.991938589909369e-05,
      "loss": 3.9284,
      "step": 236
    },
    {
      "epoch": 0.13628229904802805,
      "grad_norm": 1.928336501121521,
      "learning_rate": 9.991392362269276e-05,
      "loss": 3.9462,
      "step": 238
    },
    {
      "epoch": 0.137427528451793,
      "grad_norm": 1.4073456525802612,
      "learning_rate": 9.990828246422164e-05,
      "loss": 3.9525,
      "step": 240
    },
    {
      "epoch": 0.13857275785555795,
      "grad_norm": 1.6663973331451416,
      "learning_rate": 9.990246244389713e-05,
      "loss": 3.9685,
      "step": 242
    },
    {
      "epoch": 0.13971798725932288,
      "grad_norm": 1.8091737031936646,
      "learning_rate": 9.989646358257715e-05,
      "loss": 3.9284,
      "step": 244
    },
    {
      "epoch": 0.14086321666308782,
      "grad_norm": 1.5511283874511719,
      "learning_rate": 9.989028590176044e-05,
      "loss": 3.9289,
      "step": 246
    },
    {
      "epoch": 0.14200844606685276,
      "grad_norm": 1.5394625663757324,
      "learning_rate": 9.988392942358664e-05,
      "loss": 3.9849,
      "step": 248
    },
    {
      "epoch": 0.14315367547061772,
      "grad_norm": 1.680882453918457,
      "learning_rate": 9.98773941708362e-05,
      "loss": 3.9452,
      "step": 250
    },
    {
      "epoch": 0.14429890487438266,
      "grad_norm": 1.6341670751571655,
      "learning_rate": 9.98706801669302e-05,
      "loss": 3.8317,
      "step": 252
    },
    {
      "epoch": 0.1454441342781476,
      "grad_norm": 1.9933757781982422,
      "learning_rate": 9.986378743593036e-05,
      "loss": 3.9665,
      "step": 254
    },
    {
      "epoch": 0.14658936368191253,
      "grad_norm": 2.2253994941711426,
      "learning_rate": 9.985671600253894e-05,
      "loss": 3.9239,
      "step": 256
    },
    {
      "epoch": 0.14773459308567746,
      "grad_norm": 2.2543365955352783,
      "learning_rate": 9.984946589209862e-05,
      "loss": 3.8639,
      "step": 258
    },
    {
      "epoch": 0.14887982248944243,
      "grad_norm": 1.8106629848480225,
      "learning_rate": 9.984203713059241e-05,
      "loss": 3.9178,
      "step": 260
    },
    {
      "epoch": 0.15002505189320736,
      "grad_norm": 1.638542652130127,
      "learning_rate": 9.983442974464362e-05,
      "loss": 3.9169,
      "step": 262
    },
    {
      "epoch": 0.1511702812969723,
      "grad_norm": 1.3521384000778198,
      "learning_rate": 9.982664376151564e-05,
      "loss": 3.8682,
      "step": 264
    },
    {
      "epoch": 0.15231551070073723,
      "grad_norm": 1.6458699703216553,
      "learning_rate": 9.981867920911201e-05,
      "loss": 3.9566,
      "step": 266
    },
    {
      "epoch": 0.1534607401045022,
      "grad_norm": 1.7851066589355469,
      "learning_rate": 9.981053611597615e-05,
      "loss": 3.9085,
      "step": 268
    },
    {
      "epoch": 0.15460596950826713,
      "grad_norm": 1.6740517616271973,
      "learning_rate": 9.980221451129137e-05,
      "loss": 3.8899,
      "step": 270
    },
    {
      "epoch": 0.15575119891203207,
      "grad_norm": 1.117129921913147,
      "learning_rate": 9.979371442488073e-05,
      "loss": 3.7544,
      "step": 272
    },
    {
      "epoch": 0.156896428315797,
      "grad_norm": 1.5676058530807495,
      "learning_rate": 9.978503588720694e-05,
      "loss": 3.7753,
      "step": 274
    },
    {
      "epoch": 0.15804165771956194,
      "grad_norm": 1.6609163284301758,
      "learning_rate": 9.977617892937223e-05,
      "loss": 3.8463,
      "step": 276
    },
    {
      "epoch": 0.1591868871233269,
      "grad_norm": 1.7229987382888794,
      "learning_rate": 9.976714358311828e-05,
      "loss": 3.8446,
      "step": 278
    },
    {
      "epoch": 0.16033211652709184,
      "grad_norm": 1.6770962476730347,
      "learning_rate": 9.975792988082603e-05,
      "loss": 3.8684,
      "step": 280
    },
    {
      "epoch": 0.16147734593085677,
      "grad_norm": 1.215281367301941,
      "learning_rate": 9.974853785551568e-05,
      "loss": 3.7788,
      "step": 282
    },
    {
      "epoch": 0.1626225753346217,
      "grad_norm": 1.208257794380188,
      "learning_rate": 9.973896754084646e-05,
      "loss": 3.8338,
      "step": 284
    },
    {
      "epoch": 0.16376780473838665,
      "grad_norm": 1.4068255424499512,
      "learning_rate": 9.972921897111658e-05,
      "loss": 3.8583,
      "step": 286
    },
    {
      "epoch": 0.1649130341421516,
      "grad_norm": 1.4898021221160889,
      "learning_rate": 9.971929218126306e-05,
      "loss": 3.8051,
      "step": 288
    },
    {
      "epoch": 0.16605826354591655,
      "grad_norm": 1.6303211450576782,
      "learning_rate": 9.970918720686164e-05,
      "loss": 3.8598,
      "step": 290
    },
    {
      "epoch": 0.16720349294968148,
      "grad_norm": 1.6599496603012085,
      "learning_rate": 9.969890408412665e-05,
      "loss": 3.7214,
      "step": 292
    },
    {
      "epoch": 0.16834872235344642,
      "grad_norm": 1.1958950757980347,
      "learning_rate": 9.968844284991086e-05,
      "loss": 3.7042,
      "step": 294
    },
    {
      "epoch": 0.16949395175721135,
      "grad_norm": 1.3099420070648193,
      "learning_rate": 9.967780354170533e-05,
      "loss": 3.7405,
      "step": 296
    },
    {
      "epoch": 0.17063918116097632,
      "grad_norm": 1.5054072141647339,
      "learning_rate": 9.966698619763936e-05,
      "loss": 3.7827,
      "step": 298
    },
    {
      "epoch": 0.17178441056474125,
      "grad_norm": 1.444757103919983,
      "learning_rate": 9.965599085648025e-05,
      "loss": 3.7361,
      "step": 300
    }
  ],
  "logging_steps": 2,
  "max_steps": 3494,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.377550336196608e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}