{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 25, "global_step": 1092, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027472527472527472, "grad_norm": 0.30766117572784424, "learning_rate": 1.8181818181818182e-05, "loss": 1.3096, "step": 10 }, { "epoch": 0.054945054945054944, "grad_norm": 0.27449363470077515, "learning_rate": 3.6363636363636364e-05, "loss": 1.3065, "step": 20 }, { "epoch": 0.06868131868131869, "eval_loss": 1.2722554206848145, "eval_runtime": 120.0918, "eval_samples_per_second": 43.1, "eval_steps_per_second": 2.698, "step": 25 }, { "epoch": 0.08241758241758242, "grad_norm": 0.29537826776504517, "learning_rate": 5.4545454545454546e-05, "loss": 1.2378, "step": 30 }, { "epoch": 0.10989010989010989, "grad_norm": 0.25553378462791443, "learning_rate": 7.272727272727273e-05, "loss": 1.1986, "step": 40 }, { "epoch": 0.13736263736263737, "grad_norm": 0.27968353033065796, "learning_rate": 9.090909090909092e-05, "loss": 1.165, "step": 50 }, { "epoch": 0.13736263736263737, "eval_loss": 1.1781517267227173, "eval_runtime": 120.0866, "eval_samples_per_second": 43.102, "eval_steps_per_second": 2.698, "step": 50 }, { "epoch": 0.16483516483516483, "grad_norm": 0.3039194643497467, "learning_rate": 0.00010909090909090909, "loss": 1.1649, "step": 60 }, { "epoch": 0.19230769230769232, "grad_norm": 0.2939436435699463, "learning_rate": 0.00012727272727272728, "loss": 1.1199, "step": 70 }, { "epoch": 0.20604395604395603, "eval_loss": 1.1345776319503784, "eval_runtime": 120.0914, "eval_samples_per_second": 43.1, "eval_steps_per_second": 2.698, "step": 75 }, { "epoch": 0.21978021978021978, "grad_norm": 0.31279808282852173, "learning_rate": 0.00014545454545454546, "loss": 1.1184, "step": 80 }, { "epoch": 0.24725274725274726, "grad_norm": 0.3539753556251526, "learning_rate": 0.00016363636363636366, "loss": 1.0936, "step": 90 }, { "epoch": 0.27472527472527475, "grad_norm": 0.32932248711586, "learning_rate": 0.00018181818181818183, "loss": 1.0818, "step": 100 }, { "epoch": 0.27472527472527475, "eval_loss": 1.112221598625183, "eval_runtime": 120.0873, "eval_samples_per_second": 43.102, "eval_steps_per_second": 2.698, "step": 100 }, { "epoch": 0.3021978021978022, "grad_norm": 0.3215370178222656, "learning_rate": 0.0002, "loss": 1.0894, "step": 110 }, { "epoch": 0.32967032967032966, "grad_norm": 0.3059958517551422, "learning_rate": 0.00019994883066969053, "loss": 1.0814, "step": 120 }, { "epoch": 0.3434065934065934, "eval_loss": 1.0942074060440063, "eval_runtime": 120.096, "eval_samples_per_second": 43.099, "eval_steps_per_second": 2.698, "step": 125 }, { "epoch": 0.35714285714285715, "grad_norm": 0.3109903037548065, "learning_rate": 0.00019979537504476944, "loss": 1.0766, "step": 130 }, { "epoch": 0.38461538461538464, "grad_norm": 0.32679346203804016, "learning_rate": 0.00019953979016966788, "loss": 1.0663, "step": 140 }, { "epoch": 0.41208791208791207, "grad_norm": 0.32413366436958313, "learning_rate": 0.0001991823376065238, "loss": 1.0614, "step": 150 }, { "epoch": 0.41208791208791207, "eval_loss": 1.0805188417434692, "eval_runtime": 120.1022, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 150 }, { "epoch": 0.43956043956043955, "grad_norm": 0.27476972341537476, "learning_rate": 0.00019872338316750265, "loss": 1.0531, "step": 160 }, { "epoch": 0.46703296703296704, "grad_norm": 0.2732694447040558, "learning_rate": 0.00019816339654043022, "loss": 1.0552, "step": 170 }, { "epoch": 0.4807692307692308, "eval_loss": 1.0678906440734863, "eval_runtime": 120.1049, "eval_samples_per_second": 43.096, "eval_steps_per_second": 2.698, "step": 175 }, { "epoch": 0.4945054945054945, "grad_norm": 0.31261274218559265, "learning_rate": 0.00019750295080812023, "loss": 1.0707, "step": 180 }, { "epoch": 0.521978021978022, "grad_norm": 0.31733250617980957, "learning_rate": 0.0001967427218618893, "loss": 1.0363, "step": 190 }, { "epoch": 0.5494505494505495, "grad_norm": 0.31759801506996155, "learning_rate": 0.0001958834877098586, "loss": 1.0519, "step": 200 }, { "epoch": 0.5494505494505495, "eval_loss": 1.0573759078979492, "eval_runtime": 120.091, "eval_samples_per_second": 43.101, "eval_steps_per_second": 2.698, "step": 200 }, { "epoch": 0.5769230769230769, "grad_norm": 0.2911220192909241, "learning_rate": 0.00019492612768075092, "loss": 1.0559, "step": 210 }, { "epoch": 0.6043956043956044, "grad_norm": 0.2815748155117035, "learning_rate": 0.0001938716215239974, "loss": 1.0568, "step": 220 }, { "epoch": 0.6181318681318682, "eval_loss": 1.046871304512024, "eval_runtime": 120.0941, "eval_samples_per_second": 43.1, "eval_steps_per_second": 2.698, "step": 225 }, { "epoch": 0.6318681318681318, "grad_norm": 0.2947179079055786, "learning_rate": 0.00019272104840707487, "loss": 1.0414, "step": 230 }, { "epoch": 0.6593406593406593, "grad_norm": 0.2895919680595398, "learning_rate": 0.00019147558581110078, "loss": 1.0345, "step": 240 }, { "epoch": 0.6868131868131868, "grad_norm": 0.2915903329849243, "learning_rate": 0.00019013650832581423, "loss": 1.0361, "step": 250 }, { "epoch": 0.6868131868131868, "eval_loss": 1.0375587940216064, "eval_runtime": 120.0823, "eval_samples_per_second": 43.104, "eval_steps_per_second": 2.698, "step": 250 }, { "epoch": 0.7142857142857143, "grad_norm": 0.30468592047691345, "learning_rate": 0.0001887051863451784, "loss": 1.0385, "step": 260 }, { "epoch": 0.7417582417582418, "grad_norm": 0.3248215913772583, "learning_rate": 0.00018718308466493744, "loss": 1.0153, "step": 270 }, { "epoch": 0.7554945054945055, "eval_loss": 1.0293300151824951, "eval_runtime": 120.0939, "eval_samples_per_second": 43.1, "eval_steps_per_second": 2.698, "step": 275 }, { "epoch": 0.7692307692307693, "grad_norm": 0.29462930560112, "learning_rate": 0.00018557176098356405, "loss": 1.0185, "step": 280 }, { "epoch": 0.7967032967032966, "grad_norm": 0.2817062437534332, "learning_rate": 0.00018387286430813208, "loss": 1.0088, "step": 290 }, { "epoch": 0.8241758241758241, "grad_norm": 0.3126353621482849, "learning_rate": 0.00018208813326674444, "loss": 1.0128, "step": 300 }, { "epoch": 0.8241758241758241, "eval_loss": 1.020519733428955, "eval_runtime": 120.0922, "eval_samples_per_second": 43.1, "eval_steps_per_second": 2.698, "step": 300 }, { "epoch": 0.8516483516483516, "grad_norm": 0.31813979148864746, "learning_rate": 0.00018021939432924454, "loss": 1.0061, "step": 310 }, { "epoch": 0.8791208791208791, "grad_norm": 0.3094867467880249, "learning_rate": 0.00017826855993803147, "loss": 1.0064, "step": 320 }, { "epoch": 0.8928571428571429, "eval_loss": 1.0121577978134155, "eval_runtime": 120.1095, "eval_samples_per_second": 43.094, "eval_steps_per_second": 2.698, "step": 325 }, { "epoch": 0.9065934065934066, "grad_norm": 0.3104652464389801, "learning_rate": 0.00017623762655089207, "loss": 0.988, "step": 330 }, { "epoch": 0.9340659340659341, "grad_norm": 0.2933966815471649, "learning_rate": 0.00017412867259785286, "loss": 1.0015, "step": 340 }, { "epoch": 0.9615384615384616, "grad_norm": 0.31546550989151, "learning_rate": 0.00017194385635414244, "loss": 0.995, "step": 350 }, { "epoch": 0.9615384615384616, "eval_loss": 1.0048952102661133, "eval_runtime": 120.0899, "eval_samples_per_second": 43.101, "eval_steps_per_second": 2.698, "step": 350 }, { "epoch": 0.989010989010989, "grad_norm": 0.3239256739616394, "learning_rate": 0.00016968541373144156, "loss": 0.9934, "step": 360 }, { "epoch": 1.0164835164835164, "grad_norm": 0.3171522617340088, "learning_rate": 0.00016735565598968114, "loss": 0.9443, "step": 370 }, { "epoch": 1.0302197802197801, "eval_loss": 0.9992949962615967, "eval_runtime": 120.0895, "eval_samples_per_second": 43.101, "eval_steps_per_second": 2.698, "step": 375 }, { "epoch": 1.043956043956044, "grad_norm": 0.32471972703933716, "learning_rate": 0.0001649569673717298, "loss": 0.9499, "step": 380 }, { "epoch": 1.0714285714285714, "grad_norm": 0.3332098126411438, "learning_rate": 0.0001624918026633916, "loss": 0.9218, "step": 390 }, { "epoch": 1.098901098901099, "grad_norm": 0.30731338262557983, "learning_rate": 0.00015996268468121102, "loss": 0.9393, "step": 400 }, { "epoch": 1.098901098901099, "eval_loss": 0.9930510520935059, "eval_runtime": 120.092, "eval_samples_per_second": 43.1, "eval_steps_per_second": 2.698, "step": 400 }, { "epoch": 1.1263736263736264, "grad_norm": 0.3216601014137268, "learning_rate": 0.00015737220169065655, "loss": 0.9317, "step": 410 }, { "epoch": 1.1538461538461537, "grad_norm": 0.3047005534172058, "learning_rate": 0.00015472300475732426, "loss": 0.9223, "step": 420 }, { "epoch": 1.1675824175824177, "eval_loss": 0.9878760576248169, "eval_runtime": 120.095, "eval_samples_per_second": 43.099, "eval_steps_per_second": 2.698, "step": 425 }, { "epoch": 1.1813186813186813, "grad_norm": 0.3375983238220215, "learning_rate": 0.0001520178050338729, "loss": 0.9407, "step": 430 }, { "epoch": 1.2087912087912087, "grad_norm": 0.35231560468673706, "learning_rate": 0.00014925937098546652, "loss": 0.9416, "step": 440 }, { "epoch": 1.2362637362637363, "grad_norm": 0.32709598541259766, "learning_rate": 0.00014645052555656431, "loss": 0.9248, "step": 450 }, { "epoch": 1.2362637362637363, "eval_loss": 0.9815701246261597, "eval_runtime": 120.0756, "eval_samples_per_second": 43.106, "eval_steps_per_second": 2.698, "step": 450 }, { "epoch": 1.2637362637362637, "grad_norm": 0.34425875544548035, "learning_rate": 0.00014359414328195703, "loss": 0.9344, "step": 460 }, { "epoch": 1.2912087912087913, "grad_norm": 0.3243270814418793, "learning_rate": 0.00014069314734500675, "loss": 0.9133, "step": 470 }, { "epoch": 1.304945054945055, "eval_loss": 0.975848913192749, "eval_runtime": 120.0935, "eval_samples_per_second": 43.1, "eval_steps_per_second": 2.698, "step": 475 }, { "epoch": 1.3186813186813187, "grad_norm": 0.3632093667984009, "learning_rate": 0.00013775050658609988, "loss": 0.9171, "step": 480 }, { "epoch": 1.3461538461538463, "grad_norm": 0.3164936900138855, "learning_rate": 0.0001347692324643759, "loss": 0.9181, "step": 490 }, { "epoch": 1.3736263736263736, "grad_norm": 0.3294355869293213, "learning_rate": 0.00013175237597584045, "loss": 0.9271, "step": 500 }, { "epoch": 1.3736263736263736, "eval_loss": 0.9721202254295349, "eval_runtime": 120.1002, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 500 }, { "epoch": 1.401098901098901, "grad_norm": 0.36658838391304016, "learning_rate": 0.00012870302453101657, "loss": 0.9124, "step": 510 }, { "epoch": 1.4285714285714286, "grad_norm": 0.327544242143631, "learning_rate": 0.0001256242987953306, "loss": 0.912, "step": 520 }, { "epoch": 1.4423076923076923, "eval_loss": 0.9657136797904968, "eval_runtime": 120.1092, "eval_samples_per_second": 43.094, "eval_steps_per_second": 2.698, "step": 525 }, { "epoch": 1.456043956043956, "grad_norm": 0.35890278220176697, "learning_rate": 0.00012251934949546447, "loss": 0.9224, "step": 530 }, { "epoch": 1.4835164835164836, "grad_norm": 0.3336729109287262, "learning_rate": 0.00011939135419494456, "loss": 0.9141, "step": 540 }, { "epoch": 1.510989010989011, "grad_norm": 0.3688167631626129, "learning_rate": 0.00011624351404226572, "loss": 0.9074, "step": 550 }, { "epoch": 1.510989010989011, "eval_loss": 0.9616743326187134, "eval_runtime": 120.0995, "eval_samples_per_second": 43.098, "eval_steps_per_second": 2.698, "step": 550 }, { "epoch": 1.5384615384615383, "grad_norm": 0.3265407681465149, "learning_rate": 0.00011307905049487855, "loss": 0.9093, "step": 560 }, { "epoch": 1.565934065934066, "grad_norm": 0.34465736150741577, "learning_rate": 0.00010990120202239324, "loss": 0.9106, "step": 570 }, { "epoch": 1.5796703296703298, "eval_loss": 0.9570587277412415, "eval_runtime": 120.0802, "eval_samples_per_second": 43.105, "eval_steps_per_second": 2.698, "step": 575 }, { "epoch": 1.5934065934065935, "grad_norm": 0.34950023889541626, "learning_rate": 0.00010671322079237307, "loss": 0.9229, "step": 580 }, { "epoch": 1.620879120879121, "grad_norm": 0.34105509519577026, "learning_rate": 0.00010351836934210957, "loss": 0.913, "step": 590 }, { "epoch": 1.6483516483516483, "grad_norm": 0.34221306443214417, "learning_rate": 0.00010031991723978574, "loss": 0.9001, "step": 600 }, { "epoch": 1.6483516483516483, "eval_loss": 0.953257143497467, "eval_runtime": 120.1011, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 600 }, { "epoch": 1.6758241758241759, "grad_norm": 0.3518592119216919, "learning_rate": 9.712113773844361e-05, "loss": 0.9051, "step": 610 }, { "epoch": 1.7032967032967035, "grad_norm": 0.32816919684410095, "learning_rate": 9.3925304426181e-05, "loss": 0.9098, "step": 620 }, { "epoch": 1.7170329670329672, "eval_loss": 0.9497827887535095, "eval_runtime": 120.0995, "eval_samples_per_second": 43.098, "eval_steps_per_second": 2.698, "step": 625 }, { "epoch": 1.7307692307692308, "grad_norm": 0.3299916982650757, "learning_rate": 9.073568787600539e-05, "loss": 0.9174, "step": 630 }, { "epoch": 1.7582417582417582, "grad_norm": 0.3412216305732727, "learning_rate": 8.755555229877294e-05, "loss": 0.9035, "step": 640 }, { "epoch": 1.7857142857142856, "grad_norm": 0.3167659044265747, "learning_rate": 8.438815220263941e-05, "loss": 0.891, "step": 650 }, { "epoch": 1.7857142857142856, "eval_loss": 0.9457467198371887, "eval_runtime": 120.0762, "eval_samples_per_second": 43.106, "eval_steps_per_second": 2.698, "step": 650 }, { "epoch": 1.8131868131868132, "grad_norm": 0.338913232088089, "learning_rate": 8.123672906243955e-05, "loss": 0.9033, "step": 660 }, { "epoch": 1.8406593406593408, "grad_norm": 0.3298161029815674, "learning_rate": 7.810450800240549e-05, "loss": 0.8877, "step": 670 }, { "epoch": 1.8543956043956045, "eval_loss": 0.9423750638961792, "eval_runtime": 120.1002, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 675 }, { "epoch": 1.8681318681318682, "grad_norm": 0.334606409072876, "learning_rate": 7.499469449561769e-05, "loss": 0.8988, "step": 680 }, { "epoch": 1.8956043956043955, "grad_norm": 0.34457749128341675, "learning_rate": 7.191047108356672e-05, "loss": 0.9191, "step": 690 }, { "epoch": 1.9230769230769231, "grad_norm": 0.3466154634952545, "learning_rate": 6.885499411918304e-05, "loss": 0.9, "step": 700 }, { "epoch": 1.9230769230769231, "eval_loss": 0.93968266248703, "eval_runtime": 120.1019, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 700 }, { "epoch": 1.9505494505494505, "grad_norm": 0.3775443136692047, "learning_rate": 6.583139053666745e-05, "loss": 0.8944, "step": 710 }, { "epoch": 1.978021978021978, "grad_norm": 0.34448450803756714, "learning_rate": 6.284275465142874e-05, "loss": 0.8841, "step": 720 }, { "epoch": 1.9917582417582418, "eval_loss": 0.9374962449073792, "eval_runtime": 120.0877, "eval_samples_per_second": 43.102, "eval_steps_per_second": 2.698, "step": 725 }, { "epoch": 2.0054945054945055, "grad_norm": 0.3557426333427429, "learning_rate": 5.989214499340267e-05, "loss": 0.8657, "step": 730 }, { "epoch": 2.032967032967033, "grad_norm": 0.3379885256290436, "learning_rate": 5.6982581176993335e-05, "loss": 0.8331, "step": 740 }, { "epoch": 2.0604395604395602, "grad_norm": 0.34215247631073, "learning_rate": 5.4117040810840246e-05, "loss": 0.828, "step": 750 }, { "epoch": 2.0604395604395602, "eval_loss": 0.9431478381156921, "eval_runtime": 120.0958, "eval_samples_per_second": 43.099, "eval_steps_per_second": 2.698, "step": 750 }, { "epoch": 2.087912087912088, "grad_norm": 0.360781192779541, "learning_rate": 5.129845645057372e-05, "loss": 0.8357, "step": 760 }, { "epoch": 2.1153846153846154, "grad_norm": 0.3744048774242401, "learning_rate": 4.8529712597676426e-05, "loss": 0.8232, "step": 770 }, { "epoch": 2.129120879120879, "eval_loss": 0.940683901309967, "eval_runtime": 120.0962, "eval_samples_per_second": 43.099, "eval_steps_per_second": 2.698, "step": 775 }, { "epoch": 2.142857142857143, "grad_norm": 0.3741873502731323, "learning_rate": 4.581364274752338e-05, "loss": 0.8351, "step": 780 }, { "epoch": 2.17032967032967, "grad_norm": 0.37572070956230164, "learning_rate": 4.315302648962066e-05, "loss": 0.8171, "step": 790 }, { "epoch": 2.197802197802198, "grad_norm": 0.365060418844223, "learning_rate": 4.055058666301087e-05, "loss": 0.8299, "step": 800 }, { "epoch": 2.197802197802198, "eval_loss": 0.9400618672370911, "eval_runtime": 120.1003, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 800 }, { "epoch": 2.2252747252747254, "grad_norm": 0.3707464933395386, "learning_rate": 3.800898656975599e-05, "loss": 0.8303, "step": 810 }, { "epoch": 2.2527472527472527, "grad_norm": 0.3521062433719635, "learning_rate": 3.553082724934973e-05, "loss": 0.8378, "step": 820 }, { "epoch": 2.2664835164835164, "eval_loss": 0.9390351176261902, "eval_runtime": 120.0755, "eval_samples_per_second": 43.106, "eval_steps_per_second": 2.698, "step": 825 }, { "epoch": 2.28021978021978, "grad_norm": 0.3930257260799408, "learning_rate": 3.3118644816848574e-05, "loss": 0.8265, "step": 830 }, { "epoch": 2.3076923076923075, "grad_norm": 0.37315958738327026, "learning_rate": 3.077490786744562e-05, "loss": 0.837, "step": 840 }, { "epoch": 2.3351648351648353, "grad_norm": 0.3660902976989746, "learning_rate": 2.8502014950143373e-05, "loss": 0.8328, "step": 850 }, { "epoch": 2.3351648351648353, "eval_loss": 0.9368348121643066, "eval_runtime": 120.1125, "eval_samples_per_second": 43.093, "eval_steps_per_second": 2.697, "step": 850 }, { "epoch": 2.3626373626373627, "grad_norm": 0.3826388418674469, "learning_rate": 2.6302292113110637e-05, "loss": 0.8202, "step": 860 }, { "epoch": 2.39010989010989, "grad_norm": 0.39540669322013855, "learning_rate": 2.4177990523236216e-05, "loss": 0.8284, "step": 870 }, { "epoch": 2.4038461538461537, "eval_loss": 0.9366135597229004, "eval_runtime": 120.1013, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 875 }, { "epoch": 2.4175824175824174, "grad_norm": 0.39717695116996765, "learning_rate": 2.213128416231468e-05, "loss": 0.8287, "step": 880 }, { "epoch": 2.4450549450549453, "grad_norm": 0.3701748847961426, "learning_rate": 2.0164267602222586e-05, "loss": 0.8298, "step": 890 }, { "epoch": 2.4725274725274726, "grad_norm": 0.3669058680534363, "learning_rate": 1.827895386136166e-05, "loss": 0.8258, "step": 900 }, { "epoch": 2.4725274725274726, "eval_loss": 0.9355611801147461, "eval_runtime": 120.0907, "eval_samples_per_second": 43.101, "eval_steps_per_second": 2.698, "step": 900 }, { "epoch": 2.5, "grad_norm": 0.3532375395298004, "learning_rate": 1.647727234456279e-05, "loss": 0.8428, "step": 910 }, { "epoch": 2.5274725274725274, "grad_norm": 0.35455724596977234, "learning_rate": 1.4761066868558914e-05, "loss": 0.825, "step": 920 }, { "epoch": 2.541208791208791, "eval_loss": 0.9347633123397827, "eval_runtime": 120.0905, "eval_samples_per_second": 43.101, "eval_steps_per_second": 2.698, "step": 925 }, { "epoch": 2.5549450549450547, "grad_norm": 0.3676817715167999, "learning_rate": 1.3132093775047615e-05, "loss": 0.8207, "step": 930 }, { "epoch": 2.5824175824175826, "grad_norm": 0.41484078764915466, "learning_rate": 1.1592020133274639e-05, "loss": 0.8388, "step": 940 }, { "epoch": 2.60989010989011, "grad_norm": 0.3522408604621887, "learning_rate": 1.0142422033977505e-05, "loss": 0.8259, "step": 950 }, { "epoch": 2.60989010989011, "eval_loss": 0.9338229298591614, "eval_runtime": 120.1021, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 950 }, { "epoch": 2.6373626373626373, "grad_norm": 0.34244734048843384, "learning_rate": 8.784782976435424e-06, "loss": 0.821, "step": 960 }, { "epoch": 2.6648351648351647, "grad_norm": 0.37353935837745667, "learning_rate": 7.520492350275876e-06, "loss": 0.8225, "step": 970 }, { "epoch": 2.678571428571429, "eval_loss": 0.9337417483329773, "eval_runtime": 120.1024, "eval_samples_per_second": 43.097, "eval_steps_per_second": 2.698, "step": 975 }, { "epoch": 2.6923076923076925, "grad_norm": 0.3811197876930237, "learning_rate": 6.350844013592061e-06, "loss": 0.8286, "step": 980 }, { "epoch": 2.71978021978022, "grad_norm": 0.3932379186153412, "learning_rate": 5.277034968825667e-06, "loss": 0.8372, "step": 990 }, { "epoch": 2.7472527472527473, "grad_norm": 0.3701821267604828, "learning_rate": 4.3001641377707125e-06, "loss": 0.8226, "step": 1000 }, { "epoch": 2.7472527472527473, "eval_loss": 0.9329955577850342, "eval_runtime": 120.0945, "eval_samples_per_second": 43.099, "eval_steps_per_second": 2.698, "step": 1000 }, { "epoch": 2.7747252747252746, "grad_norm": 0.3506335914134979, "learning_rate": 3.4212312369516497e-06, "loss": 0.8128, "step": 1010 }, { "epoch": 2.802197802197802, "grad_norm": 0.3670129179954529, "learning_rate": 2.6411357545269577e-06, "loss": 0.8239, "step": 1020 }, { "epoch": 2.8159340659340657, "eval_loss": 0.9329585433006287, "eval_runtime": 120.0978, "eval_samples_per_second": 43.098, "eval_steps_per_second": 2.698, "step": 1025 }, { "epoch": 2.82967032967033, "grad_norm": 0.35776689648628235, "learning_rate": 1.960676029764874e-06, "loss": 0.8437, "step": 1030 }, { "epoch": 2.857142857142857, "grad_norm": 0.3504193127155304, "learning_rate": 1.3805484360337906e-06, "loss": 0.8104, "step": 1040 }, { "epoch": 2.8846153846153846, "grad_norm": 0.366804301738739, "learning_rate": 9.013466681429994e-07, "loss": 0.8169, "step": 1050 }, { "epoch": 2.8846153846153846, "eval_loss": 0.9328103065490723, "eval_runtime": 120.1161, "eval_samples_per_second": 43.092, "eval_steps_per_second": 2.697, "step": 1050 }, { "epoch": 2.912087912087912, "grad_norm": 0.3706912100315094, "learning_rate": 5.235611347634172e-07, "loss": 0.8123, "step": 1060 }, { "epoch": 2.9395604395604398, "grad_norm": 0.3953019380569458, "learning_rate": 2.4757845654992397e-07, "loss": 0.8175, "step": 1070 }, { "epoch": 2.9532967032967035, "eval_loss": 0.9328309893608093, "eval_runtime": 120.1081, "eval_samples_per_second": 43.095, "eval_steps_per_second": 2.698, "step": 1075 }, { "epoch": 2.967032967032967, "grad_norm": 0.40803447365760803, "learning_rate": 7.368107047894812e-08, "loss": 0.8333, "step": 1080 }, { "epoch": 2.9945054945054945, "grad_norm": 0.3288993835449219, "learning_rate": 2.046940806244013e-09, "loss": 0.8162, "step": 1090 }, { "epoch": 3.0, "step": 1092, "total_flos": 3.7446797801931407e+18, "train_loss": 0.9387403691644634, "train_runtime": 13836.993, "train_samples_per_second": 10.1, "train_steps_per_second": 0.079 } ], "logging_steps": 10, "max_steps": 1092, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.7446797801931407e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }