{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 25, "global_step": 1092, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027472527472527472, "grad_norm": 0.6110167503356934, "learning_rate": 1.8181818181818182e-05, "loss": 1.3621, "step": 10 }, { "epoch": 0.054945054945054944, "grad_norm": 0.7437606453895569, "learning_rate": 3.6363636363636364e-05, "loss": 1.2798, "step": 20 }, { "epoch": 0.06868131868131869, "eval_loss": 1.2023646831512451, "eval_runtime": 153.095, "eval_samples_per_second": 33.809, "eval_steps_per_second": 2.116, "step": 25 }, { "epoch": 0.08241758241758242, "grad_norm": 0.3144157826900482, "learning_rate": 5.4545454545454546e-05, "loss": 1.164, "step": 30 }, { "epoch": 0.10989010989010989, "grad_norm": 0.29007333517074585, "learning_rate": 7.272727272727273e-05, "loss": 1.1436, "step": 40 }, { "epoch": 0.13736263736263737, "grad_norm": 0.26384684443473816, "learning_rate": 9.090909090909092e-05, "loss": 1.1131, "step": 50 }, { "epoch": 0.13736263736263737, "eval_loss": 1.1125513315200806, "eval_runtime": 153.085, "eval_samples_per_second": 33.811, "eval_steps_per_second": 2.116, "step": 50 }, { "epoch": 0.16483516483516483, "grad_norm": 0.3175796866416931, "learning_rate": 0.00010909090909090909, "loss": 1.1114, "step": 60 }, { "epoch": 0.19230769230769232, "grad_norm": 0.3293766379356384, "learning_rate": 0.00012727272727272728, "loss": 1.0756, "step": 70 }, { "epoch": 0.20604395604395603, "eval_loss": 1.0798790454864502, "eval_runtime": 153.089, "eval_samples_per_second": 33.81, "eval_steps_per_second": 2.116, "step": 75 }, { "epoch": 0.21978021978021978, "grad_norm": 0.35649096965789795, "learning_rate": 0.00014545454545454546, "loss": 1.072, "step": 80 }, { "epoch": 0.24725274725274726, "grad_norm": 0.3732157349586487, "learning_rate": 0.00016363636363636366, "loss": 1.0546, "step": 90 }, { "epoch": 0.27472527472527475, "grad_norm": 0.34071084856987, "learning_rate": 0.00018181818181818183, "loss": 1.0443, "step": 100 }, { "epoch": 0.27472527472527475, "eval_loss": 1.0595930814743042, "eval_runtime": 153.0762, "eval_samples_per_second": 33.813, "eval_steps_per_second": 2.117, "step": 100 }, { "epoch": 0.3021978021978022, "grad_norm": 0.34990689158439636, "learning_rate": 0.0002, "loss": 1.0549, "step": 110 }, { "epoch": 0.32967032967032966, "grad_norm": 0.31748640537261963, "learning_rate": 0.00019994883066969053, "loss": 1.0515, "step": 120 }, { "epoch": 0.3434065934065934, "eval_loss": 1.045371174812317, "eval_runtime": 153.0907, "eval_samples_per_second": 33.81, "eval_steps_per_second": 2.116, "step": 125 }, { "epoch": 0.35714285714285715, "grad_norm": 0.36638781428337097, "learning_rate": 0.00019979537504476944, "loss": 1.0425, "step": 130 }, { "epoch": 0.38461538461538464, "grad_norm": 0.33281397819519043, "learning_rate": 0.00019953979016966788, "loss": 1.028, "step": 140 }, { "epoch": 0.41208791208791207, "grad_norm": 0.3851754069328308, "learning_rate": 0.0001991823376065238, "loss": 1.0212, "step": 150 }, { "epoch": 0.41208791208791207, "eval_loss": 1.0323785543441772, "eval_runtime": 153.099, "eval_samples_per_second": 33.808, "eval_steps_per_second": 2.116, "step": 150 }, { "epoch": 0.43956043956043955, "grad_norm": 0.30995821952819824, "learning_rate": 0.00019872338316750265, "loss": 1.0146, "step": 160 }, { "epoch": 0.46703296703296704, "grad_norm": 0.3119589686393738, "learning_rate": 0.00019816339654043022, "loss": 1.0213, "step": 170 }, { "epoch": 0.4807692307692308, "eval_loss": 1.021622896194458, "eval_runtime": 153.0999, "eval_samples_per_second": 33.808, "eval_steps_per_second": 2.116, "step": 175 }, { "epoch": 0.4945054945054945, "grad_norm": 0.3268902599811554, "learning_rate": 0.00019750295080812023, "loss": 1.0352, "step": 180 }, { "epoch": 0.521978021978022, "grad_norm": 0.3450993001461029, "learning_rate": 0.0001967427218618893, "loss": 1.0003, "step": 190 }, { "epoch": 0.5494505494505495, "grad_norm": 0.33410361409187317, "learning_rate": 0.0001958834877098586, "loss": 1.0152, "step": 200 }, { "epoch": 0.5494505494505495, "eval_loss": 1.0108906030654907, "eval_runtime": 153.0989, "eval_samples_per_second": 33.808, "eval_steps_per_second": 2.116, "step": 200 }, { "epoch": 0.5769230769230769, "grad_norm": 0.3393489718437195, "learning_rate": 0.00019492612768075092, "loss": 1.0242, "step": 210 }, { "epoch": 0.6043956043956044, "grad_norm": 0.317294716835022, "learning_rate": 0.0001938716215239974, "loss": 1.0221, "step": 220 }, { "epoch": 0.6181318681318682, "eval_loss": 1.001078724861145, "eval_runtime": 153.0943, "eval_samples_per_second": 33.809, "eval_steps_per_second": 2.116, "step": 225 }, { "epoch": 0.6318681318681318, "grad_norm": 0.3547685742378235, "learning_rate": 0.00019272104840707487, "loss": 1.0029, "step": 230 }, { "epoch": 0.6593406593406593, "grad_norm": 0.33442381024360657, "learning_rate": 0.00019147558581110078, "loss": 0.9949, "step": 240 }, { "epoch": 0.6868131868131868, "grad_norm": 0.33157438039779663, "learning_rate": 0.00019013650832581423, "loss": 1.0017, "step": 250 }, { "epoch": 0.6868131868131868, "eval_loss": 0.9922270774841309, "eval_runtime": 153.1031, "eval_samples_per_second": 33.807, "eval_steps_per_second": 2.116, "step": 250 }, { "epoch": 0.7142857142857143, "grad_norm": 0.3358522951602936, "learning_rate": 0.0001887051863451784, "loss": 1.0029, "step": 260 }, { "epoch": 0.7417582417582418, "grad_norm": 0.35657626390457153, "learning_rate": 0.00018718308466493744, "loss": 0.974, "step": 270 }, { "epoch": 0.7554945054945055, "eval_loss": 0.9837812185287476, "eval_runtime": 153.0959, "eval_samples_per_second": 33.809, "eval_steps_per_second": 2.116, "step": 275 }, { "epoch": 0.7692307692307693, "grad_norm": 0.33967357873916626, "learning_rate": 0.00018557176098356405, "loss": 0.9847, "step": 280 }, { "epoch": 0.7967032967032966, "grad_norm": 0.33605992794036865, "learning_rate": 0.00018387286430813208, "loss": 0.9794, "step": 290 }, { "epoch": 0.8241758241758241, "grad_norm": 0.34878063201904297, "learning_rate": 0.00018208813326674444, "loss": 0.9806, "step": 300 }, { "epoch": 0.8241758241758241, "eval_loss": 0.9748955965042114, "eval_runtime": 153.0908, "eval_samples_per_second": 33.81, "eval_steps_per_second": 2.116, "step": 300 }, { "epoch": 0.8516483516483516, "grad_norm": 0.34370192885398865, "learning_rate": 0.00018021939432924454, "loss": 0.9691, "step": 310 }, { "epoch": 0.8791208791208791, "grad_norm": 0.32899904251098633, "learning_rate": 0.00017826855993803147, "loss": 0.9666, "step": 320 }, { "epoch": 0.8928571428571429, "eval_loss": 0.9662107229232788, "eval_runtime": 153.1012, "eval_samples_per_second": 33.808, "eval_steps_per_second": 2.116, "step": 325 }, { "epoch": 0.9065934065934066, "grad_norm": 0.36374330520629883, "learning_rate": 0.00017623762655089207, "loss": 0.9528, "step": 330 }, { "epoch": 0.9340659340659341, "grad_norm": 0.3277423679828644, "learning_rate": 0.00017412867259785286, "loss": 0.9645, "step": 340 }, { "epoch": 0.9615384615384616, "grad_norm": 0.36830317974090576, "learning_rate": 0.00017194385635414244, "loss": 0.9583, "step": 350 }, { "epoch": 0.9615384615384616, "eval_loss": 0.9582132697105408, "eval_runtime": 153.0942, "eval_samples_per_second": 33.809, "eval_steps_per_second": 2.116, "step": 350 }, { "epoch": 0.989010989010989, "grad_norm": 0.37403979897499084, "learning_rate": 0.00016968541373144156, "loss": 0.9524, "step": 360 }, { "epoch": 1.0164835164835164, "grad_norm": 0.38653823733329773, "learning_rate": 0.00016735565598968114, "loss": 0.8974, "step": 370 }, { "epoch": 1.0302197802197801, "eval_loss": 0.9573748707771301, "eval_runtime": 153.103, "eval_samples_per_second": 33.807, "eval_steps_per_second": 2.116, "step": 375 }, { "epoch": 1.043956043956044, "grad_norm": 0.37391167879104614, "learning_rate": 0.0001649569673717298, "loss": 0.8925, "step": 380 }, { "epoch": 1.0714285714285714, "grad_norm": 0.38612258434295654, "learning_rate": 0.0001624918026633916, "loss": 0.8534, "step": 390 }, { "epoch": 1.098901098901099, "grad_norm": 0.3503981828689575, "learning_rate": 0.00015996268468121102, "loss": 0.8835, "step": 400 }, { "epoch": 1.098901098901099, "eval_loss": 0.9542333483695984, "eval_runtime": 153.1108, "eval_samples_per_second": 33.806, "eval_steps_per_second": 2.116, "step": 400 }, { "epoch": 1.1263736263736264, "grad_norm": 0.3498053252696991, "learning_rate": 0.00015737220169065655, "loss": 0.882, "step": 410 }, { "epoch": 1.1538461538461537, "grad_norm": 0.3692995309829712, "learning_rate": 0.00015472300475732426, "loss": 0.8702, "step": 420 }, { "epoch": 1.1675824175824177, "eval_loss": 0.9501178860664368, "eval_runtime": 153.1287, "eval_samples_per_second": 33.802, "eval_steps_per_second": 2.116, "step": 425 }, { "epoch": 1.1813186813186813, "grad_norm": 0.3819359540939331, "learning_rate": 0.0001520178050338729, "loss": 0.8845, "step": 430 }, { "epoch": 1.2087912087912087, "grad_norm": 0.39302101731300354, "learning_rate": 0.00014925937098546652, "loss": 0.8954, "step": 440 }, { "epoch": 1.2362637362637363, "grad_norm": 0.3756279945373535, "learning_rate": 0.00014645052555656431, "loss": 0.8682, "step": 450 }, { "epoch": 1.2362637362637363, "eval_loss": 0.9446578025817871, "eval_runtime": 153.1248, "eval_samples_per_second": 33.803, "eval_steps_per_second": 2.116, "step": 450 }, { "epoch": 1.2637362637362637, "grad_norm": 0.3916327655315399, "learning_rate": 0.00014359414328195703, "loss": 0.8782, "step": 460 }, { "epoch": 1.2912087912087913, "grad_norm": 0.36019015312194824, "learning_rate": 0.00014069314734500675, "loss": 0.8604, "step": 470 }, { "epoch": 1.304945054945055, "eval_loss": 0.9429782629013062, "eval_runtime": 153.1539, "eval_samples_per_second": 33.796, "eval_steps_per_second": 2.116, "step": 475 }, { "epoch": 1.3186813186813187, "grad_norm": 0.3944477140903473, "learning_rate": 0.00013775050658609988, "loss": 0.8556, "step": 480 }, { "epoch": 1.3461538461538463, "grad_norm": 0.3486022651195526, "learning_rate": 0.0001347692324643759, "loss": 0.8704, "step": 490 }, { "epoch": 1.3736263736263736, "grad_norm": 0.3936191499233246, "learning_rate": 0.00013175237597584045, "loss": 0.8778, "step": 500 }, { "epoch": 1.3736263736263736, "eval_loss": 0.9382254481315613, "eval_runtime": 153.1105, "eval_samples_per_second": 33.806, "eval_steps_per_second": 2.116, "step": 500 }, { "epoch": 1.401098901098901, "grad_norm": 0.43809008598327637, "learning_rate": 0.00012870302453101657, "loss": 0.8634, "step": 510 }, { "epoch": 1.4285714285714286, "grad_norm": 0.3764894902706146, "learning_rate": 0.0001256242987953306, "loss": 0.8659, "step": 520 }, { "epoch": 1.4423076923076923, "eval_loss": 0.9333315491676331, "eval_runtime": 153.1074, "eval_samples_per_second": 33.806, "eval_steps_per_second": 2.116, "step": 525 }, { "epoch": 1.456043956043956, "grad_norm": 0.40227949619293213, "learning_rate": 0.00012251934949546447, "loss": 0.8686, "step": 530 }, { "epoch": 1.4835164835164836, "grad_norm": 0.3839854598045349, "learning_rate": 0.00011939135419494456, "loss": 0.8718, "step": 540 }, { "epoch": 1.510989010989011, "grad_norm": 0.40862157940864563, "learning_rate": 0.00011624351404226572, "loss": 0.862, "step": 550 }, { "epoch": 1.510989010989011, "eval_loss": 0.9310413599014282, "eval_runtime": 153.0992, "eval_samples_per_second": 33.808, "eval_steps_per_second": 2.116, "step": 550 }, { "epoch": 1.5384615384615383, "grad_norm": 0.4031030535697937, "learning_rate": 0.00011307905049487855, "loss": 0.8614, "step": 560 }, { "epoch": 1.565934065934066, "grad_norm": 0.40243202447891235, "learning_rate": 0.00010990120202239324, "loss": 0.8625, "step": 570 }, { "epoch": 1.5796703296703298, "eval_loss": 0.9275878071784973, "eval_runtime": 153.1078, "eval_samples_per_second": 33.806, "eval_steps_per_second": 2.116, "step": 575 }, { "epoch": 1.5934065934065935, "grad_norm": 0.3883321285247803, "learning_rate": 0.00010671322079237307, "loss": 0.8747, "step": 580 }, { "epoch": 1.620879120879121, "grad_norm": 0.37059128284454346, "learning_rate": 0.00010351836934210957, "loss": 0.8656, "step": 590 }, { "epoch": 1.6483516483516483, "grad_norm": 0.3715282380580902, "learning_rate": 0.00010031991723978574, "loss": 0.848, "step": 600 }, { "epoch": 1.6483516483516483, "eval_loss": 0.924781322479248, "eval_runtime": 153.108, "eval_samples_per_second": 33.806, "eval_steps_per_second": 2.116, "step": 600 }, { "epoch": 1.6758241758241759, "grad_norm": 0.4046836197376251, "learning_rate": 9.712113773844361e-05, "loss": 0.8641, "step": 610 }, { "epoch": 1.7032967032967035, "grad_norm": 0.38040101528167725, "learning_rate": 9.3925304426181e-05, "loss": 0.8662, "step": 620 }, { "epoch": 1.7170329670329672, "eval_loss": 0.9215436577796936, "eval_runtime": 153.1028, "eval_samples_per_second": 33.807, "eval_steps_per_second": 2.116, "step": 625 }, { "epoch": 1.7307692307692308, "grad_norm": 0.3663492202758789, "learning_rate": 9.073568787600539e-05, "loss": 0.8751, "step": 630 }, { "epoch": 1.7582417582417582, "grad_norm": 0.3878580629825592, "learning_rate": 8.755555229877294e-05, "loss": 0.859, "step": 640 }, { "epoch": 1.7857142857142856, "grad_norm": 0.3590506315231323, "learning_rate": 8.438815220263941e-05, "loss": 0.8438, "step": 650 }, { "epoch": 1.7857142857142856, "eval_loss": 0.919314444065094, "eval_runtime": 153.0972, "eval_samples_per_second": 33.809, "eval_steps_per_second": 2.116, "step": 650 }, { "epoch": 1.8131868131868132, "grad_norm": 0.377250999212265, "learning_rate": 8.123672906243955e-05, "loss": 0.8619, "step": 660 }, { "epoch": 1.8406593406593408, "grad_norm": 0.3786846399307251, "learning_rate": 7.810450800240549e-05, "loss": 0.8457, "step": 670 }, { "epoch": 1.8543956043956045, "eval_loss": 0.9163983464241028, "eval_runtime": 153.0925, "eval_samples_per_second": 33.81, "eval_steps_per_second": 2.116, "step": 675 }, { "epoch": 1.8681318681318682, "grad_norm": 0.3944201171398163, "learning_rate": 7.499469449561769e-05, "loss": 0.8548, "step": 680 }, { "epoch": 1.8956043956043955, "grad_norm": 0.4008020758628845, "learning_rate": 7.191047108356672e-05, "loss": 0.8776, "step": 690 }, { "epoch": 1.9230769230769231, "grad_norm": 0.4330127239227295, "learning_rate": 6.885499411918304e-05, "loss": 0.8629, "step": 700 }, { "epoch": 1.9230769230769231, "eval_loss": 0.9145733118057251, "eval_runtime": 153.1116, "eval_samples_per_second": 33.805, "eval_steps_per_second": 2.116, "step": 700 }, { "epoch": 1.9505494505494505, "grad_norm": 0.4149663746356964, "learning_rate": 6.583139053666745e-05, "loss": 0.8534, "step": 710 }, { "epoch": 1.978021978021978, "grad_norm": 0.4074181616306305, "learning_rate": 6.284275465142874e-05, "loss": 0.8401, "step": 720 }, { "epoch": 1.9917582417582418, "eval_loss": 0.912276566028595, "eval_runtime": 153.0994, "eval_samples_per_second": 33.808, "eval_steps_per_second": 2.116, "step": 725 }, { "epoch": 2.0054945054945055, "grad_norm": 0.4105566740036011, "learning_rate": 5.989214499340267e-05, "loss": 0.8125, "step": 730 }, { "epoch": 2.032967032967033, "grad_norm": 0.4072117507457733, "learning_rate": 5.6982581176993335e-05, "loss": 0.7576, "step": 740 }, { "epoch": 2.0604395604395602, "grad_norm": 0.4275268018245697, "learning_rate": 5.4117040810840246e-05, "loss": 0.7507, "step": 750 }, { "epoch": 2.0604395604395602, "eval_loss": 0.9363195300102234, "eval_runtime": 153.1235, "eval_samples_per_second": 33.803, "eval_steps_per_second": 2.116, "step": 750 }, { "epoch": 2.087912087912088, "grad_norm": 0.4345867335796356, "learning_rate": 5.129845645057372e-05, "loss": 0.7568, "step": 760 }, { "epoch": 2.1153846153846154, "grad_norm": 0.4414556324481964, "learning_rate": 4.8529712597676426e-05, "loss": 0.7442, "step": 770 }, { "epoch": 2.129120879120879, "eval_loss": 0.9283037781715393, "eval_runtime": 153.1101, "eval_samples_per_second": 33.806, "eval_steps_per_second": 2.116, "step": 775 }, { "epoch": 2.142857142857143, "grad_norm": 0.45460888743400574, "learning_rate": 4.581364274752338e-05, "loss": 0.7643, "step": 780 }, { "epoch": 2.17032967032967, "grad_norm": 0.4608997106552124, "learning_rate": 4.315302648962066e-05, "loss": 0.7394, "step": 790 }, { "epoch": 2.197802197802198, "grad_norm": 0.4369276165962219, "learning_rate": 4.055058666301087e-05, "loss": 0.7561, "step": 800 }, { "epoch": 2.197802197802198, "eval_loss": 0.9290263652801514, "eval_runtime": 153.1091, "eval_samples_per_second": 33.806, "eval_steps_per_second": 2.116, "step": 800 }, { "epoch": 2.2252747252747254, "grad_norm": 0.4475458264350891, "learning_rate": 3.800898656975599e-05, "loss": 0.7534, "step": 810 }, { "epoch": 2.2527472527472527, "grad_norm": 0.43159323930740356, "learning_rate": 3.553082724934973e-05, "loss": 0.759, "step": 820 }, { "epoch": 2.2664835164835164, "eval_loss": 0.9296738505363464, "eval_runtime": 153.117, "eval_samples_per_second": 33.804, "eval_steps_per_second": 2.116, "step": 825 }, { "epoch": 2.28021978021978, "grad_norm": 0.4822545647621155, "learning_rate": 3.3118644816848574e-05, "loss": 0.75, "step": 830 }, { "epoch": 2.3076923076923075, "grad_norm": 0.4617106318473816, "learning_rate": 3.077490786744562e-05, "loss": 0.7616, "step": 840 }, { "epoch": 2.3351648351648353, "grad_norm": 0.45311427116394043, "learning_rate": 2.8502014950143373e-05, "loss": 0.756, "step": 850 }, { "epoch": 2.3351648351648353, "eval_loss": 0.9292080998420715, "eval_runtime": 153.1045, "eval_samples_per_second": 33.807, "eval_steps_per_second": 2.116, "step": 850 }, { "epoch": 2.3626373626373627, "grad_norm": 0.4627811312675476, "learning_rate": 2.6302292113110637e-05, "loss": 0.748, "step": 860 }, { "epoch": 2.39010989010989, "grad_norm": 0.47928130626678467, "learning_rate": 2.4177990523236216e-05, "loss": 0.7568, "step": 870 }, { "epoch": 2.4038461538461537, "eval_loss": 0.9285762310028076, "eval_runtime": 153.1307, "eval_samples_per_second": 33.801, "eval_steps_per_second": 2.116, "step": 875 }, { "epoch": 2.4175824175824174, "grad_norm": 0.47023048996925354, "learning_rate": 2.213128416231468e-05, "loss": 0.7543, "step": 880 }, { "epoch": 2.4450549450549453, "grad_norm": 0.4541950821876526, "learning_rate": 2.0164267602222586e-05, "loss": 0.7547, "step": 890 }, { "epoch": 2.4725274725274726, "grad_norm": 0.4479961693286896, "learning_rate": 1.827895386136166e-05, "loss": 0.7555, "step": 900 }, { "epoch": 2.4725274725274726, "eval_loss": 0.9276696443557739, "eval_runtime": 153.1137, "eval_samples_per_second": 33.805, "eval_steps_per_second": 2.116, "step": 900 }, { "epoch": 2.5, "grad_norm": 0.4365418255329132, "learning_rate": 1.647727234456279e-05, "loss": 0.7639, "step": 910 }, { "epoch": 2.5274725274725274, "grad_norm": 0.4260055720806122, "learning_rate": 1.4761066868558914e-05, "loss": 0.7489, "step": 920 }, { "epoch": 2.541208791208791, "eval_loss": 0.9275524020195007, "eval_runtime": 153.1015, "eval_samples_per_second": 33.808, "eval_steps_per_second": 2.116, "step": 925 }, { "epoch": 2.5549450549450547, "grad_norm": 0.44165608286857605, "learning_rate": 1.3132093775047615e-05, "loss": 0.7487, "step": 930 }, { "epoch": 2.5824175824175826, "grad_norm": 0.49400794506073, "learning_rate": 1.1592020133274639e-05, "loss": 0.7635, "step": 940 }, { "epoch": 2.60989010989011, "grad_norm": 0.43534478545188904, "learning_rate": 1.0142422033977505e-05, "loss": 0.752, "step": 950 }, { "epoch": 2.60989010989011, "eval_loss": 0.9266554117202759, "eval_runtime": 153.0881, "eval_samples_per_second": 33.811, "eval_steps_per_second": 2.116, "step": 950 }, { "epoch": 2.6373626373626373, "grad_norm": 0.41752126812934875, "learning_rate": 8.784782976435424e-06, "loss": 0.745, "step": 960 }, { "epoch": 2.6648351648351647, "grad_norm": 0.4589076638221741, "learning_rate": 7.520492350275876e-06, "loss": 0.7443, "step": 970 }, { "epoch": 2.678571428571429, "eval_loss": 0.9264477491378784, "eval_runtime": 153.1051, "eval_samples_per_second": 33.807, "eval_steps_per_second": 2.116, "step": 975 }, { "epoch": 2.6923076923076925, "grad_norm": 0.46214237809181213, "learning_rate": 6.350844013592061e-06, "loss": 0.7527, "step": 980 }, { "epoch": 2.71978021978022, "grad_norm": 0.4873000383377075, "learning_rate": 5.277034968825667e-06, "loss": 0.7635, "step": 990 }, { "epoch": 2.7472527472527473, "grad_norm": 0.4586065113544464, "learning_rate": 4.3001641377707125e-06, "loss": 0.7477, "step": 1000 }, { "epoch": 2.7472527472527473, "eval_loss": 0.9256552457809448, "eval_runtime": 153.1027, "eval_samples_per_second": 33.807, "eval_steps_per_second": 2.116, "step": 1000 }, { "epoch": 2.7747252747252746, "grad_norm": 0.42659422755241394, "learning_rate": 3.4212312369516497e-06, "loss": 0.7328, "step": 1010 }, { "epoch": 2.802197802197802, "grad_norm": 0.45557713508605957, "learning_rate": 2.6411357545269577e-06, "loss": 0.7496, "step": 1020 }, { "epoch": 2.8159340659340657, "eval_loss": 0.9259692430496216, "eval_runtime": 153.0995, "eval_samples_per_second": 33.808, "eval_steps_per_second": 2.116, "step": 1025 }, { "epoch": 2.82967032967033, "grad_norm": 0.44193485379219055, "learning_rate": 1.960676029764874e-06, "loss": 0.7651, "step": 1030 }, { "epoch": 2.857142857142857, "grad_norm": 0.4308880865573883, "learning_rate": 1.3805484360337906e-06, "loss": 0.7364, "step": 1040 }, { "epoch": 2.8846153846153846, "grad_norm": 0.4569310247898102, "learning_rate": 9.013466681429994e-07, "loss": 0.7417, "step": 1050 }, { "epoch": 2.8846153846153846, "eval_loss": 0.9257997274398804, "eval_runtime": 153.0972, "eval_samples_per_second": 33.809, "eval_steps_per_second": 2.116, "step": 1050 }, { "epoch": 2.912087912087912, "grad_norm": 0.4580983519554138, "learning_rate": 5.235611347634172e-07, "loss": 0.737, "step": 1060 }, { "epoch": 2.9395604395604398, "grad_norm": 0.4826437830924988, "learning_rate": 2.4757845654992397e-07, "loss": 0.7438, "step": 1070 }, { "epoch": 2.9532967032967035, "eval_loss": 0.9256556034088135, "eval_runtime": 153.115, "eval_samples_per_second": 33.805, "eval_steps_per_second": 2.116, "step": 1075 }, { "epoch": 2.967032967032967, "grad_norm": 0.5047232508659363, "learning_rate": 7.368107047894812e-08, "loss": 0.7651, "step": 1080 }, { "epoch": 2.9945054945054945, "grad_norm": 0.41768553853034973, "learning_rate": 2.046940806244013e-09, "loss": 0.7437, "step": 1090 }, { "epoch": 3.0, "step": 1092, "total_flos": 3.5117817586998313e+18, "train_loss": 0.8853066019959502, "train_runtime": 17390.2478, "train_samples_per_second": 8.036, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 1092, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5117817586998313e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }