{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9566929133858268,
  "eval_steps": 16,
  "global_step": 126,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015748031496062992,
      "grad_norm": 0.034481361508369446,
      "learning_rate": 4e-05,
      "loss": 0.1412,
      "step": 1
    },
    {
      "epoch": 0.015748031496062992,
      "eval_loss": 0.1612786203622818,
      "eval_runtime": 64.5157,
      "eval_samples_per_second": 7.812,
      "eval_steps_per_second": 0.977,
      "step": 1
    },
    {
      "epoch": 0.031496062992125984,
      "grad_norm": 0.029317770153284073,
      "learning_rate": 8e-05,
      "loss": 0.1191,
      "step": 2
    },
    {
      "epoch": 0.047244094488188976,
      "grad_norm": 0.036621659994125366,
      "learning_rate": 0.00012,
      "loss": 0.1369,
      "step": 3
    },
    {
      "epoch": 0.06299212598425197,
      "grad_norm": 0.04425783455371857,
      "learning_rate": 0.00016,
      "loss": 0.1321,
      "step": 4
    },
    {
      "epoch": 0.07874015748031496,
      "grad_norm": 0.05247063934803009,
      "learning_rate": 0.0002,
      "loss": 0.1285,
      "step": 5
    },
    {
      "epoch": 0.09448818897637795,
      "grad_norm": 0.03902214765548706,
      "learning_rate": 0.00019996629653035126,
      "loss": 0.1004,
      "step": 6
    },
    {
      "epoch": 0.11023622047244094,
      "grad_norm": 0.03752463683485985,
      "learning_rate": 0.00019986520883988232,
      "loss": 0.0985,
      "step": 7
    },
    {
      "epoch": 0.12598425196850394,
      "grad_norm": 0.03061060793697834,
      "learning_rate": 0.00019969680506871137,
      "loss": 0.0912,
      "step": 8
    },
    {
      "epoch": 0.14173228346456693,
      "grad_norm": 0.034427180886268616,
      "learning_rate": 0.00019946119873266613,
      "loss": 0.0836,
      "step": 9
    },
    {
      "epoch": 0.15748031496062992,
      "grad_norm": 0.03106631338596344,
      "learning_rate": 0.00019915854864676664,
      "loss": 0.0734,
      "step": 10
    },
    {
      "epoch": 0.1732283464566929,
      "grad_norm": 0.02498232200741768,
      "learning_rate": 0.00019878905881817252,
      "loss": 0.0729,
      "step": 11
    },
    {
      "epoch": 0.1889763779527559,
      "grad_norm": 0.03798564895987511,
      "learning_rate": 0.00019835297830866826,
      "loss": 0.0694,
      "step": 12
    },
    {
      "epoch": 0.2047244094488189,
      "grad_norm": 0.046124912798404694,
      "learning_rate": 0.00019785060106677818,
      "loss": 0.0833,
      "step": 13
    },
    {
      "epoch": 0.2204724409448819,
      "grad_norm": 0.02981509082019329,
      "learning_rate": 0.00019728226572962473,
      "loss": 0.0713,
      "step": 14
    },
    {
      "epoch": 0.23622047244094488,
      "grad_norm": 0.02461801841855049,
      "learning_rate": 0.0001966483553946637,
      "loss": 0.0657,
      "step": 15
    },
    {
      "epoch": 0.25196850393700787,
      "grad_norm": 0.04344266653060913,
      "learning_rate": 0.00019594929736144976,
      "loss": 0.0635,
      "step": 16
    },
    {
      "epoch": 0.25196850393700787,
      "eval_loss": 0.06579381227493286,
      "eval_runtime": 64.6166,
      "eval_samples_per_second": 7.8,
      "eval_steps_per_second": 0.975,
      "step": 16
    },
    {
      "epoch": 0.2677165354330709,
      "grad_norm": 0.0320642925798893,
      "learning_rate": 0.00019518556284360696,
      "loss": 0.0656,
      "step": 17
    },
    {
      "epoch": 0.28346456692913385,
      "grad_norm": 0.028899891301989555,
      "learning_rate": 0.0001943576666511982,
      "loss": 0.0462,
      "step": 18
    },
    {
      "epoch": 0.2992125984251969,
      "grad_norm": 0.02383616380393505,
      "learning_rate": 0.0001934661668437073,
      "loss": 0.0649,
      "step": 19
    },
    {
      "epoch": 0.31496062992125984,
      "grad_norm": 0.03346535563468933,
      "learning_rate": 0.0001925116643538684,
      "loss": 0.0546,
      "step": 20
    },
    {
      "epoch": 0.33070866141732286,
      "grad_norm": 0.020454615354537964,
      "learning_rate": 0.00019149480258259533,
      "loss": 0.0538,
      "step": 21
    },
    {
      "epoch": 0.3464566929133858,
      "grad_norm": 0.02081696316599846,
      "learning_rate": 0.00019041626696528503,
      "loss": 0.0526,
      "step": 22
    },
    {
      "epoch": 0.36220472440944884,
      "grad_norm": 0.028128350153565407,
      "learning_rate": 0.0001892767845097864,
      "loss": 0.0593,
      "step": 23
    },
    {
      "epoch": 0.3779527559055118,
      "grad_norm": 0.015519126318395138,
      "learning_rate": 0.00018807712330634642,
      "loss": 0.0528,
      "step": 24
    },
    {
      "epoch": 0.3937007874015748,
      "grad_norm": 0.03593792766332626,
      "learning_rate": 0.0001868180920098644,
      "loss": 0.0481,
      "step": 25
    },
    {
      "epoch": 0.4094488188976378,
      "grad_norm": 0.015408644452691078,
      "learning_rate": 0.00018550053929480202,
      "loss": 0.0479,
      "step": 26
    },
    {
      "epoch": 0.4251968503937008,
      "grad_norm": 0.021226301789283752,
      "learning_rate": 0.00018412535328311814,
      "loss": 0.054,
      "step": 27
    },
    {
      "epoch": 0.4409448818897638,
      "grad_norm": 0.01717953570187092,
      "learning_rate": 0.0001826934609456129,
      "loss": 0.0523,
      "step": 28
    },
    {
      "epoch": 0.4566929133858268,
      "grad_norm": 0.019626960158348083,
      "learning_rate": 0.00018120582747708502,
      "loss": 0.0512,
      "step": 29
    },
    {
      "epoch": 0.47244094488188976,
      "grad_norm": 0.019186396151781082,
      "learning_rate": 0.0001796634556457236,
      "loss": 0.05,
      "step": 30
    },
    {
      "epoch": 0.4881889763779528,
      "grad_norm": 0.014989328570663929,
      "learning_rate": 0.0001780673851171728,
      "loss": 0.0441,
      "step": 31
    },
    {
      "epoch": 0.5039370078740157,
      "grad_norm": 0.012519012205302715,
      "learning_rate": 0.00017641869175372493,
      "loss": 0.0459,
      "step": 32
    },
    {
      "epoch": 0.5039370078740157,
      "eval_loss": 0.056131936609745026,
      "eval_runtime": 64.4985,
      "eval_samples_per_second": 7.814,
      "eval_steps_per_second": 0.977,
      "step": 32
    },
    {
      "epoch": 0.5196850393700787,
      "grad_norm": 0.01598576456308365,
      "learning_rate": 0.00017471848688911464,
      "loss": 0.0496,
      "step": 33
    },
    {
      "epoch": 0.5354330708661418,
      "grad_norm": 0.017361309379339218,
      "learning_rate": 0.000172967916579403,
      "loss": 0.0534,
      "step": 34
    },
    {
      "epoch": 0.5511811023622047,
      "grad_norm": 0.021230200305581093,
      "learning_rate": 0.00017116816083045602,
      "loss": 0.0505,
      "step": 35
    },
    {
      "epoch": 0.5669291338582677,
      "grad_norm": 0.01624094881117344,
      "learning_rate": 0.0001693204328025389,
      "loss": 0.0568,
      "step": 36
    },
    {
      "epoch": 0.5826771653543307,
      "grad_norm": 0.014916475862264633,
      "learning_rate": 0.00016742597799256182,
      "loss": 0.0542,
      "step": 37
    },
    {
      "epoch": 0.5984251968503937,
      "grad_norm": 0.013211382552981377,
      "learning_rate": 0.00016548607339452853,
      "loss": 0.0507,
      "step": 38
    },
    {
      "epoch": 0.6141732283464567,
      "grad_norm": 0.01305565144866705,
      "learning_rate": 0.00016350202663875386,
      "loss": 0.0387,
      "step": 39
    },
    {
      "epoch": 0.6299212598425197,
      "grad_norm": 0.011459614150226116,
      "learning_rate": 0.0001614751751104301,
      "loss": 0.0433,
      "step": 40
    },
    {
      "epoch": 0.6456692913385826,
      "grad_norm": 0.014712609350681305,
      "learning_rate": 0.00015940688504813662,
      "loss": 0.0571,
      "step": 41
    },
    {
      "epoch": 0.6614173228346457,
      "grad_norm": 0.015662657096982002,
      "learning_rate": 0.00015729855062290022,
      "loss": 0.0504,
      "step": 42
    },
    {
      "epoch": 0.6771653543307087,
      "grad_norm": 0.011235736310482025,
      "learning_rate": 0.00015515159299842707,
      "loss": 0.0453,
      "step": 43
    },
    {
      "epoch": 0.6929133858267716,
      "grad_norm": 0.011984420008957386,
      "learning_rate": 0.00015296745937313987,
      "loss": 0.0402,
      "step": 44
    },
    {
      "epoch": 0.7086614173228346,
      "grad_norm": 0.010523953475058079,
      "learning_rate": 0.00015074762200466556,
      "loss": 0.036,
      "step": 45
    },
    {
      "epoch": 0.7244094488188977,
      "grad_norm": 0.013540665619075298,
      "learning_rate": 0.00014849357721743168,
      "loss": 0.0346,
      "step": 46
    },
    {
      "epoch": 0.7401574803149606,
      "grad_norm": 0.012998640537261963,
      "learning_rate": 0.00014620684439403962,
      "loss": 0.0468,
      "step": 47
    },
    {
      "epoch": 0.7559055118110236,
      "grad_norm": 0.01443515345454216,
      "learning_rate": 0.0001438889649510956,
      "loss": 0.0453,
      "step": 48
    },
    {
      "epoch": 0.7559055118110236,
      "eval_loss": 0.05216333642601967,
      "eval_runtime": 64.5137,
      "eval_samples_per_second": 7.812,
      "eval_steps_per_second": 0.977,
      "step": 48
    },
    {
      "epoch": 0.7716535433070866,
      "grad_norm": 0.01463907677680254,
      "learning_rate": 0.00014154150130018866,
      "loss": 0.0526,
      "step": 49
    },
    {
      "epoch": 0.7874015748031497,
      "grad_norm": 0.01614455319941044,
      "learning_rate": 0.00013916603579471705,
      "loss": 0.0484,
      "step": 50
    },
    {
      "epoch": 0.8031496062992126,
      "grad_norm": 0.014042153023183346,
      "learning_rate": 0.000136764169663272,
      "loss": 0.0419,
      "step": 51
    },
    {
      "epoch": 0.8188976377952756,
      "grad_norm": 0.015309924259781837,
      "learning_rate": 0.00013433752193029886,
      "loss": 0.0425,
      "step": 52
    },
    {
      "epoch": 0.8346456692913385,
      "grad_norm": 0.018054217100143433,
      "learning_rate": 0.00013188772832476188,
      "loss": 0.0426,
      "step": 53
    },
    {
      "epoch": 0.8503937007874016,
      "grad_norm": 0.012343033216893673,
      "learning_rate": 0.00012941644017754964,
      "loss": 0.0448,
      "step": 54
    },
    {
      "epoch": 0.8661417322834646,
      "grad_norm": 0.012457596138119698,
      "learning_rate": 0.00012692532330836346,
      "loss": 0.0451,
      "step": 55
    },
    {
      "epoch": 0.8818897637795275,
      "grad_norm": 0.013512413017451763,
      "learning_rate": 0.00012441605690283915,
      "loss": 0.0413,
      "step": 56
    },
    {
      "epoch": 0.8976377952755905,
      "grad_norm": 0.013424846343696117,
      "learning_rate": 0.0001218903323806595,
      "loss": 0.0441,
      "step": 57
    },
    {
      "epoch": 0.9133858267716536,
      "grad_norm": 0.014157367870211601,
      "learning_rate": 0.00011934985225541998,
      "loss": 0.0443,
      "step": 58
    },
    {
      "epoch": 0.9291338582677166,
      "grad_norm": 0.0130110839381814,
      "learning_rate": 0.00011679632898701649,
      "loss": 0.0478,
      "step": 59
    },
    {
      "epoch": 0.9448818897637795,
      "grad_norm": 0.012677576392889023,
      "learning_rate": 0.00011423148382732853,
      "loss": 0.0399,
      "step": 60
    },
    {
      "epoch": 0.9606299212598425,
      "grad_norm": 0.01409006118774414,
      "learning_rate": 0.00011165704565997593,
      "loss": 0.0481,
      "step": 61
    },
    {
      "epoch": 0.9763779527559056,
      "grad_norm": 0.013535700738430023,
      "learning_rate": 0.00010907474983493144,
      "loss": 0.0406,
      "step": 62
    },
    {
      "epoch": 0.9921259842519685,
      "grad_norm": 0.014210895635187626,
      "learning_rate": 0.0001064863369987743,
      "loss": 0.0425,
      "step": 63
    },
    {
      "epoch": 1.0078740157480315,
      "grad_norm": 0.014430968090891838,
      "learning_rate": 0.00010389355192137377,
      "loss": 0.0483,
      "step": 64
    },
    {
      "epoch": 1.0078740157480315,
      "eval_loss": 0.049744635820388794,
      "eval_runtime": 64.598,
      "eval_samples_per_second": 7.802,
      "eval_steps_per_second": 0.975,
      "step": 64
    },
    {
      "epoch": 1.0236220472440944,
      "grad_norm": 0.0142066590487957,
      "learning_rate": 0.0001012981423197931,
      "loss": 0.0391,
      "step": 65
    },
    {
      "epoch": 1.0118110236220472,
      "grad_norm": 0.013278558850288391,
      "learning_rate": 9.870185768020693e-05,
      "loss": 0.045,
      "step": 66
    },
    {
      "epoch": 1.0275590551181102,
      "grad_norm": 0.01264102477580309,
      "learning_rate": 9.610644807862625e-05,
      "loss": 0.0396,
      "step": 67
    },
    {
      "epoch": 1.0433070866141732,
      "grad_norm": 0.014591066166758537,
      "learning_rate": 9.35136630012257e-05,
      "loss": 0.0443,
      "step": 68
    },
    {
      "epoch": 1.0590551181102361,
      "grad_norm": 0.013674317859113216,
      "learning_rate": 9.092525016506858e-05,
      "loss": 0.0493,
      "step": 69
    },
    {
      "epoch": 1.0748031496062993,
      "grad_norm": 0.0148893091827631,
      "learning_rate": 8.83429543400241e-05,
      "loss": 0.0412,
      "step": 70
    },
    {
      "epoch": 1.0905511811023623,
      "grad_norm": 0.01666112430393696,
      "learning_rate": 8.57685161726715e-05,
      "loss": 0.0476,
      "step": 71
    },
    {
      "epoch": 1.1062992125984252,
      "grad_norm": 0.013044373132288456,
      "learning_rate": 8.320367101298351e-05,
      "loss": 0.0391,
      "step": 72
    },
    {
      "epoch": 1.1220472440944882,
      "grad_norm": 0.014822134748101234,
      "learning_rate": 8.065014774458003e-05,
      "loss": 0.0406,
      "step": 73
    },
    {
      "epoch": 1.1377952755905512,
      "grad_norm": 0.013880250044167042,
      "learning_rate": 7.810966761934053e-05,
      "loss": 0.0405,
      "step": 74
    },
    {
      "epoch": 1.1535433070866141,
      "grad_norm": 0.014100627042353153,
      "learning_rate": 7.558394309716088e-05,
      "loss": 0.0422,
      "step": 75
    },
    {
      "epoch": 1.169291338582677,
      "grad_norm": 0.01578613370656967,
      "learning_rate": 7.307467669163655e-05,
      "loss": 0.0411,
      "step": 76
    },
    {
      "epoch": 1.1850393700787403,
      "grad_norm": 0.013604246079921722,
      "learning_rate": 7.058355982245037e-05,
      "loss": 0.0373,
      "step": 77
    },
    {
      "epoch": 1.2007874015748032,
      "grad_norm": 0.016308438032865524,
      "learning_rate": 6.811227167523815e-05,
      "loss": 0.0472,
      "step": 78
    },
    {
      "epoch": 1.2165354330708662,
      "grad_norm": 0.014247502200305462,
      "learning_rate": 6.566247806970119e-05,
      "loss": 0.0464,
      "step": 79
    },
    {
      "epoch": 1.2322834645669292,
      "grad_norm": 0.012891258113086224,
      "learning_rate": 6.323583033672799e-05,
      "loss": 0.0366,
      "step": 80
    },
    {
      "epoch": 1.2322834645669292,
      "eval_loss": 0.048343077301979065,
      "eval_runtime": 64.7735,
      "eval_samples_per_second": 7.781,
      "eval_steps_per_second": 0.973,
      "step": 80
    },
    {
      "epoch": 1.2480314960629921,
      "grad_norm": 0.015204845927655697,
      "learning_rate": 6.083396420528298e-05,
      "loss": 0.0438,
      "step": 81
    },
    {
      "epoch": 1.263779527559055,
      "grad_norm": 0.01763073354959488,
      "learning_rate": 5.845849869981137e-05,
      "loss": 0.0466,
      "step": 82
    },
    {
      "epoch": 1.279527559055118,
      "grad_norm": 0.013175925239920616,
      "learning_rate": 5.611103504890444e-05,
      "loss": 0.039,
      "step": 83
    },
    {
      "epoch": 1.295275590551181,
      "grad_norm": 0.016102107241749763,
      "learning_rate": 5.379315560596038e-05,
      "loss": 0.0462,
      "step": 84
    },
    {
      "epoch": 1.311023622047244,
      "grad_norm": 0.014480439946055412,
      "learning_rate": 5.1506422782568345e-05,
      "loss": 0.0402,
      "step": 85
    },
    {
      "epoch": 1.326771653543307,
      "grad_norm": 0.017164282500743866,
      "learning_rate": 4.9252377995334444e-05,
      "loss": 0.0418,
      "step": 86
    },
    {
      "epoch": 1.3425196850393701,
      "grad_norm": 0.013455789536237717,
      "learning_rate": 4.703254062686017e-05,
      "loss": 0.0402,
      "step": 87
    },
    {
      "epoch": 1.358267716535433,
      "grad_norm": 0.014540264382958412,
      "learning_rate": 4.484840700157295e-05,
      "loss": 0.038,
      "step": 88
    },
    {
      "epoch": 1.374015748031496,
      "grad_norm": 0.014430800452828407,
      "learning_rate": 4.270144937709981e-05,
      "loss": 0.0393,
      "step": 89
    },
    {
      "epoch": 1.389763779527559,
      "grad_norm": 0.013658607378602028,
      "learning_rate": 4.059311495186338e-05,
      "loss": 0.0354,
      "step": 90
    },
    {
      "epoch": 1.405511811023622,
      "grad_norm": 0.01640120893716812,
      "learning_rate": 3.852482488956992e-05,
      "loss": 0.0446,
      "step": 91
    },
    {
      "epoch": 1.421259842519685,
      "grad_norm": 0.013601432554423809,
      "learning_rate": 3.649797336124615e-05,
      "loss": 0.035,
      "step": 92
    },
    {
      "epoch": 1.4370078740157481,
      "grad_norm": 0.016174251213669777,
      "learning_rate": 3.45139266054715e-05,
      "loss": 0.0432,
      "step": 93
    },
    {
      "epoch": 1.452755905511811,
      "grad_norm": 0.01637461967766285,
      "learning_rate": 3.257402200743821e-05,
      "loss": 0.0445,
      "step": 94
    },
    {
      "epoch": 1.468503937007874,
      "grad_norm": 0.0154279675334692,
      "learning_rate": 3.0679567197461134e-05,
      "loss": 0.0433,
      "step": 95
    },
    {
      "epoch": 1.484251968503937,
      "grad_norm": 0.013604864478111267,
      "learning_rate": 2.8831839169543996e-05,
      "loss": 0.0408,
      "step": 96
    },
    {
      "epoch": 1.484251968503937,
      "eval_loss": 0.047206979244947433,
      "eval_runtime": 64.4089,
      "eval_samples_per_second": 7.825,
      "eval_steps_per_second": 0.978,
      "step": 96
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.014560838229954243,
      "learning_rate": 2.7032083420597e-05,
      "loss": 0.0385,
      "step": 97
    },
    {
      "epoch": 1.515748031496063,
      "grad_norm": 0.01328711025416851,
      "learning_rate": 2.528151311088537e-05,
      "loss": 0.0397,
      "step": 98
    },
    {
      "epoch": 1.531496062992126,
      "grad_norm": 0.016683636233210564,
      "learning_rate": 2.3581308246275103e-05,
      "loss": 0.0398,
      "step": 99
    },
    {
      "epoch": 1.547244094488189,
      "grad_norm": 0.012160832062363625,
      "learning_rate": 2.1932614882827197e-05,
      "loss": 0.0328,
      "step": 100
    },
    {
      "epoch": 1.5629921259842519,
      "grad_norm": 0.013753566890954971,
      "learning_rate": 2.03365443542764e-05,
      "loss": 0.0392,
      "step": 101
    },
    {
      "epoch": 1.5787401574803148,
      "grad_norm": 0.013317620381712914,
      "learning_rate": 1.879417252291502e-05,
      "loss": 0.0346,
      "step": 102
    },
    {
      "epoch": 1.594488188976378,
      "grad_norm": 0.018083734437823296,
      "learning_rate": 1.730653905438714e-05,
      "loss": 0.0482,
      "step": 103
    },
    {
      "epoch": 1.610236220472441,
      "grad_norm": 0.015288034453988075,
      "learning_rate": 1.587464671688187e-05,
      "loss": 0.0416,
      "step": 104
    },
    {
      "epoch": 1.625984251968504,
      "grad_norm": 0.01392639335244894,
      "learning_rate": 1.4499460705197998e-05,
      "loss": 0.0355,
      "step": 105
    },
    {
      "epoch": 1.641732283464567,
      "grad_norm": 0.014464518055319786,
      "learning_rate": 1.3181907990135622e-05,
      "loss": 0.0378,
      "step": 106
    },
    {
      "epoch": 1.65748031496063,
      "grad_norm": 0.014780817553400993,
      "learning_rate": 1.1922876693653585e-05,
      "loss": 0.0375,
      "step": 107
    },
    {
      "epoch": 1.673228346456693,
      "grad_norm": 0.014019722118973732,
      "learning_rate": 1.0723215490213634e-05,
      "loss": 0.0371,
      "step": 108
    },
    {
      "epoch": 1.688976377952756,
      "grad_norm": 0.013653130270540714,
      "learning_rate": 9.583733034714981e-06,
      "loss": 0.0341,
      "step": 109
    },
    {
      "epoch": 1.704724409448819,
      "grad_norm": 0.014543344266712666,
      "learning_rate": 8.505197417404687e-06,
      "loss": 0.0363,
      "step": 110
    },
    {
      "epoch": 1.720472440944882,
      "grad_norm": 0.01664627157151699,
      "learning_rate": 7.488335646131628e-06,
      "loss": 0.0397,
      "step": 111
    },
    {
      "epoch": 1.736220472440945,
      "grad_norm": 0.01318218931555748,
      "learning_rate": 6.533833156292679e-06,
      "loss": 0.0363,
      "step": 112
    },
    {
      "epoch": 1.736220472440945,
      "eval_loss": 0.04670024663209915,
      "eval_runtime": 64.4696,
      "eval_samples_per_second": 7.818,
      "eval_steps_per_second": 0.977,
      "step": 112
    },
    {
      "epoch": 1.7519685039370079,
      "grad_norm": 0.014432979747653008,
      "learning_rate": 5.6423333488018095e-06,
      "loss": 0.0384,
      "step": 113
    },
    {
      "epoch": 1.7677165354330708,
      "grad_norm": 0.013415982015430927,
      "learning_rate": 4.8144371563930476e-06,
      "loss": 0.0383,
      "step": 114
    },
    {
      "epoch": 1.7834645669291338,
      "grad_norm": 0.01275411993265152,
      "learning_rate": 4.050702638550275e-06,
      "loss": 0.0344,
      "step": 115
    },
    {
      "epoch": 1.7992125984251968,
      "grad_norm": 0.012640634551644325,
      "learning_rate": 3.3516446053363015e-06,
      "loss": 0.0325,
      "step": 116
    },
    {
      "epoch": 1.8149606299212597,
      "grad_norm": 0.014171491377055645,
      "learning_rate": 2.717734270375272e-06,
      "loss": 0.0375,
      "step": 117
    },
    {
      "epoch": 1.8307086614173227,
      "grad_norm": 0.014255956746637821,
      "learning_rate": 2.1493989332218468e-06,
      "loss": 0.0338,
      "step": 118
    },
    {
      "epoch": 1.8464566929133859,
      "grad_norm": 0.015443972311913967,
      "learning_rate": 1.6470216913317626e-06,
      "loss": 0.0395,
      "step": 119
    },
    {
      "epoch": 1.8622047244094488,
      "grad_norm": 0.015347709879279137,
      "learning_rate": 1.2109411818274852e-06,
      "loss": 0.0412,
      "step": 120
    },
    {
      "epoch": 1.8779527559055118,
      "grad_norm": 0.011879626661539078,
      "learning_rate": 8.41451353233369e-07,
      "loss": 0.0353,
      "step": 121
    },
    {
      "epoch": 1.8937007874015748,
      "grad_norm": 0.013861949555575848,
      "learning_rate": 5.388012673338661e-07,
      "loss": 0.0385,
      "step": 122
    },
    {
      "epoch": 1.909448818897638,
      "grad_norm": 0.013466684147715569,
      "learning_rate": 3.0319493128866396e-07,
      "loss": 0.0392,
      "step": 123
    },
    {
      "epoch": 1.925196850393701,
      "grad_norm": 0.014232831075787544,
      "learning_rate": 1.3479116011769767e-07,
      "loss": 0.0387,
      "step": 124
    },
    {
      "epoch": 1.9409448818897639,
      "grad_norm": 0.013951584696769714,
      "learning_rate": 3.370346964876036e-08,
      "loss": 0.039,
      "step": 125
    },
    {
      "epoch": 1.9566929133858268,
      "grad_norm": 0.014759634621441364,
      "learning_rate": 0.0,
      "loss": 0.04,
      "step": 126
    }
  ],
  "logging_steps": 1,
  "max_steps": 126,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 63,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.624893472111329e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}