{ "best_metric": 0.05946353077888489, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.42317380352644834, "eval_steps": 100, "global_step": 462, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009159606136936112, "grad_norm": 5.519550323486328, "learning_rate": 2e-05, "loss": 4.8758, "step": 1 }, { "epoch": 0.0009159606136936112, "eval_loss": 0.6018432974815369, "eval_runtime": 431.5799, "eval_samples_per_second": 3.374, "eval_steps_per_second": 0.843, "step": 1 }, { "epoch": 0.0018319212273872224, "grad_norm": 5.216701507568359, "learning_rate": 4e-05, "loss": 5.0302, "step": 2 }, { "epoch": 0.0027478818410808336, "grad_norm": 15.041247367858887, "learning_rate": 6e-05, "loss": 5.6025, "step": 3 }, { "epoch": 0.0036638424547744447, "grad_norm": 4.073129177093506, "learning_rate": 8e-05, "loss": 4.5046, "step": 4 }, { "epoch": 0.004579803068468056, "grad_norm": 3.793393850326538, "learning_rate": 0.0001, "loss": 3.7209, "step": 5 }, { "epoch": 0.005495763682161667, "grad_norm": 3.9871528148651123, "learning_rate": 0.00012, "loss": 2.6306, "step": 6 }, { "epoch": 0.006411724295855278, "grad_norm": 1.6556649208068848, "learning_rate": 0.00014, "loss": 1.8335, "step": 7 }, { "epoch": 0.0073276849095488894, "grad_norm": 2.6266279220581055, "learning_rate": 0.00016, "loss": 1.5011, "step": 8 }, { "epoch": 0.0082436455232425, "grad_norm": 2.764836311340332, "learning_rate": 0.00018, "loss": 1.3103, "step": 9 }, { "epoch": 0.009159606136936112, "grad_norm": 3.4879281520843506, "learning_rate": 0.0002, "loss": 1.3028, "step": 10 }, { "epoch": 0.010075566750629723, "grad_norm": 2.448042869567871, "learning_rate": 0.00019999758458848847, "loss": 1.3211, "step": 11 }, { "epoch": 0.010991527364323335, "grad_norm": 1.9464507102966309, "learning_rate": 0.00019999033847063811, "loss": 1.2399, "step": 12 }, { "epoch": 0.011907487978016945, "grad_norm": 1.6776124238967896, "learning_rate": 0.00019997826199649605, "loss": 1.0416, "step": 13 }, { "epoch": 0.012823448591710557, "grad_norm": 1.573235273361206, "learning_rate": 0.00019996135574945544, "loss": 1.3556, "step": 14 }, { "epoch": 0.013739409205404167, "grad_norm": 1.5061469078063965, "learning_rate": 0.00019993962054622703, "loss": 1.2534, "step": 15 }, { "epoch": 0.014655369819097779, "grad_norm": 1.4490333795547485, "learning_rate": 0.00019991305743680013, "loss": 0.8765, "step": 16 }, { "epoch": 0.01557133043279139, "grad_norm": 1.4613972902297974, "learning_rate": 0.00019988166770439154, "loss": 1.0973, "step": 17 }, { "epoch": 0.016487291046485, "grad_norm": 1.417468547821045, "learning_rate": 0.0001998454528653836, "loss": 1.2705, "step": 18 }, { "epoch": 0.01740325166017861, "grad_norm": 1.23396897315979, "learning_rate": 0.00019980441466925118, "loss": 1.007, "step": 19 }, { "epoch": 0.018319212273872225, "grad_norm": 1.4013880491256714, "learning_rate": 0.00019975855509847686, "loss": 0.8394, "step": 20 }, { "epoch": 0.019235172887565835, "grad_norm": 1.7472220659255981, "learning_rate": 0.00019970787636845535, "loss": 1.1364, "step": 21 }, { "epoch": 0.020151133501259445, "grad_norm": 1.3513895273208618, "learning_rate": 0.00019965238092738643, "loss": 0.9134, "step": 22 }, { "epoch": 0.021067094114953056, "grad_norm": 1.5269527435302734, "learning_rate": 0.00019959207145615665, "loss": 0.8255, "step": 23 }, { "epoch": 0.02198305472864667, "grad_norm": 0.8268145322799683, "learning_rate": 0.00019952695086820975, "loss": 0.6196, "step": 24 }, { "epoch": 0.02289901534234028, "grad_norm": 1.1546183824539185, "learning_rate": 0.00019945702230940614, "loss": 0.9068, "step": 25 }, { "epoch": 0.02381497595603389, "grad_norm": 0.8031617999076843, "learning_rate": 0.0001993822891578708, "loss": 0.5674, "step": 26 }, { "epoch": 0.024730936569727503, "grad_norm": 0.937088668346405, "learning_rate": 0.0001993027550238299, "loss": 0.7044, "step": 27 }, { "epoch": 0.025646897183421113, "grad_norm": 1.08357834815979, "learning_rate": 0.0001992184237494368, "loss": 0.9674, "step": 28 }, { "epoch": 0.026562857797114724, "grad_norm": 1.0530093908309937, "learning_rate": 0.00019912929940858607, "loss": 0.7554, "step": 29 }, { "epoch": 0.027478818410808334, "grad_norm": 1.334518551826477, "learning_rate": 0.0001990353863067169, "loss": 0.746, "step": 30 }, { "epoch": 0.028394779024501948, "grad_norm": 1.6642454862594604, "learning_rate": 0.00019893668898060502, "loss": 1.189, "step": 31 }, { "epoch": 0.029310739638195558, "grad_norm": 1.232517957687378, "learning_rate": 0.0001988332121981436, "loss": 1.0104, "step": 32 }, { "epoch": 0.030226700251889168, "grad_norm": 0.9858556389808655, "learning_rate": 0.00019872496095811286, "loss": 0.6634, "step": 33 }, { "epoch": 0.03114266086558278, "grad_norm": 1.123953104019165, "learning_rate": 0.00019861194048993863, "loss": 0.8598, "step": 34 }, { "epoch": 0.03205862147927639, "grad_norm": 1.0000923871994019, "learning_rate": 0.0001984941562534397, "loss": 0.8872, "step": 35 }, { "epoch": 0.03297458209297, "grad_norm": 1.1624726057052612, "learning_rate": 0.0001983716139385641, "loss": 0.8707, "step": 36 }, { "epoch": 0.03389054270666361, "grad_norm": 1.0879074335098267, "learning_rate": 0.0001982443194651142, "loss": 0.8361, "step": 37 }, { "epoch": 0.03480650332035722, "grad_norm": 1.3374804258346558, "learning_rate": 0.0001981122789824607, "loss": 0.7765, "step": 38 }, { "epoch": 0.03572246393405083, "grad_norm": 0.8522550463676453, "learning_rate": 0.00019797549886924566, "loss": 0.8699, "step": 39 }, { "epoch": 0.03663842454774445, "grad_norm": 0.8720804452896118, "learning_rate": 0.00019783398573307428, "loss": 0.6595, "step": 40 }, { "epoch": 0.03755438516143806, "grad_norm": 1.0943487882614136, "learning_rate": 0.0001976877464101957, "loss": 0.8158, "step": 41 }, { "epoch": 0.03847034577513167, "grad_norm": 1.0142332315444946, "learning_rate": 0.00019753678796517282, "loss": 0.867, "step": 42 }, { "epoch": 0.03938630638882528, "grad_norm": 0.9329423904418945, "learning_rate": 0.00019738111769054093, "loss": 0.7283, "step": 43 }, { "epoch": 0.04030226700251889, "grad_norm": 0.7856294512748718, "learning_rate": 0.00019722074310645553, "loss": 0.6204, "step": 44 }, { "epoch": 0.0412182276162125, "grad_norm": 1.2970919609069824, "learning_rate": 0.00019705567196032892, "loss": 0.6546, "step": 45 }, { "epoch": 0.04213418822990611, "grad_norm": 1.1588696241378784, "learning_rate": 0.00019688591222645607, "loss": 0.8384, "step": 46 }, { "epoch": 0.04305014884359973, "grad_norm": 0.9822084903717041, "learning_rate": 0.00019671147210562927, "loss": 0.6732, "step": 47 }, { "epoch": 0.04396610945729334, "grad_norm": 0.8491467237472534, "learning_rate": 0.000196532360024742, "loss": 0.6257, "step": 48 }, { "epoch": 0.04488207007098695, "grad_norm": 1.0204273462295532, "learning_rate": 0.000196348584636382, "loss": 0.7814, "step": 49 }, { "epoch": 0.04579803068468056, "grad_norm": 1.1780798435211182, "learning_rate": 0.0001961601548184129, "loss": 0.6859, "step": 50 }, { "epoch": 0.04671399129837417, "grad_norm": 0.9798137545585632, "learning_rate": 0.00019596707967354585, "loss": 0.6813, "step": 51 }, { "epoch": 0.04762995191206778, "grad_norm": 1.10402512550354, "learning_rate": 0.00019576936852889936, "loss": 0.6647, "step": 52 }, { "epoch": 0.04854591252576139, "grad_norm": 1.1076951026916504, "learning_rate": 0.0001955670309355489, "loss": 0.7283, "step": 53 }, { "epoch": 0.049461873139455007, "grad_norm": 0.946972131729126, "learning_rate": 0.00019536007666806556, "loss": 0.6674, "step": 54 }, { "epoch": 0.05037783375314862, "grad_norm": 0.8705822825431824, "learning_rate": 0.00019514851572404368, "loss": 0.5044, "step": 55 }, { "epoch": 0.05129379436684223, "grad_norm": 0.8742169737815857, "learning_rate": 0.0001949323583236181, "loss": 0.7942, "step": 56 }, { "epoch": 0.05220975498053584, "grad_norm": 1.487743616104126, "learning_rate": 0.00019471161490897029, "loss": 0.5322, "step": 57 }, { "epoch": 0.05312571559422945, "grad_norm": 0.8018708825111389, "learning_rate": 0.0001944862961438239, "loss": 0.7712, "step": 58 }, { "epoch": 0.05404167620792306, "grad_norm": 0.9287647008895874, "learning_rate": 0.00019425641291292978, "loss": 0.6638, "step": 59 }, { "epoch": 0.05495763682161667, "grad_norm": 1.1208561658859253, "learning_rate": 0.00019402197632153992, "loss": 0.6888, "step": 60 }, { "epoch": 0.055873597435310285, "grad_norm": 0.8650437593460083, "learning_rate": 0.00019378299769487117, "loss": 0.602, "step": 61 }, { "epoch": 0.056789558049003895, "grad_norm": 0.9438541531562805, "learning_rate": 0.00019353948857755803, "loss": 0.8007, "step": 62 }, { "epoch": 0.057705518662697505, "grad_norm": 1.0785539150238037, "learning_rate": 0.00019329146073309504, "loss": 0.6443, "step": 63 }, { "epoch": 0.058621479276391115, "grad_norm": 0.938920259475708, "learning_rate": 0.00019303892614326836, "loss": 0.5166, "step": 64 }, { "epoch": 0.059537439890084726, "grad_norm": 0.7306029796600342, "learning_rate": 0.00019278189700757715, "loss": 0.6121, "step": 65 }, { "epoch": 0.060453400503778336, "grad_norm": 0.6238380670547485, "learning_rate": 0.00019252038574264405, "loss": 0.6597, "step": 66 }, { "epoch": 0.061369361117471946, "grad_norm": 0.9852589964866638, "learning_rate": 0.00019225440498161546, "loss": 0.7409, "step": 67 }, { "epoch": 0.06228532173116556, "grad_norm": 0.8592156171798706, "learning_rate": 0.00019198396757355118, "loss": 0.7738, "step": 68 }, { "epoch": 0.06320128234485917, "grad_norm": 1.1071726083755493, "learning_rate": 0.00019170908658280386, "loss": 0.95, "step": 69 }, { "epoch": 0.06411724295855278, "grad_norm": 0.8783465623855591, "learning_rate": 0.00019142977528838762, "loss": 0.7653, "step": 70 }, { "epoch": 0.06503320357224639, "grad_norm": 1.0121217966079712, "learning_rate": 0.0001911460471833368, "loss": 0.9466, "step": 71 }, { "epoch": 0.06594916418594, "grad_norm": 0.8773440718650818, "learning_rate": 0.00019085791597405404, "loss": 0.7238, "step": 72 }, { "epoch": 0.06686512479963362, "grad_norm": 0.7618806958198547, "learning_rate": 0.00019056539557964813, "loss": 0.4984, "step": 73 }, { "epoch": 0.06778108541332722, "grad_norm": 0.9363157153129578, "learning_rate": 0.00019026850013126157, "loss": 0.7261, "step": 74 }, { "epoch": 0.06869704602702084, "grad_norm": 1.3099620342254639, "learning_rate": 0.00018996724397138813, "loss": 0.7283, "step": 75 }, { "epoch": 0.06961300664071444, "grad_norm": 0.882304847240448, "learning_rate": 0.00018966164165317966, "loss": 0.5816, "step": 76 }, { "epoch": 0.07052896725440806, "grad_norm": 0.8315114378929138, "learning_rate": 0.00018935170793974335, "loss": 0.7533, "step": 77 }, { "epoch": 0.07144492786810167, "grad_norm": 0.8620697259902954, "learning_rate": 0.00018903745780342839, "loss": 0.6984, "step": 78 }, { "epoch": 0.07236088848179528, "grad_norm": 0.784542977809906, "learning_rate": 0.0001887189064251027, "loss": 0.5505, "step": 79 }, { "epoch": 0.0732768490954889, "grad_norm": 0.7392190098762512, "learning_rate": 0.0001883960691934196, "loss": 0.6052, "step": 80 }, { "epoch": 0.0741928097091825, "grad_norm": 0.810517430305481, "learning_rate": 0.00018806896170407437, "loss": 0.5008, "step": 81 }, { "epoch": 0.07510877032287612, "grad_norm": 0.6403000950813293, "learning_rate": 0.00018773759975905098, "loss": 0.3958, "step": 82 }, { "epoch": 0.07602473093656972, "grad_norm": 0.6503390073776245, "learning_rate": 0.00018740199936585853, "loss": 0.552, "step": 83 }, { "epoch": 0.07694069155026334, "grad_norm": 0.9502889513969421, "learning_rate": 0.00018706217673675811, "loss": 0.5942, "step": 84 }, { "epoch": 0.07785665216395694, "grad_norm": 1.326459527015686, "learning_rate": 0.0001867181482879795, "loss": 0.7382, "step": 85 }, { "epoch": 0.07877261277765056, "grad_norm": 0.7194717526435852, "learning_rate": 0.0001863699306389282, "loss": 0.584, "step": 86 }, { "epoch": 0.07968857339134418, "grad_norm": 0.8091100454330444, "learning_rate": 0.00018601754061138256, "loss": 0.5447, "step": 87 }, { "epoch": 0.08060453400503778, "grad_norm": 0.9021152853965759, "learning_rate": 0.00018566099522868119, "loss": 0.7408, "step": 88 }, { "epoch": 0.0815204946187314, "grad_norm": 1.156087040901184, "learning_rate": 0.00018530031171490053, "loss": 0.763, "step": 89 }, { "epoch": 0.082436455232425, "grad_norm": 1.2682101726531982, "learning_rate": 0.00018493550749402278, "loss": 0.7645, "step": 90 }, { "epoch": 0.08335241584611862, "grad_norm": 1.40992271900177, "learning_rate": 0.00018456660018909425, "loss": 0.9506, "step": 91 }, { "epoch": 0.08426837645981222, "grad_norm": 1.1955220699310303, "learning_rate": 0.00018419360762137395, "loss": 0.6177, "step": 92 }, { "epoch": 0.08518433707350584, "grad_norm": 1.3912885189056396, "learning_rate": 0.0001838165478094727, "loss": 0.8448, "step": 93 }, { "epoch": 0.08610029768719946, "grad_norm": 0.8000448346138, "learning_rate": 0.00018343543896848273, "loss": 0.5822, "step": 94 }, { "epoch": 0.08701625830089306, "grad_norm": 0.8547763228416443, "learning_rate": 0.00018305029950909768, "loss": 0.598, "step": 95 }, { "epoch": 0.08793221891458668, "grad_norm": 0.8347606062889099, "learning_rate": 0.00018266114803672318, "loss": 0.4001, "step": 96 }, { "epoch": 0.08884817952828028, "grad_norm": 0.5576004385948181, "learning_rate": 0.00018226800335057822, "loss": 0.3399, "step": 97 }, { "epoch": 0.0897641401419739, "grad_norm": 0.7549009323120117, "learning_rate": 0.00018187088444278674, "loss": 0.6359, "step": 98 }, { "epoch": 0.0906801007556675, "grad_norm": 0.7920951247215271, "learning_rate": 0.00018146981049746043, "loss": 0.6335, "step": 99 }, { "epoch": 0.09159606136936112, "grad_norm": 1.2786948680877686, "learning_rate": 0.00018106480088977172, "loss": 0.6916, "step": 100 }, { "epoch": 0.09159606136936112, "eval_loss": 0.0789576843380928, "eval_runtime": 436.0546, "eval_samples_per_second": 3.339, "eval_steps_per_second": 0.835, "step": 100 }, { "epoch": 0.09251202198305473, "grad_norm": 0.7070974111557007, "learning_rate": 0.00018065587518501804, "loss": 0.58, "step": 101 }, { "epoch": 0.09342798259674834, "grad_norm": 0.6558282971382141, "learning_rate": 0.00018024305313767646, "loss": 0.5625, "step": 102 }, { "epoch": 0.09434394321044196, "grad_norm": 0.8452802896499634, "learning_rate": 0.0001798263546904495, "loss": 0.9307, "step": 103 }, { "epoch": 0.09525990382413556, "grad_norm": 0.8087717294692993, "learning_rate": 0.00017940579997330165, "loss": 0.5895, "step": 104 }, { "epoch": 0.09617586443782918, "grad_norm": 1.0061838626861572, "learning_rate": 0.00017898140930248704, "loss": 0.5726, "step": 105 }, { "epoch": 0.09709182505152278, "grad_norm": 0.8311317563056946, "learning_rate": 0.00017855320317956784, "loss": 0.5909, "step": 106 }, { "epoch": 0.0980077856652164, "grad_norm": 0.8104123473167419, "learning_rate": 0.00017812120229042416, "loss": 0.4301, "step": 107 }, { "epoch": 0.09892374627891001, "grad_norm": 0.8177468776702881, "learning_rate": 0.00017768542750425426, "loss": 0.4303, "step": 108 }, { "epoch": 0.09983970689260362, "grad_norm": 2.4188196659088135, "learning_rate": 0.00017724589987256698, "loss": 0.5251, "step": 109 }, { "epoch": 0.10075566750629723, "grad_norm": 1.1067713499069214, "learning_rate": 0.0001768026406281642, "loss": 0.5276, "step": 110 }, { "epoch": 0.10167162811999084, "grad_norm": 0.7167340517044067, "learning_rate": 0.0001763556711841157, "loss": 0.5237, "step": 111 }, { "epoch": 0.10258758873368445, "grad_norm": 0.9675881862640381, "learning_rate": 0.00017590501313272415, "loss": 0.776, "step": 112 }, { "epoch": 0.10350354934737806, "grad_norm": 0.8013266324996948, "learning_rate": 0.00017545068824448255, "loss": 0.6585, "step": 113 }, { "epoch": 0.10441950996107167, "grad_norm": 0.9880797863006592, "learning_rate": 0.00017499271846702213, "loss": 0.7176, "step": 114 }, { "epoch": 0.10533547057476529, "grad_norm": 0.7393934726715088, "learning_rate": 0.00017453112592405242, "loss": 0.5301, "step": 115 }, { "epoch": 0.1062514311884589, "grad_norm": 0.773297905921936, "learning_rate": 0.00017406593291429217, "loss": 0.8689, "step": 116 }, { "epoch": 0.10716739180215251, "grad_norm": 0.8815104961395264, "learning_rate": 0.00017359716191039248, "loss": 0.6419, "step": 117 }, { "epoch": 0.10808335241584612, "grad_norm": 0.825872540473938, "learning_rate": 0.00017312483555785086, "loss": 0.5714, "step": 118 }, { "epoch": 0.10899931302953973, "grad_norm": 0.5747569799423218, "learning_rate": 0.00017264897667391754, "loss": 0.4421, "step": 119 }, { "epoch": 0.10991527364323334, "grad_norm": 1.2853175401687622, "learning_rate": 0.00017216960824649303, "loss": 0.6273, "step": 120 }, { "epoch": 0.11083123425692695, "grad_norm": 0.6967117190361023, "learning_rate": 0.00017168675343301769, "loss": 0.547, "step": 121 }, { "epoch": 0.11174719487062057, "grad_norm": 1.197424054145813, "learning_rate": 0.00017120043555935298, "loss": 0.7641, "step": 122 }, { "epoch": 0.11266315548431417, "grad_norm": 1.1903916597366333, "learning_rate": 0.00017071067811865476, "loss": 0.8993, "step": 123 }, { "epoch": 0.11357911609800779, "grad_norm": 1.3580857515335083, "learning_rate": 0.0001702175047702382, "loss": 0.6937, "step": 124 }, { "epoch": 0.1144950767117014, "grad_norm": 0.8347879648208618, "learning_rate": 0.000169720939338435, "loss": 0.5253, "step": 125 }, { "epoch": 0.11541103732539501, "grad_norm": 0.8995674252510071, "learning_rate": 0.00016922100581144228, "loss": 0.8104, "step": 126 }, { "epoch": 0.11632699793908861, "grad_norm": 0.7819614410400391, "learning_rate": 0.00016871772834016406, "loss": 0.6911, "step": 127 }, { "epoch": 0.11724295855278223, "grad_norm": 0.9112820029258728, "learning_rate": 0.00016821113123704424, "loss": 0.4639, "step": 128 }, { "epoch": 0.11815891916647585, "grad_norm": 0.8054628372192383, "learning_rate": 0.00016770123897489228, "loss": 0.7766, "step": 129 }, { "epoch": 0.11907487978016945, "grad_norm": 0.7405731081962585, "learning_rate": 0.00016718807618570106, "loss": 0.5029, "step": 130 }, { "epoch": 0.11999084039386307, "grad_norm": 0.5705666542053223, "learning_rate": 0.00016667166765945668, "loss": 0.3547, "step": 131 }, { "epoch": 0.12090680100755667, "grad_norm": 0.5570306777954102, "learning_rate": 0.00016615203834294119, "loss": 0.4366, "step": 132 }, { "epoch": 0.12182276162125029, "grad_norm": 0.7209002375602722, "learning_rate": 0.00016562921333852714, "loss": 0.6965, "step": 133 }, { "epoch": 0.12273872223494389, "grad_norm": 0.8611632585525513, "learning_rate": 0.00016510321790296525, "loss": 0.7107, "step": 134 }, { "epoch": 0.12365468284863751, "grad_norm": 1.1321237087249756, "learning_rate": 0.0001645740774461642, "loss": 0.6989, "step": 135 }, { "epoch": 0.12457064346233113, "grad_norm": 0.8014971017837524, "learning_rate": 0.00016404181752996289, "loss": 0.6939, "step": 136 }, { "epoch": 0.12548660407602474, "grad_norm": 0.8466368913650513, "learning_rate": 0.00016350646386689593, "loss": 0.501, "step": 137 }, { "epoch": 0.12640256468971833, "grad_norm": 0.9968240857124329, "learning_rate": 0.00016296804231895142, "loss": 0.5624, "step": 138 }, { "epoch": 0.12731852530341195, "grad_norm": 1.1174370050430298, "learning_rate": 0.00016242657889632133, "loss": 0.8239, "step": 139 }, { "epoch": 0.12823448591710557, "grad_norm": 1.014504075050354, "learning_rate": 0.00016188209975614542, "loss": 0.7467, "step": 140 }, { "epoch": 0.12915044653079918, "grad_norm": 0.8318601846694946, "learning_rate": 0.00016133463120124731, "loss": 0.6325, "step": 141 }, { "epoch": 0.13006640714449277, "grad_norm": 0.7232715487480164, "learning_rate": 0.00016078419967886402, "loss": 0.4998, "step": 142 }, { "epoch": 0.1309823677581864, "grad_norm": 0.49094462394714355, "learning_rate": 0.00016023083177936823, "loss": 0.4028, "step": 143 }, { "epoch": 0.13189832837188, "grad_norm": 0.7213066816329956, "learning_rate": 0.00015967455423498387, "loss": 0.548, "step": 144 }, { "epoch": 0.13281428898557363, "grad_norm": 1.2420037984848022, "learning_rate": 0.00015911539391849462, "loss": 0.5315, "step": 145 }, { "epoch": 0.13373024959926724, "grad_norm": 0.7429931163787842, "learning_rate": 0.00015855337784194577, "loss": 0.5053, "step": 146 }, { "epoch": 0.13464621021296083, "grad_norm": 0.6793218851089478, "learning_rate": 0.00015798853315533931, "loss": 0.4798, "step": 147 }, { "epoch": 0.13556217082665445, "grad_norm": 0.8552589416503906, "learning_rate": 0.00015742088714532247, "loss": 0.7091, "step": 148 }, { "epoch": 0.13647813144034807, "grad_norm": 1.0349805355072021, "learning_rate": 0.00015685046723386937, "loss": 0.799, "step": 149 }, { "epoch": 0.13739409205404168, "grad_norm": 0.7529215812683105, "learning_rate": 0.00015627730097695638, "loss": 0.6193, "step": 150 }, { "epoch": 0.1383100526677353, "grad_norm": 0.8655149340629578, "learning_rate": 0.00015570141606323105, "loss": 0.5076, "step": 151 }, { "epoch": 0.1392260132814289, "grad_norm": 0.724417507648468, "learning_rate": 0.00015512284031267437, "loss": 0.5132, "step": 152 }, { "epoch": 0.1401419738951225, "grad_norm": 0.724981427192688, "learning_rate": 0.00015454160167525685, "loss": 0.5337, "step": 153 }, { "epoch": 0.14105793450881612, "grad_norm": 1.0957812070846558, "learning_rate": 0.00015395772822958845, "loss": 0.6742, "step": 154 }, { "epoch": 0.14197389512250974, "grad_norm": 0.7567093372344971, "learning_rate": 0.00015337124818156205, "loss": 0.7095, "step": 155 }, { "epoch": 0.14288985573620333, "grad_norm": 0.891369640827179, "learning_rate": 0.00015278218986299074, "loss": 0.5988, "step": 156 }, { "epoch": 0.14380581634989695, "grad_norm": 0.7748335003852844, "learning_rate": 0.0001521905817302395, "loss": 0.6206, "step": 157 }, { "epoch": 0.14472177696359056, "grad_norm": 0.7905634641647339, "learning_rate": 0.0001515964523628501, "loss": 0.5326, "step": 158 }, { "epoch": 0.14563773757728418, "grad_norm": 0.9001010060310364, "learning_rate": 0.0001509998304621609, "loss": 0.6336, "step": 159 }, { "epoch": 0.1465536981909778, "grad_norm": 0.8940883278846741, "learning_rate": 0.00015040074484992, "loss": 0.6034, "step": 160 }, { "epoch": 0.1474696588046714, "grad_norm": 1.0931828022003174, "learning_rate": 0.00014979922446689306, "loss": 0.7144, "step": 161 }, { "epoch": 0.148385619418365, "grad_norm": 0.7018395662307739, "learning_rate": 0.00014919529837146528, "loss": 0.4174, "step": 162 }, { "epoch": 0.14930158003205862, "grad_norm": 0.6879329085350037, "learning_rate": 0.00014858899573823753, "loss": 0.3973, "step": 163 }, { "epoch": 0.15021754064575224, "grad_norm": 0.7063140869140625, "learning_rate": 0.00014798034585661695, "loss": 0.5576, "step": 164 }, { "epoch": 0.15113350125944586, "grad_norm": 0.5726875066757202, "learning_rate": 0.00014736937812940217, "loss": 0.464, "step": 165 }, { "epoch": 0.15204946187313945, "grad_norm": 0.6210931539535522, "learning_rate": 0.0001467561220713628, "loss": 0.431, "step": 166 }, { "epoch": 0.15296542248683306, "grad_norm": 0.855782687664032, "learning_rate": 0.00014614060730781377, "loss": 0.5312, "step": 167 }, { "epoch": 0.15388138310052668, "grad_norm": 0.6990883350372314, "learning_rate": 0.0001455228635731839, "loss": 0.5557, "step": 168 }, { "epoch": 0.1547973437142203, "grad_norm": 0.6450179219245911, "learning_rate": 0.0001449029207095798, "loss": 0.4738, "step": 169 }, { "epoch": 0.1557133043279139, "grad_norm": 0.8470801711082458, "learning_rate": 0.00014428080866534396, "loss": 0.8791, "step": 170 }, { "epoch": 0.1566292649416075, "grad_norm": 0.8217272758483887, "learning_rate": 0.00014365655749360833, "loss": 0.7099, "step": 171 }, { "epoch": 0.15754522555530112, "grad_norm": 0.8225908875465393, "learning_rate": 0.00014303019735084226, "loss": 0.9925, "step": 172 }, { "epoch": 0.15846118616899474, "grad_norm": 0.8007174134254456, "learning_rate": 0.00014240175849539565, "loss": 0.5416, "step": 173 }, { "epoch": 0.15937714678268836, "grad_norm": 0.8009291291236877, "learning_rate": 0.00014177127128603745, "loss": 0.8397, "step": 174 }, { "epoch": 0.16029310739638195, "grad_norm": 0.5703025460243225, "learning_rate": 0.00014113876618048897, "loss": 0.5596, "step": 175 }, { "epoch": 0.16120906801007556, "grad_norm": 0.824314296245575, "learning_rate": 0.0001405042737339524, "loss": 0.5353, "step": 176 }, { "epoch": 0.16212502862376918, "grad_norm": 0.7707583904266357, "learning_rate": 0.000139867824597635, "loss": 0.5362, "step": 177 }, { "epoch": 0.1630409892374628, "grad_norm": 0.7862932682037354, "learning_rate": 0.0001392294495172681, "loss": 0.5944, "step": 178 }, { "epoch": 0.1639569498511564, "grad_norm": 0.6190929412841797, "learning_rate": 0.0001385891793316221, "loss": 0.764, "step": 179 }, { "epoch": 0.16487291046485, "grad_norm": 0.457586407661438, "learning_rate": 0.00013794704497101655, "loss": 0.4105, "step": 180 }, { "epoch": 0.16578887107854362, "grad_norm": 0.4811520278453827, "learning_rate": 0.00013730307745582593, "loss": 0.358, "step": 181 }, { "epoch": 0.16670483169223724, "grad_norm": 0.790302574634552, "learning_rate": 0.0001366573078949813, "loss": 0.5677, "step": 182 }, { "epoch": 0.16762079230593085, "grad_norm": 0.8360859155654907, "learning_rate": 0.0001360097674844672, "loss": 0.4774, "step": 183 }, { "epoch": 0.16853675291962444, "grad_norm": 0.6449018716812134, "learning_rate": 0.00013536048750581494, "loss": 0.5234, "step": 184 }, { "epoch": 0.16945271353331806, "grad_norm": 0.7947589755058289, "learning_rate": 0.00013470949932459117, "loss": 0.4427, "step": 185 }, { "epoch": 0.17036867414701168, "grad_norm": 0.5645202994346619, "learning_rate": 0.00013405683438888282, "loss": 0.4278, "step": 186 }, { "epoch": 0.1712846347607053, "grad_norm": 0.6219568848609924, "learning_rate": 0.00013340252422777788, "loss": 0.3971, "step": 187 }, { "epoch": 0.1722005953743989, "grad_norm": 0.6408222317695618, "learning_rate": 0.00013274660044984224, "loss": 0.5036, "step": 188 }, { "epoch": 0.1731165559880925, "grad_norm": 0.7281380891799927, "learning_rate": 0.0001320890947415928, "loss": 0.5588, "step": 189 }, { "epoch": 0.17403251660178612, "grad_norm": 0.5523554682731628, "learning_rate": 0.00013143003886596669, "loss": 0.4022, "step": 190 }, { "epoch": 0.17494847721547974, "grad_norm": 0.923069179058075, "learning_rate": 0.0001307694646607869, "loss": 0.5009, "step": 191 }, { "epoch": 0.17586443782917335, "grad_norm": 0.5798431038856506, "learning_rate": 0.0001301074040372242, "loss": 0.4506, "step": 192 }, { "epoch": 0.17678039844286697, "grad_norm": 0.6347712874412537, "learning_rate": 0.0001294438889782556, "loss": 0.4864, "step": 193 }, { "epoch": 0.17769635905656056, "grad_norm": 0.7514728307723999, "learning_rate": 0.00012877895153711935, "loss": 0.5996, "step": 194 }, { "epoch": 0.17861231967025418, "grad_norm": 0.9295995235443115, "learning_rate": 0.00012811262383576646, "loss": 0.6651, "step": 195 }, { "epoch": 0.1795282802839478, "grad_norm": 0.8588167428970337, "learning_rate": 0.0001274449380633089, "loss": 0.6871, "step": 196 }, { "epoch": 0.1804442408976414, "grad_norm": 0.6991099119186401, "learning_rate": 0.00012677592647446472, "loss": 0.5572, "step": 197 }, { "epoch": 0.181360201511335, "grad_norm": 0.7341340780258179, "learning_rate": 0.00012610562138799978, "loss": 0.5595, "step": 198 }, { "epoch": 0.18227616212502862, "grad_norm": 0.8957074284553528, "learning_rate": 0.0001254340551851665, "loss": 0.573, "step": 199 }, { "epoch": 0.18319212273872224, "grad_norm": 0.9093302488327026, "learning_rate": 0.00012476126030813963, "loss": 0.7076, "step": 200 }, { "epoch": 0.18319212273872224, "eval_loss": 0.06856601685285568, "eval_runtime": 436.2246, "eval_samples_per_second": 3.338, "eval_steps_per_second": 0.834, "step": 200 }, { "epoch": 0.18410808335241585, "grad_norm": 0.9715799689292908, "learning_rate": 0.000124087269258449, "loss": 0.4213, "step": 201 }, { "epoch": 0.18502404396610947, "grad_norm": 0.9131537079811096, "learning_rate": 0.0001234121145954094, "loss": 0.7887, "step": 202 }, { "epoch": 0.18594000457980306, "grad_norm": 0.8268325924873352, "learning_rate": 0.00012273582893454775, "loss": 0.464, "step": 203 }, { "epoch": 0.18685596519349668, "grad_norm": 0.63968825340271, "learning_rate": 0.0001220584449460274, "loss": 0.5542, "step": 204 }, { "epoch": 0.1877719258071903, "grad_norm": 0.619226336479187, "learning_rate": 0.0001213799953530701, "loss": 0.5164, "step": 205 }, { "epoch": 0.1886878864208839, "grad_norm": 0.6572562456130981, "learning_rate": 0.00012070051293037492, "loss": 0.5105, "step": 206 }, { "epoch": 0.1896038470345775, "grad_norm": 0.8703852891921997, "learning_rate": 0.00012002003050253522, "loss": 0.724, "step": 207 }, { "epoch": 0.19051980764827112, "grad_norm": 0.6222299933433533, "learning_rate": 0.00011933858094245281, "loss": 0.4516, "step": 208 }, { "epoch": 0.19143576826196473, "grad_norm": 0.9056154489517212, "learning_rate": 0.00011865619716974984, "loss": 0.5116, "step": 209 }, { "epoch": 0.19235172887565835, "grad_norm": 0.5980194807052612, "learning_rate": 0.00011797291214917881, "loss": 0.4606, "step": 210 }, { "epoch": 0.19326768948935197, "grad_norm": 0.5795995593070984, "learning_rate": 0.00011728875888902975, "loss": 0.7419, "step": 211 }, { "epoch": 0.19418365010304556, "grad_norm": 0.8155220150947571, "learning_rate": 0.00011660377043953588, "loss": 0.5837, "step": 212 }, { "epoch": 0.19509961071673917, "grad_norm": 1.20220947265625, "learning_rate": 0.0001159179798912769, "loss": 0.5612, "step": 213 }, { "epoch": 0.1960155713304328, "grad_norm": 0.7634713649749756, "learning_rate": 0.0001152314203735805, "loss": 0.6243, "step": 214 }, { "epoch": 0.1969315319441264, "grad_norm": 0.5542187094688416, "learning_rate": 0.000114544125052922, "loss": 0.5042, "step": 215 }, { "epoch": 0.19784749255782003, "grad_norm": 0.8429003953933716, "learning_rate": 0.0001138561271313219, "loss": 0.6943, "step": 216 }, { "epoch": 0.19876345317151362, "grad_norm": 0.7061911225318909, "learning_rate": 0.00011316745984474226, "loss": 0.5509, "step": 217 }, { "epoch": 0.19967941378520723, "grad_norm": 1.0747299194335938, "learning_rate": 0.00011247815646148087, "loss": 0.4586, "step": 218 }, { "epoch": 0.20059537439890085, "grad_norm": 0.5371668338775635, "learning_rate": 0.0001117882502805643, "loss": 0.5494, "step": 219 }, { "epoch": 0.20151133501259447, "grad_norm": 1.095410704612732, "learning_rate": 0.00011109777463013915, "loss": 0.7209, "step": 220 }, { "epoch": 0.20242729562628806, "grad_norm": 1.1136436462402344, "learning_rate": 0.00011040676286586211, "loss": 0.5977, "step": 221 }, { "epoch": 0.20334325623998167, "grad_norm": 1.0239177942276, "learning_rate": 0.0001097152483692886, "loss": 0.7531, "step": 222 }, { "epoch": 0.2042592168536753, "grad_norm": 0.813605010509491, "learning_rate": 0.0001090232645462601, "loss": 0.5142, "step": 223 }, { "epoch": 0.2051751774673689, "grad_norm": 0.7018038034439087, "learning_rate": 0.00010833084482529048, "loss": 0.6111, "step": 224 }, { "epoch": 0.20609113808106252, "grad_norm": 0.9322212338447571, "learning_rate": 0.00010763802265595102, "loss": 0.594, "step": 225 }, { "epoch": 0.20700709869475611, "grad_norm": 0.8066678643226624, "learning_rate": 0.00010694483150725458, "loss": 0.5689, "step": 226 }, { "epoch": 0.20792305930844973, "grad_norm": 0.6629310846328735, "learning_rate": 0.00010625130486603878, "loss": 0.62, "step": 227 }, { "epoch": 0.20883901992214335, "grad_norm": 0.6048685908317566, "learning_rate": 0.00010555747623534831, "loss": 0.5044, "step": 228 }, { "epoch": 0.20975498053583697, "grad_norm": 0.8310180306434631, "learning_rate": 0.00010486337913281632, "loss": 0.4644, "step": 229 }, { "epoch": 0.21067094114953058, "grad_norm": 0.8517028093338013, "learning_rate": 0.00010416904708904548, "loss": 0.6316, "step": 230 }, { "epoch": 0.21158690176322417, "grad_norm": 0.6673111319541931, "learning_rate": 0.00010347451364598804, "loss": 0.4012, "step": 231 }, { "epoch": 0.2125028623769178, "grad_norm": 1.011790156364441, "learning_rate": 0.00010277981235532541, "loss": 0.3571, "step": 232 }, { "epoch": 0.2134188229906114, "grad_norm": 0.632638692855835, "learning_rate": 0.00010208497677684754, "loss": 0.3398, "step": 233 }, { "epoch": 0.21433478360430502, "grad_norm": 0.6011660695075989, "learning_rate": 0.00010139004047683151, "loss": 0.3741, "step": 234 }, { "epoch": 0.2152507442179986, "grad_norm": 1.4447038173675537, "learning_rate": 0.00010069503702642011, "loss": 0.5175, "step": 235 }, { "epoch": 0.21616670483169223, "grad_norm": 1.0104140043258667, "learning_rate": 0.0001, "loss": 0.4162, "step": 236 }, { "epoch": 0.21708266544538585, "grad_norm": 0.8013536334037781, "learning_rate": 9.930496297357993e-05, "loss": 0.5423, "step": 237 }, { "epoch": 0.21799862605907946, "grad_norm": 0.8041028380393982, "learning_rate": 9.860995952316851e-05, "loss": 0.6906, "step": 238 }, { "epoch": 0.21891458667277308, "grad_norm": 0.9673544764518738, "learning_rate": 9.791502322315249e-05, "loss": 0.7141, "step": 239 }, { "epoch": 0.21983054728646667, "grad_norm": 0.8152205348014832, "learning_rate": 9.722018764467461e-05, "loss": 0.6435, "step": 240 }, { "epoch": 0.2207465079001603, "grad_norm": 0.7280937433242798, "learning_rate": 9.652548635401201e-05, "loss": 0.5696, "step": 241 }, { "epoch": 0.2216624685138539, "grad_norm": 0.7318608164787292, "learning_rate": 9.583095291095453e-05, "loss": 0.5423, "step": 242 }, { "epoch": 0.22257842912754752, "grad_norm": 0.5921342372894287, "learning_rate": 9.513662086718372e-05, "loss": 0.6546, "step": 243 }, { "epoch": 0.22349438974124114, "grad_norm": 0.6680089831352234, "learning_rate": 9.444252376465171e-05, "loss": 0.5059, "step": 244 }, { "epoch": 0.22441035035493473, "grad_norm": 1.1326377391815186, "learning_rate": 9.374869513396123e-05, "loss": 0.5881, "step": 245 }, { "epoch": 0.22532631096862835, "grad_norm": 0.6495856642723083, "learning_rate": 9.305516849274541e-05, "loss": 0.4135, "step": 246 }, { "epoch": 0.22624227158232196, "grad_norm": 0.8290500044822693, "learning_rate": 9.236197734404901e-05, "loss": 0.6022, "step": 247 }, { "epoch": 0.22715823219601558, "grad_norm": 0.7909289002418518, "learning_rate": 9.166915517470953e-05, "loss": 0.4046, "step": 248 }, { "epoch": 0.22807419280970917, "grad_norm": 0.6260089874267578, "learning_rate": 9.09767354537399e-05, "loss": 0.4616, "step": 249 }, { "epoch": 0.2289901534234028, "grad_norm": 0.9241347908973694, "learning_rate": 9.028475163071141e-05, "loss": 0.5557, "step": 250 }, { "epoch": 0.2299061140370964, "grad_norm": 0.6013506054878235, "learning_rate": 8.959323713413791e-05, "loss": 0.4998, "step": 251 }, { "epoch": 0.23082207465079002, "grad_norm": 0.5378098487854004, "learning_rate": 8.890222536986085e-05, "loss": 0.5686, "step": 252 }, { "epoch": 0.23173803526448364, "grad_norm": 0.6205247640609741, "learning_rate": 8.821174971943572e-05, "loss": 0.4678, "step": 253 }, { "epoch": 0.23265399587817723, "grad_norm": 0.5681025981903076, "learning_rate": 8.752184353851916e-05, "loss": 0.4752, "step": 254 }, { "epoch": 0.23356995649187084, "grad_norm": 0.8576029539108276, "learning_rate": 8.683254015525776e-05, "loss": 0.6218, "step": 255 }, { "epoch": 0.23448591710556446, "grad_norm": 0.6471827626228333, "learning_rate": 8.614387286867814e-05, "loss": 0.5229, "step": 256 }, { "epoch": 0.23540187771925808, "grad_norm": 0.4855286478996277, "learning_rate": 8.545587494707803e-05, "loss": 0.297, "step": 257 }, { "epoch": 0.2363178383329517, "grad_norm": 7.151229381561279, "learning_rate": 8.47685796264195e-05, "loss": 0.5928, "step": 258 }, { "epoch": 0.23723379894664529, "grad_norm": 1.3571738004684448, "learning_rate": 8.408202010872312e-05, "loss": 0.63, "step": 259 }, { "epoch": 0.2381497595603389, "grad_norm": 0.8930146098136902, "learning_rate": 8.339622956046417e-05, "loss": 0.5622, "step": 260 }, { "epoch": 0.23906572017403252, "grad_norm": 0.5761184692382812, "learning_rate": 8.271124111097026e-05, "loss": 0.4166, "step": 261 }, { "epoch": 0.23998168078772614, "grad_norm": 0.89448082447052, "learning_rate": 8.202708785082121e-05, "loss": 0.5383, "step": 262 }, { "epoch": 0.24089764140141973, "grad_norm": 0.579725444316864, "learning_rate": 8.134380283025014e-05, "loss": 0.4337, "step": 263 }, { "epoch": 0.24181360201511334, "grad_norm": 0.692746102809906, "learning_rate": 8.066141905754723e-05, "loss": 0.7086, "step": 264 }, { "epoch": 0.24272956262880696, "grad_norm": 0.6122776865959167, "learning_rate": 7.997996949746477e-05, "loss": 0.7151, "step": 265 }, { "epoch": 0.24364552324250058, "grad_norm": 0.5838832259178162, "learning_rate": 7.929948706962508e-05, "loss": 0.4687, "step": 266 }, { "epoch": 0.2445614838561942, "grad_norm": 0.7323639988899231, "learning_rate": 7.862000464692991e-05, "loss": 0.6293, "step": 267 }, { "epoch": 0.24547744446988778, "grad_norm": 0.4069867730140686, "learning_rate": 7.794155505397261e-05, "loss": 0.3459, "step": 268 }, { "epoch": 0.2463934050835814, "grad_norm": 0.6492972373962402, "learning_rate": 7.72641710654523e-05, "loss": 0.5204, "step": 269 }, { "epoch": 0.24730936569727502, "grad_norm": 0.744772732257843, "learning_rate": 7.658788540459062e-05, "loss": 0.6464, "step": 270 }, { "epoch": 0.24822532631096864, "grad_norm": 0.6655353307723999, "learning_rate": 7.591273074155104e-05, "loss": 0.4257, "step": 271 }, { "epoch": 0.24914128692466225, "grad_norm": 0.7041018009185791, "learning_rate": 7.523873969186039e-05, "loss": 0.5422, "step": 272 }, { "epoch": 0.25005724753835584, "grad_norm": 0.7334919571876526, "learning_rate": 7.456594481483355e-05, "loss": 0.4416, "step": 273 }, { "epoch": 0.2509732081520495, "grad_norm": 0.6543317437171936, "learning_rate": 7.389437861200024e-05, "loss": 0.5293, "step": 274 }, { "epoch": 0.2518891687657431, "grad_norm": 0.8528045415878296, "learning_rate": 7.322407352553529e-05, "loss": 0.7414, "step": 275 }, { "epoch": 0.25280512937943667, "grad_norm": 0.7721304297447205, "learning_rate": 7.25550619366911e-05, "loss": 0.6171, "step": 276 }, { "epoch": 0.2537210899931303, "grad_norm": 0.6353263854980469, "learning_rate": 7.188737616423356e-05, "loss": 0.6457, "step": 277 }, { "epoch": 0.2546370506068239, "grad_norm": 0.8189141750335693, "learning_rate": 7.122104846288064e-05, "loss": 0.4837, "step": 278 }, { "epoch": 0.25555301122051755, "grad_norm": 0.6262407898902893, "learning_rate": 7.055611102174442e-05, "loss": 0.3863, "step": 279 }, { "epoch": 0.25646897183421113, "grad_norm": 0.8298683762550354, "learning_rate": 6.989259596277582e-05, "loss": 0.6259, "step": 280 }, { "epoch": 0.2573849324479047, "grad_norm": 0.8603139519691467, "learning_rate": 6.923053533921312e-05, "loss": 0.7719, "step": 281 }, { "epoch": 0.25830089306159837, "grad_norm": 0.42033544182777405, "learning_rate": 6.85699611340333e-05, "loss": 0.3598, "step": 282 }, { "epoch": 0.25921685367529196, "grad_norm": 0.9463576078414917, "learning_rate": 6.791090525840722e-05, "loss": 0.5428, "step": 283 }, { "epoch": 0.26013281428898555, "grad_norm": 0.42964935302734375, "learning_rate": 6.725339955015777e-05, "loss": 0.3862, "step": 284 }, { "epoch": 0.2610487749026792, "grad_norm": 0.8793493509292603, "learning_rate": 6.659747577222216e-05, "loss": 0.6907, "step": 285 }, { "epoch": 0.2619647355163728, "grad_norm": 0.7326351404190063, "learning_rate": 6.594316561111724e-05, "loss": 0.599, "step": 286 }, { "epoch": 0.2628806961300664, "grad_norm": 0.866133451461792, "learning_rate": 6.529050067540887e-05, "loss": 0.5384, "step": 287 }, { "epoch": 0.26379665674376, "grad_norm": 0.8126096725463867, "learning_rate": 6.46395124941851e-05, "loss": 0.7318, "step": 288 }, { "epoch": 0.2647126173574536, "grad_norm": 0.8120489120483398, "learning_rate": 6.39902325155328e-05, "loss": 0.5245, "step": 289 }, { "epoch": 0.26562857797114725, "grad_norm": 0.7803285121917725, "learning_rate": 6.334269210501875e-05, "loss": 0.5096, "step": 290 }, { "epoch": 0.26654453858484084, "grad_norm": 0.8887495994567871, "learning_rate": 6.269692254417408e-05, "loss": 0.5458, "step": 291 }, { "epoch": 0.2674604991985345, "grad_norm": 0.9203519821166992, "learning_rate": 6.205295502898348e-05, "loss": 0.7658, "step": 292 }, { "epoch": 0.2683764598122281, "grad_norm": 0.818914532661438, "learning_rate": 6.141082066837791e-05, "loss": 0.7152, "step": 293 }, { "epoch": 0.26929242042592166, "grad_norm": 0.8341096639633179, "learning_rate": 6.0770550482731924e-05, "loss": 0.785, "step": 294 }, { "epoch": 0.2702083810396153, "grad_norm": 0.7843111753463745, "learning_rate": 6.013217540236502e-05, "loss": 0.5326, "step": 295 }, { "epoch": 0.2711243416533089, "grad_norm": 0.8106828927993774, "learning_rate": 5.9495726266047605e-05, "loss": 0.5984, "step": 296 }, { "epoch": 0.27204030226700254, "grad_norm": 0.6807352900505066, "learning_rate": 5.886123381951103e-05, "loss": 0.5012, "step": 297 }, { "epoch": 0.27295626288069613, "grad_norm": 0.6257815957069397, "learning_rate": 5.8228728713962543e-05, "loss": 0.4357, "step": 298 }, { "epoch": 0.2738722234943897, "grad_norm": 0.7749564051628113, "learning_rate": 5.759824150460435e-05, "loss": 0.4646, "step": 299 }, { "epoch": 0.27478818410808337, "grad_norm": 0.6776332259178162, "learning_rate": 5.696980264915777e-05, "loss": 0.5005, "step": 300 }, { "epoch": 0.27478818410808337, "eval_loss": 0.06278909742832184, "eval_runtime": 436.0829, "eval_samples_per_second": 3.339, "eval_steps_per_second": 0.835, "step": 300 }, { "epoch": 0.27570414472177696, "grad_norm": 0.9247587323188782, "learning_rate": 5.63434425063917e-05, "loss": 0.4515, "step": 301 }, { "epoch": 0.2766201053354706, "grad_norm": 0.5948374271392822, "learning_rate": 5.571919133465605e-05, "loss": 0.4276, "step": 302 }, { "epoch": 0.2775360659491642, "grad_norm": 0.504909873008728, "learning_rate": 5.50970792904203e-05, "loss": 0.4076, "step": 303 }, { "epoch": 0.2784520265628578, "grad_norm": 0.5952023863792419, "learning_rate": 5.447713642681612e-05, "loss": 0.3557, "step": 304 }, { "epoch": 0.2793679871765514, "grad_norm": 0.5355116724967957, "learning_rate": 5.385939269218625e-05, "loss": 0.3973, "step": 305 }, { "epoch": 0.280283947790245, "grad_norm": 0.6517271995544434, "learning_rate": 5.324387792863719e-05, "loss": 0.5735, "step": 306 }, { "epoch": 0.2811999084039386, "grad_norm": 0.5679106712341309, "learning_rate": 5.263062187059785e-05, "loss": 0.2186, "step": 307 }, { "epoch": 0.28211586901763225, "grad_norm": 0.6380028128623962, "learning_rate": 5.201965414338308e-05, "loss": 0.5342, "step": 308 }, { "epoch": 0.28303182963132584, "grad_norm": 0.5233590006828308, "learning_rate": 5.14110042617625e-05, "loss": 0.3793, "step": 309 }, { "epoch": 0.2839477902450195, "grad_norm": 0.6338326334953308, "learning_rate": 5.080470162853472e-05, "loss": 0.4085, "step": 310 }, { "epoch": 0.28486375085871307, "grad_norm": 1.0255392789840698, "learning_rate": 5.020077553310694e-05, "loss": 0.534, "step": 311 }, { "epoch": 0.28577971147240666, "grad_norm": 0.6331342458724976, "learning_rate": 4.959925515008002e-05, "loss": 0.5402, "step": 312 }, { "epoch": 0.2866956720861003, "grad_norm": 0.5389162302017212, "learning_rate": 4.900016953783912e-05, "loss": 0.4272, "step": 313 }, { "epoch": 0.2876116326997939, "grad_norm": 0.8973006010055542, "learning_rate": 4.840354763714991e-05, "loss": 0.6449, "step": 314 }, { "epoch": 0.28852759331348754, "grad_norm": 0.4471147060394287, "learning_rate": 4.7809418269760545e-05, "loss": 0.3032, "step": 315 }, { "epoch": 0.28944355392718113, "grad_norm": 0.8016964197158813, "learning_rate": 4.7217810137009274e-05, "loss": 0.449, "step": 316 }, { "epoch": 0.2903595145408747, "grad_norm": 0.6460964679718018, "learning_rate": 4.6628751818437985e-05, "loss": 0.4222, "step": 317 }, { "epoch": 0.29127547515456836, "grad_norm": 0.8219476342201233, "learning_rate": 4.604227177041156e-05, "loss": 0.4928, "step": 318 }, { "epoch": 0.29219143576826195, "grad_norm": 0.8703890442848206, "learning_rate": 4.545839832474318e-05, "loss": 0.7624, "step": 319 }, { "epoch": 0.2931073963819556, "grad_norm": 0.6923498511314392, "learning_rate": 4.487715968732568e-05, "loss": 0.6358, "step": 320 }, { "epoch": 0.2940233569956492, "grad_norm": 1.0428471565246582, "learning_rate": 4.4298583936768976e-05, "loss": 0.474, "step": 321 }, { "epoch": 0.2949393176093428, "grad_norm": 0.5889642238616943, "learning_rate": 4.372269902304363e-05, "loss": 0.5298, "step": 322 }, { "epoch": 0.2958552782230364, "grad_norm": 0.7348572611808777, "learning_rate": 4.314953276613066e-05, "loss": 0.4542, "step": 323 }, { "epoch": 0.29677123883673, "grad_norm": 0.6076330542564392, "learning_rate": 4.257911285467754e-05, "loss": 0.4647, "step": 324 }, { "epoch": 0.29768719945042366, "grad_norm": 0.6895171999931335, "learning_rate": 4.2011466844660655e-05, "loss": 0.5157, "step": 325 }, { "epoch": 0.29860316006411725, "grad_norm": 0.5947766304016113, "learning_rate": 4.144662215805426e-05, "loss": 0.4753, "step": 326 }, { "epoch": 0.29951912067781084, "grad_norm": 0.9488824009895325, "learning_rate": 4.0884606081505374e-05, "loss": 0.7115, "step": 327 }, { "epoch": 0.3004350812915045, "grad_norm": 0.6592872142791748, "learning_rate": 4.0325445765016145e-05, "loss": 0.565, "step": 328 }, { "epoch": 0.30135104190519807, "grad_norm": 0.7744607925415039, "learning_rate": 3.9769168220631745e-05, "loss": 0.5028, "step": 329 }, { "epoch": 0.3022670025188917, "grad_norm": 0.5825055837631226, "learning_rate": 3.921580032113602e-05, "loss": 0.3888, "step": 330 }, { "epoch": 0.3031829631325853, "grad_norm": 0.9765796065330505, "learning_rate": 3.866536879875269e-05, "loss": 0.82, "step": 331 }, { "epoch": 0.3040989237462789, "grad_norm": 0.6944766640663147, "learning_rate": 3.8117900243854595e-05, "loss": 0.544, "step": 332 }, { "epoch": 0.30501488435997254, "grad_norm": 0.6479721665382385, "learning_rate": 3.757342110367871e-05, "loss": 0.306, "step": 333 }, { "epoch": 0.3059308449736661, "grad_norm": 0.7690525054931641, "learning_rate": 3.7031957681048604e-05, "loss": 0.6559, "step": 334 }, { "epoch": 0.3068468055873597, "grad_norm": 0.6109075546264648, "learning_rate": 3.649353613310409e-05, "loss": 0.5924, "step": 335 }, { "epoch": 0.30776276620105336, "grad_norm": 0.9807355403900146, "learning_rate": 3.595818247003713e-05, "loss": 0.4463, "step": 336 }, { "epoch": 0.30867872681474695, "grad_norm": 0.6450587511062622, "learning_rate": 3.542592255383586e-05, "loss": 0.6291, "step": 337 }, { "epoch": 0.3095946874284406, "grad_norm": 0.5581932663917542, "learning_rate": 3.489678209703475e-05, "loss": 0.3438, "step": 338 }, { "epoch": 0.3105106480421342, "grad_norm": 0.5771118402481079, "learning_rate": 3.437078666147292e-05, "loss": 0.3232, "step": 339 }, { "epoch": 0.3114266086558278, "grad_norm": 0.5707378387451172, "learning_rate": 3.3847961657058845e-05, "loss": 0.4577, "step": 340 }, { "epoch": 0.3123425692695214, "grad_norm": 0.7203882336616516, "learning_rate": 3.332833234054331e-05, "loss": 0.6825, "step": 341 }, { "epoch": 0.313258529883215, "grad_norm": 0.7787111401557922, "learning_rate": 3.281192381429894e-05, "loss": 0.5245, "step": 342 }, { "epoch": 0.31417449049690865, "grad_norm": 0.5097988247871399, "learning_rate": 3.2298761025107706e-05, "loss": 0.4066, "step": 343 }, { "epoch": 0.31509045111060224, "grad_norm": 0.7460060119628906, "learning_rate": 3.178886876295578e-05, "loss": 0.5263, "step": 344 }, { "epoch": 0.31600641172429583, "grad_norm": 0.5440467000007629, "learning_rate": 3.1282271659835946e-05, "loss": 0.4018, "step": 345 }, { "epoch": 0.3169223723379895, "grad_norm": 0.5210183262825012, "learning_rate": 3.077899418855772e-05, "loss": 0.3788, "step": 346 }, { "epoch": 0.31783833295168307, "grad_norm": 0.7923403978347778, "learning_rate": 3.0279060661565028e-05, "loss": 0.3204, "step": 347 }, { "epoch": 0.3187542935653767, "grad_norm": 0.6634204387664795, "learning_rate": 2.9782495229761808e-05, "loss": 0.4491, "step": 348 }, { "epoch": 0.3196702541790703, "grad_norm": 0.6991851925849915, "learning_rate": 2.9289321881345254e-05, "loss": 0.4511, "step": 349 }, { "epoch": 0.3205862147927639, "grad_norm": 0.48899880051612854, "learning_rate": 2.879956444064703e-05, "loss": 0.4142, "step": 350 }, { "epoch": 0.32150217540645754, "grad_norm": 1.1039400100708008, "learning_rate": 2.8313246566982345e-05, "loss": 0.5192, "step": 351 }, { "epoch": 0.3224181360201511, "grad_norm": 0.6065302491188049, "learning_rate": 2.783039175350699e-05, "loss": 0.4985, "step": 352 }, { "epoch": 0.32333409663384477, "grad_norm": 0.5304561257362366, "learning_rate": 2.735102332608247e-05, "loss": 0.3723, "step": 353 }, { "epoch": 0.32425005724753836, "grad_norm": 0.594549834728241, "learning_rate": 2.6875164442149147e-05, "loss": 0.4838, "step": 354 }, { "epoch": 0.32516601786123195, "grad_norm": 0.7670130729675293, "learning_rate": 2.640283808960754e-05, "loss": 0.4514, "step": 355 }, { "epoch": 0.3260819784749256, "grad_norm": 0.7829962968826294, "learning_rate": 2.5934067085707834e-05, "loss": 0.8716, "step": 356 }, { "epoch": 0.3269979390886192, "grad_norm": 0.6512992978096008, "learning_rate": 2.54688740759476e-05, "loss": 0.7148, "step": 357 }, { "epoch": 0.3279138997023128, "grad_norm": 0.5506056547164917, "learning_rate": 2.500728153297788e-05, "loss": 0.4458, "step": 358 }, { "epoch": 0.3288298603160064, "grad_norm": 0.8548818230628967, "learning_rate": 2.4549311755517457e-05, "loss": 0.4366, "step": 359 }, { "epoch": 0.3297458209297, "grad_norm": 0.5396615862846375, "learning_rate": 2.409498686727587e-05, "loss": 0.5165, "step": 360 }, { "epoch": 0.33066178154339365, "grad_norm": 0.5458703637123108, "learning_rate": 2.364432881588431e-05, "loss": 0.512, "step": 361 }, { "epoch": 0.33157774215708724, "grad_norm": 0.6842907071113586, "learning_rate": 2.3197359371835802e-05, "loss": 0.5221, "step": 362 }, { "epoch": 0.33249370277078083, "grad_norm": 0.6149701476097107, "learning_rate": 2.275410012743303e-05, "loss": 0.5035, "step": 363 }, { "epoch": 0.3334096633844745, "grad_norm": 0.46057558059692383, "learning_rate": 2.2314572495745746e-05, "loss": 0.4132, "step": 364 }, { "epoch": 0.33432562399816806, "grad_norm": 0.7470035552978516, "learning_rate": 2.1878797709575847e-05, "loss": 0.4181, "step": 365 }, { "epoch": 0.3352415846118617, "grad_norm": 0.4799354076385498, "learning_rate": 2.1446796820432167e-05, "loss": 0.3314, "step": 366 }, { "epoch": 0.3361575452255553, "grad_norm": 0.7197858691215515, "learning_rate": 2.101859069751301e-05, "loss": 0.575, "step": 367 }, { "epoch": 0.3370735058392489, "grad_norm": 0.7954948544502258, "learning_rate": 2.0594200026698363e-05, "loss": 0.4358, "step": 368 }, { "epoch": 0.33798946645294253, "grad_norm": 0.5921378135681152, "learning_rate": 2.0173645309550548e-05, "loss": 0.3168, "step": 369 }, { "epoch": 0.3389054270666361, "grad_norm": 0.7418331503868103, "learning_rate": 1.9756946862323535e-05, "loss": 0.4903, "step": 370 }, { "epoch": 0.33982138768032977, "grad_norm": 0.5789597034454346, "learning_rate": 1.934412481498198e-05, "loss": 0.4672, "step": 371 }, { "epoch": 0.34073734829402336, "grad_norm": 0.6937307119369507, "learning_rate": 1.8935199110228275e-05, "loss": 0.5296, "step": 372 }, { "epoch": 0.34165330890771695, "grad_norm": 1.0454219579696655, "learning_rate": 1.8530189502539607e-05, "loss": 0.4224, "step": 373 }, { "epoch": 0.3425692695214106, "grad_norm": 0.5890056490898132, "learning_rate": 1.8129115557213262e-05, "loss": 0.8359, "step": 374 }, { "epoch": 0.3434852301351042, "grad_norm": 0.4845948815345764, "learning_rate": 1.7731996649421802e-05, "loss": 0.3724, "step": 375 }, { "epoch": 0.3444011907487978, "grad_norm": 0.6197191476821899, "learning_rate": 1.7338851963276825e-05, "loss": 0.5535, "step": 376 }, { "epoch": 0.3453171513624914, "grad_norm": 1.0201290845870972, "learning_rate": 1.6949700490902344e-05, "loss": 0.7269, "step": 377 }, { "epoch": 0.346233111976185, "grad_norm": 0.9290168881416321, "learning_rate": 1.656456103151728e-05, "loss": 0.5618, "step": 378 }, { "epoch": 0.34714907258987865, "grad_norm": 0.7306623458862305, "learning_rate": 1.6183452190527316e-05, "loss": 0.4795, "step": 379 }, { "epoch": 0.34806503320357224, "grad_norm": 0.6636963486671448, "learning_rate": 1.580639237862608e-05, "loss": 0.3895, "step": 380 }, { "epoch": 0.3489809938172659, "grad_norm": 1.0929412841796875, "learning_rate": 1.543339981090578e-05, "loss": 0.6531, "step": 381 }, { "epoch": 0.3498969544309595, "grad_norm": 0.6249703764915466, "learning_rate": 1.5064492505977234e-05, "loss": 0.4611, "step": 382 }, { "epoch": 0.35081291504465306, "grad_norm": 0.734741747379303, "learning_rate": 1.4699688285099489e-05, "loss": 0.8428, "step": 383 }, { "epoch": 0.3517288756583467, "grad_norm": 0.5528324246406555, "learning_rate": 1.433900477131882e-05, "loss": 0.5264, "step": 384 }, { "epoch": 0.3526448362720403, "grad_norm": 0.7774396538734436, "learning_rate": 1.3982459388617452e-05, "loss": 0.4981, "step": 385 }, { "epoch": 0.35356079688573394, "grad_norm": 0.7864802479743958, "learning_rate": 1.363006936107183e-05, "loss": 0.4647, "step": 386 }, { "epoch": 0.35447675749942753, "grad_norm": 0.624590277671814, "learning_rate": 1.328185171202052e-05, "loss": 0.4544, "step": 387 }, { "epoch": 0.3553927181131211, "grad_norm": 0.7023553848266602, "learning_rate": 1.29378232632419e-05, "loss": 0.5434, "step": 388 }, { "epoch": 0.35630867872681476, "grad_norm": 0.8227251172065735, "learning_rate": 1.259800063414146e-05, "loss": 0.7397, "step": 389 }, { "epoch": 0.35722463934050835, "grad_norm": 0.7069660425186157, "learning_rate": 1.2262400240949023e-05, "loss": 0.4147, "step": 390 }, { "epoch": 0.35814059995420194, "grad_norm": 0.5528481602668762, "learning_rate": 1.1931038295925645e-05, "loss": 0.4243, "step": 391 }, { "epoch": 0.3590565605678956, "grad_norm": 0.4696814715862274, "learning_rate": 1.1603930806580444e-05, "loss": 0.4239, "step": 392 }, { "epoch": 0.3599725211815892, "grad_norm": 1.4645037651062012, "learning_rate": 1.1281093574897338e-05, "loss": 0.455, "step": 393 }, { "epoch": 0.3608884817952828, "grad_norm": 0.7214867472648621, "learning_rate": 1.0962542196571634e-05, "loss": 0.4941, "step": 394 }, { "epoch": 0.3618044424089764, "grad_norm": 0.5920029878616333, "learning_rate": 1.0648292060256649e-05, "loss": 0.4427, "step": 395 }, { "epoch": 0.36272040302267, "grad_norm": 0.7575090527534485, "learning_rate": 1.0338358346820353e-05, "loss": 0.4352, "step": 396 }, { "epoch": 0.36363636363636365, "grad_norm": 0.5316157341003418, "learning_rate": 1.0032756028611878e-05, "loss": 0.4208, "step": 397 }, { "epoch": 0.36455232425005724, "grad_norm": 0.5491222739219666, "learning_rate": 9.731499868738447e-06, "loss": 0.3124, "step": 398 }, { "epoch": 0.3654682848637509, "grad_norm": 0.7701692581176758, "learning_rate": 9.434604420351911e-06, "loss": 0.3397, "step": 399 }, { "epoch": 0.36638424547744447, "grad_norm": 0.5326722264289856, "learning_rate": 9.142084025945984e-06, "loss": 0.3623, "step": 400 }, { "epoch": 0.36638424547744447, "eval_loss": 0.05946353077888489, "eval_runtime": 435.6161, "eval_samples_per_second": 3.342, "eval_steps_per_second": 0.836, "step": 400 }, { "epoch": 0.36730020609113806, "grad_norm": 0.6355307698249817, "learning_rate": 8.853952816663213e-06, "loss": 0.4239, "step": 401 }, { "epoch": 0.3682161667048317, "grad_norm": 0.5561894774436951, "learning_rate": 8.570224711612385e-06, "loss": 0.4807, "step": 402 }, { "epoch": 0.3691321273185253, "grad_norm": 0.6513518691062927, "learning_rate": 8.290913417196177e-06, "loss": 0.3068, "step": 403 }, { "epoch": 0.37004808793221894, "grad_norm": 0.6410642862319946, "learning_rate": 8.016032426448817e-06, "loss": 0.476, "step": 404 }, { "epoch": 0.37096404854591253, "grad_norm": 1.7064963579177856, "learning_rate": 7.745595018384578e-06, "loss": 0.7487, "step": 405 }, { "epoch": 0.3718800091596061, "grad_norm": 0.8433213233947754, "learning_rate": 7.479614257355971e-06, "loss": 0.4346, "step": 406 }, { "epoch": 0.37279596977329976, "grad_norm": 0.736240804195404, "learning_rate": 7.2181029924228814e-06, "loss": 0.4794, "step": 407 }, { "epoch": 0.37371193038699335, "grad_norm": 0.6535281538963318, "learning_rate": 6.961073856731648e-06, "loss": 0.5845, "step": 408 }, { "epoch": 0.374627891000687, "grad_norm": 0.5158947706222534, "learning_rate": 6.708539266905001e-06, "loss": 0.4633, "step": 409 }, { "epoch": 0.3755438516143806, "grad_norm": 0.7100207805633545, "learning_rate": 6.460511422441984e-06, "loss": 0.3909, "step": 410 }, { "epoch": 0.3764598122280742, "grad_norm": 0.49073487520217896, "learning_rate": 6.217002305128849e-06, "loss": 0.347, "step": 411 }, { "epoch": 0.3773757728417678, "grad_norm": 0.8159440159797668, "learning_rate": 5.978023678460099e-06, "loss": 0.6058, "step": 412 }, { "epoch": 0.3782917334554614, "grad_norm": 0.6300156712532043, "learning_rate": 5.743587087070235e-06, "loss": 0.566, "step": 413 }, { "epoch": 0.379207694069155, "grad_norm": 0.605778694152832, "learning_rate": 5.5137038561761115e-06, "loss": 0.5922, "step": 414 }, { "epoch": 0.38012365468284864, "grad_norm": 0.5530845522880554, "learning_rate": 5.2883850910297235e-06, "loss": 0.4646, "step": 415 }, { "epoch": 0.38103961529654223, "grad_norm": 0.5650749206542969, "learning_rate": 5.067641676381918e-06, "loss": 0.34, "step": 416 }, { "epoch": 0.3819555759102359, "grad_norm": 0.7632530331611633, "learning_rate": 4.8514842759563306e-06, "loss": 0.4461, "step": 417 }, { "epoch": 0.38287153652392947, "grad_norm": 0.9296861290931702, "learning_rate": 4.639923331934471e-06, "loss": 0.7671, "step": 418 }, { "epoch": 0.38378749713762306, "grad_norm": 0.5393553376197815, "learning_rate": 4.432969064451109e-06, "loss": 0.4055, "step": 419 }, { "epoch": 0.3847034577513167, "grad_norm": 0.6695364713668823, "learning_rate": 4.230631471100655e-06, "loss": 0.5134, "step": 420 }, { "epoch": 0.3856194183650103, "grad_norm": 0.5043479800224304, "learning_rate": 4.032920326454159e-06, "loss": 0.3956, "step": 421 }, { "epoch": 0.38653537897870394, "grad_norm": 1.0091841220855713, "learning_rate": 3.839845181587098e-06, "loss": 0.3713, "step": 422 }, { "epoch": 0.3874513395923975, "grad_norm": 0.4001891314983368, "learning_rate": 3.6514153636180383e-06, "loss": 0.2791, "step": 423 }, { "epoch": 0.3883673002060911, "grad_norm": 0.626771867275238, "learning_rate": 3.467639975257997e-06, "loss": 0.3861, "step": 424 }, { "epoch": 0.38928326081978476, "grad_norm": 0.4506821930408478, "learning_rate": 3.288527894370752e-06, "loss": 0.3876, "step": 425 }, { "epoch": 0.39019922143347835, "grad_norm": 0.7395780682563782, "learning_rate": 3.1140877735439387e-06, "loss": 0.51, "step": 426 }, { "epoch": 0.391115182047172, "grad_norm": 0.6306988596916199, "learning_rate": 2.944328039671085e-06, "loss": 0.4696, "step": 427 }, { "epoch": 0.3920311426608656, "grad_norm": 0.7126988768577576, "learning_rate": 2.7792568935444796e-06, "loss": 0.4441, "step": 428 }, { "epoch": 0.3929471032745592, "grad_norm": 0.5041316151618958, "learning_rate": 2.618882309459081e-06, "loss": 0.3623, "step": 429 }, { "epoch": 0.3938630638882528, "grad_norm": 0.6836007237434387, "learning_rate": 2.4632120348272003e-06, "loss": 0.7209, "step": 430 }, { "epoch": 0.3947790245019464, "grad_norm": 0.5515868067741394, "learning_rate": 2.312253589804314e-06, "loss": 0.3443, "step": 431 }, { "epoch": 0.39569498511564005, "grad_norm": 0.5961638689041138, "learning_rate": 2.166014266925731e-06, "loss": 0.3673, "step": 432 }, { "epoch": 0.39661094572933364, "grad_norm": 0.5713266134262085, "learning_rate": 2.0245011307543416e-06, "loss": 0.3757, "step": 433 }, { "epoch": 0.39752690634302723, "grad_norm": 0.7907454967498779, "learning_rate": 1.88772101753929e-06, "loss": 0.5574, "step": 434 }, { "epoch": 0.3984428669567209, "grad_norm": 0.5408697128295898, "learning_rate": 1.7556805348858064e-06, "loss": 0.4235, "step": 435 }, { "epoch": 0.39935882757041447, "grad_norm": 0.8136631846427917, "learning_rate": 1.6283860614358936e-06, "loss": 0.5299, "step": 436 }, { "epoch": 0.4002747881841081, "grad_norm": 0.6327822208404541, "learning_rate": 1.5058437465602982e-06, "loss": 0.5456, "step": 437 }, { "epoch": 0.4011907487978017, "grad_norm": 0.5776308178901672, "learning_rate": 1.3880595100613792e-06, "loss": 0.5112, "step": 438 }, { "epoch": 0.4021067094114953, "grad_norm": 0.4584222435951233, "learning_rate": 1.2750390418871604e-06, "loss": 0.3786, "step": 439 }, { "epoch": 0.40302267002518893, "grad_norm": 0.5486764311790466, "learning_rate": 1.1667878018564171e-06, "loss": 0.488, "step": 440 }, { "epoch": 0.4039386306388825, "grad_norm": 0.510213315486908, "learning_rate": 1.063311019395008e-06, "loss": 0.535, "step": 441 }, { "epoch": 0.4048545912525761, "grad_norm": 0.5908893346786499, "learning_rate": 9.64613693283123e-07, "loss": 0.4208, "step": 442 }, { "epoch": 0.40577055186626976, "grad_norm": 0.9434095025062561, "learning_rate": 8.707005914139422e-07, "loss": 0.6034, "step": 443 }, { "epoch": 0.40668651247996335, "grad_norm": 0.42651069164276123, "learning_rate": 7.815762505632096e-07, "loss": 0.3516, "step": 444 }, { "epoch": 0.407602473093657, "grad_norm": 0.6566410064697266, "learning_rate": 6.972449761700861e-07, "loss": 0.4768, "step": 445 }, { "epoch": 0.4085184337073506, "grad_norm": 0.6780850291252136, "learning_rate": 6.177108421292266e-07, "loss": 0.4385, "step": 446 }, { "epoch": 0.40943439432104417, "grad_norm": 0.6476641297340393, "learning_rate": 5.429776905938489e-07, "loss": 0.6201, "step": 447 }, { "epoch": 0.4103503549347378, "grad_norm": 0.5498802065849304, "learning_rate": 4.7304913179025965e-07, "loss": 0.4303, "step": 448 }, { "epoch": 0.4112663155484314, "grad_norm": 0.722607433795929, "learning_rate": 4.0792854384338333e-07, "loss": 0.5461, "step": 449 }, { "epoch": 0.41218227616212505, "grad_norm": 0.7780727744102478, "learning_rate": 3.4761907261356976e-07, "loss": 0.5537, "step": 450 }, { "epoch": 0.41309823677581864, "grad_norm": 0.5408957600593567, "learning_rate": 2.921236315446385e-07, "loss": 0.4887, "step": 451 }, { "epoch": 0.41401419738951223, "grad_norm": 0.7068083882331848, "learning_rate": 2.414449015231357e-07, "loss": 0.6064, "step": 452 }, { "epoch": 0.4149301580032059, "grad_norm": 0.6516684889793396, "learning_rate": 1.9558533074882646e-07, "loss": 0.6653, "step": 453 }, { "epoch": 0.41584611861689946, "grad_norm": 0.5151285529136658, "learning_rate": 1.545471346164007e-07, "loss": 0.4024, "step": 454 }, { "epoch": 0.4167620792305931, "grad_norm": 0.6734718084335327, "learning_rate": 1.1833229560848092e-07, "loss": 0.6197, "step": 455 }, { "epoch": 0.4176780398442867, "grad_norm": 0.5829524993896484, "learning_rate": 8.694256319987659e-08, "loss": 0.5708, "step": 456 }, { "epoch": 0.4185940004579803, "grad_norm": 0.5721414685249329, "learning_rate": 6.037945377297405e-08, "loss": 0.2411, "step": 457 }, { "epoch": 0.41950996107167393, "grad_norm": 0.47081419825553894, "learning_rate": 3.8644250544594975e-08, "loss": 0.3974, "step": 458 }, { "epoch": 0.4204259216853675, "grad_norm": 0.5603060126304626, "learning_rate": 2.1738003503946057e-08, "loss": 0.5078, "step": 459 }, { "epoch": 0.42134188229906117, "grad_norm": 0.6785393953323364, "learning_rate": 9.661529361892907e-09, "loss": 0.4146, "step": 460 }, { "epoch": 0.42225784291275476, "grad_norm": 0.7074640989303589, "learning_rate": 2.4154115115360144e-09, "loss": 0.3551, "step": 461 }, { "epoch": 0.42317380352644834, "grad_norm": 0.7316946983337402, "learning_rate": 0.0, "loss": 0.5625, "step": 462 } ], "logging_steps": 1, "max_steps": 462, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1453536664210637e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }