{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9808429118773945, "eval_steps": 500, "global_step": 65, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.2323372662067413, "epoch": 0.03065134099616858, "grad_norm": 53.25, "learning_rate": 0.0, "loss": 2.7706, "mean_token_accuracy": 0.41634324193000793, "num_tokens": 1244.0, "step": 1 }, { "entropy": 2.174584299325943, "epoch": 0.06130268199233716, "grad_norm": 36.0, "learning_rate": 2e-06, "loss": 2.4332, "mean_token_accuracy": 0.41893551871180534, "num_tokens": 3427.0, "step": 2 }, { "entropy": 2.0810845494270325, "epoch": 0.09195402298850575, "grad_norm": 27.75, "learning_rate": 4e-06, "loss": 2.2604, "mean_token_accuracy": 0.4491094872355461, "num_tokens": 5582.0, "step": 3 }, { "entropy": 2.389508530497551, "epoch": 0.12260536398467432, "grad_norm": 28.625, "learning_rate": 6e-06, "loss": 2.224, "mean_token_accuracy": 0.47163403779268265, "num_tokens": 7064.0, "step": 4 }, { "entropy": 2.3899217396974564, "epoch": 0.1532567049808429, "grad_norm": 17.0, "learning_rate": 8e-06, "loss": 1.9894, "mean_token_accuracy": 0.4873850643634796, "num_tokens": 9091.0, "step": 5 }, { "entropy": 2.3988372683525085, "epoch": 0.1839080459770115, "grad_norm": 22.375, "learning_rate": 9.999999999999999e-06, "loss": 2.0726, "mean_token_accuracy": 0.5061133019626141, "num_tokens": 10556.0, "step": 6 }, { "entropy": 2.395625740289688, "epoch": 0.21455938697318008, "grad_norm": 16.75, "learning_rate": 1.2e-05, "loss": 2.0064, "mean_token_accuracy": 0.5037284195423126, "num_tokens": 12215.0, "step": 7 }, { "entropy": 2.2998499274253845, "epoch": 0.24521072796934865, "grad_norm": 14.5625, "learning_rate": 1.4e-05, "loss": 1.7784, "mean_token_accuracy": 0.5325785167515278, "num_tokens": 13939.0, "step": 8 }, { "entropy": 2.233474910259247, "epoch": 0.27586206896551724, "grad_norm": 14.6875, "learning_rate": 1.6e-05, "loss": 1.7552, "mean_token_accuracy": 0.5224817767739296, "num_tokens": 15986.0, "step": 9 }, { "entropy": 2.1560849398374557, "epoch": 0.3065134099616858, "grad_norm": 12.125, "learning_rate": 1.8e-05, "loss": 1.7487, "mean_token_accuracy": 0.5436614826321602, "num_tokens": 18444.0, "step": 10 }, { "entropy": 1.8782547265291214, "epoch": 0.3371647509578544, "grad_norm": 11.1875, "learning_rate": 1.9999999999999998e-05, "loss": 1.5774, "mean_token_accuracy": 0.5730905011296272, "num_tokens": 21127.0, "step": 11 }, { "entropy": 2.0860691219568253, "epoch": 0.367816091954023, "grad_norm": 13.125, "learning_rate": 2.2e-05, "loss": 1.8279, "mean_token_accuracy": 0.5077806040644646, "num_tokens": 23308.0, "step": 12 }, { "entropy": 2.0839987099170685, "epoch": 0.39846743295019155, "grad_norm": 13.5, "learning_rate": 2.4e-05, "loss": 1.8629, "mean_token_accuracy": 0.5324465520679951, "num_tokens": 25072.0, "step": 13 }, { "entropy": 2.211606591939926, "epoch": 0.42911877394636017, "grad_norm": 15.3125, "learning_rate": 2.6000000000000002e-05, "loss": 1.934, "mean_token_accuracy": 0.513655960559845, "num_tokens": 26450.0, "step": 14 }, { "entropy": 2.2505457401275635, "epoch": 0.45977011494252873, "grad_norm": 14.8125, "learning_rate": 2.8e-05, "loss": 1.7603, "mean_token_accuracy": 0.5480454824864864, "num_tokens": 27912.0, "step": 15 }, { "entropy": 2.187108889222145, "epoch": 0.4904214559386973, "grad_norm": 13.125, "learning_rate": 3e-05, "loss": 1.6138, "mean_token_accuracy": 0.5843819156289101, "num_tokens": 29392.0, "step": 16 }, { "entropy": 2.0149056166410446, "epoch": 0.5210727969348659, "grad_norm": 9.9375, "learning_rate": 2.998951057182598e-05, "loss": 1.4549, "mean_token_accuracy": 0.597277820110321, "num_tokens": 31417.0, "step": 17 }, { "entropy": 1.9988498389720917, "epoch": 0.5517241379310345, "grad_norm": 11.0, "learning_rate": 2.99580569577177e-05, "loss": 1.7097, "mean_token_accuracy": 0.5442679524421692, "num_tokens": 33727.0, "step": 18 }, { "entropy": 1.8304037749767303, "epoch": 0.5823754789272031, "grad_norm": 10.125, "learning_rate": 2.9905683148398642e-05, "loss": 1.5381, "mean_token_accuracy": 0.5851795524358749, "num_tokens": 35836.0, "step": 19 }, { "entropy": 1.891087457537651, "epoch": 0.6130268199233716, "grad_norm": 12.625, "learning_rate": 2.9832462393376926e-05, "loss": 1.6876, "mean_token_accuracy": 0.5546146482229233, "num_tokens": 37639.0, "step": 20 }, { "entropy": 1.9664306491613388, "epoch": 0.6436781609195402, "grad_norm": 12.125, "learning_rate": 2.9738497098499325e-05, "loss": 1.7271, "mean_token_accuracy": 0.5344564504921436, "num_tokens": 39351.0, "step": 21 }, { "entropy": 1.7850568294525146, "epoch": 0.6743295019157088, "grad_norm": 13.375, "learning_rate": 2.9623918682727355e-05, "loss": 1.524, "mean_token_accuracy": 0.5623632185161114, "num_tokens": 41024.0, "step": 22 }, { "entropy": 1.898742452263832, "epoch": 0.7049808429118773, "grad_norm": 13.0, "learning_rate": 2.9488887394336025e-05, "loss": 1.732, "mean_token_accuracy": 0.5667595192790031, "num_tokens": 42624.0, "step": 23 }, { "entropy": 2.062256097793579, "epoch": 0.735632183908046, "grad_norm": 15.0625, "learning_rate": 2.9333592086792113e-05, "loss": 1.8659, "mean_token_accuracy": 0.5371430143713951, "num_tokens": 43836.0, "step": 24 }, { "entropy": 1.9839176535606384, "epoch": 0.7662835249042146, "grad_norm": 10.4375, "learning_rate": 2.9158249954625514e-05, "loss": 1.7355, "mean_token_accuracy": 0.548308789730072, "num_tokens": 45870.0, "step": 25 }, { "entropy": 2.005643382668495, "epoch": 0.7969348659003831, "grad_norm": 10.6875, "learning_rate": 2.8963106229663064e-05, "loss": 1.6277, "mean_token_accuracy": 0.577509343624115, "num_tokens": 47664.0, "step": 26 }, { "entropy": 2.015763074159622, "epoch": 0.8275862068965517, "grad_norm": 10.875, "learning_rate": 2.8748433838049642e-05, "loss": 1.6878, "mean_token_accuracy": 0.5588897317647934, "num_tokens": 49646.0, "step": 27 }, { "entropy": 2.0416687428951263, "epoch": 0.8582375478927203, "grad_norm": 13.0, "learning_rate": 2.8514533018536286e-05, "loss": 1.5327, "mean_token_accuracy": 0.5883619785308838, "num_tokens": 51235.0, "step": 28 }, { "entropy": 2.029404863715172, "epoch": 0.8888888888888888, "grad_norm": 10.8125, "learning_rate": 2.8261730902569146e-05, "loss": 1.6362, "mean_token_accuracy": 0.5863424465060234, "num_tokens": 53037.0, "step": 29 }, { "entropy": 2.0645615607500076, "epoch": 0.9195402298850575, "grad_norm": 10.0625, "learning_rate": 2.7990381056766583e-05, "loss": 1.6623, "mean_token_accuracy": 0.5610311627388, "num_tokens": 54826.0, "step": 30 }, { "entropy": 2.090387746691704, "epoch": 0.9501915708812261, "grad_norm": 12.0, "learning_rate": 2.770086298842426e-05, "loss": 1.6578, "mean_token_accuracy": 0.5568758621811867, "num_tokens": 56737.0, "step": 31 }, { "entropy": 2.0354464948177338, "epoch": 0.9808429118773946, "grad_norm": 12.5625, "learning_rate": 2.7393581614739924e-05, "loss": 1.6745, "mean_token_accuracy": 0.5604493953287601, "num_tokens": 58084.0, "step": 32 }, { "entropy": 1.7894673347473145, "epoch": 1.0, "grad_norm": 12.4375, "learning_rate": 2.7068966696500025e-05, "loss": 1.6188, "mean_token_accuracy": 0.5824247837066651, "num_tokens": 59142.0, "step": 33 }, { "entropy": 1.63651242852211, "epoch": 1.0306513409961686, "grad_norm": 8.0625, "learning_rate": 2.672747223702045e-05, "loss": 0.9761, "mean_token_accuracy": 0.7217265591025352, "num_tokens": 60897.0, "step": 34 }, { "entropy": 1.7347675114870071, "epoch": 1.0613026819923372, "grad_norm": 9.3125, "learning_rate": 2.6369575847181795e-05, "loss": 1.1561, "mean_token_accuracy": 0.7075180560350418, "num_tokens": 62325.0, "step": 35 }, { "entropy": 1.5030861496925354, "epoch": 1.0919540229885056, "grad_norm": 7.65625, "learning_rate": 2.5995778077447393e-05, "loss": 0.8402, "mean_token_accuracy": 0.7322944924235344, "num_tokens": 64163.0, "step": 36 }, { "entropy": 1.3862270265817642, "epoch": 1.1226053639846743, "grad_norm": 8.5625, "learning_rate": 2.5606601717798212e-05, "loss": 0.9429, "mean_token_accuracy": 0.7389034852385521, "num_tokens": 66168.0, "step": 37 }, { "entropy": 1.3857311755418777, "epoch": 1.1532567049808429, "grad_norm": 7.65625, "learning_rate": 2.520259106656379e-05, "loss": 0.8564, "mean_token_accuracy": 0.7321354225277901, "num_tokens": 68398.0, "step": 38 }, { "entropy": 1.2590633258223534, "epoch": 1.1839080459770115, "grad_norm": 9.75, "learning_rate": 2.4784311169171818e-05, "loss": 0.9376, "mean_token_accuracy": 0.7156714797019958, "num_tokens": 70548.0, "step": 39 }, { "entropy": 1.2306247800588608, "epoch": 1.21455938697318, "grad_norm": 10.9375, "learning_rate": 2.4352347027881003e-05, "loss": 0.8899, "mean_token_accuracy": 0.756280928850174, "num_tokens": 72463.0, "step": 40 }, { "entropy": 1.110754244029522, "epoch": 1.2452107279693487, "grad_norm": 12.125, "learning_rate": 2.3907302783602522e-05, "loss": 0.7503, "mean_token_accuracy": 0.7652318105101585, "num_tokens": 74061.0, "step": 41 }, { "entropy": 1.1396447345614433, "epoch": 1.2758620689655173, "grad_norm": 10.375, "learning_rate": 2.344980087095433e-05, "loss": 0.774, "mean_token_accuracy": 0.7681270688772202, "num_tokens": 76130.0, "step": 42 }, { "entropy": 1.0957090184092522, "epoch": 1.3065134099616857, "grad_norm": 12.4375, "learning_rate": 2.298048114773005e-05, "loss": 0.7757, "mean_token_accuracy": 0.767442375421524, "num_tokens": 77912.0, "step": 43 }, { "entropy": 1.0323160290718079, "epoch": 1.3371647509578544, "grad_norm": 10.625, "learning_rate": 2.25e-05, "loss": 0.7192, "mean_token_accuracy": 0.771703340113163, "num_tokens": 79873.0, "step": 44 }, { "entropy": 1.1174012199044228, "epoch": 1.367816091954023, "grad_norm": 13.1875, "learning_rate": 2.200902942409593e-05, "loss": 0.7571, "mean_token_accuracy": 0.7688822597265244, "num_tokens": 81708.0, "step": 45 }, { "entropy": 1.133009672164917, "epoch": 1.3984674329501916, "grad_norm": 11.4375, "learning_rate": 2.1508256086763372e-05, "loss": 0.8328, "mean_token_accuracy": 0.7457190081477165, "num_tokens": 83479.0, "step": 46 }, { "entropy": 1.0821977257728577, "epoch": 1.4291187739463602, "grad_norm": 12.25, "learning_rate": 2.0998380364796112e-05, "loss": 0.8791, "mean_token_accuracy": 0.7517153918743134, "num_tokens": 85091.0, "step": 47 }, { "entropy": 1.160033829510212, "epoch": 1.4597701149425286, "grad_norm": 10.25, "learning_rate": 2.0480115365495928e-05, "loss": 0.7528, "mean_token_accuracy": 0.7454545870423317, "num_tokens": 87067.0, "step": 48 }, { "entropy": 1.09547870606184, "epoch": 1.4904214559386972, "grad_norm": 8.1875, "learning_rate": 1.995418592932751e-05, "loss": 0.6824, "mean_token_accuracy": 0.8004695847630501, "num_tokens": 89257.0, "step": 49 }, { "entropy": 1.1644561365246773, "epoch": 1.5210727969348659, "grad_norm": 10.125, "learning_rate": 1.9421327616163564e-05, "loss": 0.8229, "mean_token_accuracy": 0.744444377720356, "num_tokens": 91129.0, "step": 50 }, { "entropy": 1.1956558972597122, "epoch": 1.5517241379310345, "grad_norm": 9.1875, "learning_rate": 1.888228567653781e-05, "loss": 0.807, "mean_token_accuracy": 0.7377020716667175, "num_tokens": 93217.0, "step": 51 }, { "entropy": 1.2180762365460396, "epoch": 1.582375478927203, "grad_norm": 9.125, "learning_rate": 1.8337814009344716e-05, "loss": 0.6652, "mean_token_accuracy": 0.7918966636061668, "num_tokens": 94882.0, "step": 52 }, { "entropy": 1.2762009352445602, "epoch": 1.6130268199233715, "grad_norm": 11.625, "learning_rate": 1.778867410744372e-05, "loss": 0.8152, "mean_token_accuracy": 0.7556928023695946, "num_tokens": 96226.0, "step": 53 }, { "entropy": 1.2115763127803802, "epoch": 1.6436781609195403, "grad_norm": 10.8125, "learning_rate": 1.7235633992642615e-05, "loss": 0.7119, "mean_token_accuracy": 0.7653274685144424, "num_tokens": 98064.0, "step": 54 }, { "entropy": 1.301737241446972, "epoch": 1.6743295019157087, "grad_norm": 8.75, "learning_rate": 1.667946714154962e-05, "loss": 0.7362, "mean_token_accuracy": 0.7743538916110992, "num_tokens": 99875.0, "step": 55 }, { "entropy": 1.1645233482122421, "epoch": 1.7049808429118773, "grad_norm": 8.125, "learning_rate": 1.6120951403796367e-05, "loss": 0.7929, "mean_token_accuracy": 0.7437388524413109, "num_tokens": 102303.0, "step": 56 }, { "entropy": 1.2387544885277748, "epoch": 1.735632183908046, "grad_norm": 10.125, "learning_rate": 1.5560867914144887e-05, "loss": 0.7757, "mean_token_accuracy": 0.760113924741745, "num_tokens": 103806.0, "step": 57 }, { "entropy": 1.2401599884033203, "epoch": 1.7662835249042146, "grad_norm": 12.25, "learning_rate": 1.5e-05, "loss": 0.757, "mean_token_accuracy": 0.7870561257004738, "num_tokens": 105012.0, "step": 58 }, { "entropy": 1.3122059255838394, "epoch": 1.7969348659003832, "grad_norm": 11.25, "learning_rate": 1.4439132085855117e-05, "loss": 0.8231, "mean_token_accuracy": 0.7717632800340652, "num_tokens": 106373.0, "step": 59 }, { "entropy": 1.224107950925827, "epoch": 1.8275862068965516, "grad_norm": 9.3125, "learning_rate": 1.3879048596203637e-05, "loss": 0.6616, "mean_token_accuracy": 0.8022700250148773, "num_tokens": 107938.0, "step": 60 }, { "entropy": 1.2059504985809326, "epoch": 1.8582375478927204, "grad_norm": 9.625, "learning_rate": 1.3320532858450382e-05, "loss": 0.7585, "mean_token_accuracy": 0.7686295211315155, "num_tokens": 109587.0, "step": 61 }, { "entropy": 1.2734860181808472, "epoch": 1.8888888888888888, "grad_norm": 12.4375, "learning_rate": 1.2764366007357382e-05, "loss": 1.055, "mean_token_accuracy": 0.707017719745636, "num_tokens": 111253.0, "step": 62 }, { "entropy": 1.1893908977508545, "epoch": 1.9195402298850575, "grad_norm": 11.1875, "learning_rate": 1.2211325892556282e-05, "loss": 0.7912, "mean_token_accuracy": 0.7822966873645782, "num_tokens": 112833.0, "step": 63 }, { "entropy": 1.1533539071679115, "epoch": 1.950191570881226, "grad_norm": 11.125, "learning_rate": 1.1662185990655285e-05, "loss": 0.8553, "mean_token_accuracy": 0.7498924359679222, "num_tokens": 114573.0, "step": 64 }, { "entropy": 1.1270944774150848, "epoch": 1.9808429118773945, "grad_norm": 8.25, "learning_rate": 1.1117714323462188e-05, "loss": 0.7116, "mean_token_accuracy": 0.7686784416437149, "num_tokens": 116981.0, "step": 65 } ], "logging_steps": 1, "max_steps": 99, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3202052021059584.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }