{
  "best_global_step": 190,
  "best_metric": 2.3478477001190186,
  "best_model_checkpoint": "rick-qwen-finetuned/checkpoint-190",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 285,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.1721158534288407,
      "epoch": 0.10610079575596817,
      "grad_norm": 4.033281326293945,
      "learning_rate": 0.00015517241379310346,
      "loss": 4.8926,
      "mean_token_accuracy": 0.37371344938874246,
      "num_tokens": 2064.0,
      "step": 10
    },
    {
      "entropy": 2.7347501456737517,
      "epoch": 0.21220159151193635,
      "grad_norm": 7.625450134277344,
      "learning_rate": 0.0003275862068965517,
      "loss": 2.7008,
      "mean_token_accuracy": 0.5224792890250682,
      "num_tokens": 4434.0,
      "step": 20
    },
    {
      "entropy": 2.2419000223278998,
      "epoch": 0.3183023872679045,
      "grad_norm": 1.8114349842071533,
      "learning_rate": 0.0005,
      "loss": 2.4624,
      "mean_token_accuracy": 0.5613403469324112,
      "num_tokens": 6665.0,
      "step": 30
    },
    {
      "entropy": 2.4120024889707565,
      "epoch": 0.4244031830238727,
      "grad_norm": 1.839812159538269,
      "learning_rate": 0.0004981198836496775,
      "loss": 2.5023,
      "mean_token_accuracy": 0.5351990483701229,
      "num_tokens": 8954.0,
      "step": 40
    },
    {
      "entropy": 2.2711548835039137,
      "epoch": 0.5305039787798409,
      "grad_norm": 2.454310417175293,
      "learning_rate": 0.000492507813298636,
      "loss": 2.5445,
      "mean_token_accuracy": 0.561315081268549,
      "num_tokens": 10917.0,
      "step": 50
    },
    {
      "entropy": 2.3858062833547593,
      "epoch": 0.636604774535809,
      "grad_norm": 12.571678161621094,
      "learning_rate": 0.00048324819970868473,
      "loss": 2.4936,
      "mean_token_accuracy": 0.5213245801627636,
      "num_tokens": 13163.0,
      "step": 60
    },
    {
      "entropy": 2.3039768010377886,
      "epoch": 0.7427055702917772,
      "grad_norm": 1.9571681022644043,
      "learning_rate": 0.00047048031608708875,
      "loss": 2.4157,
      "mean_token_accuracy": 0.5572853125631809,
      "num_tokens": 15404.0,
      "step": 70
    },
    {
      "entropy": 2.301254630088806,
      "epoch": 0.8488063660477454,
      "grad_norm": 1.962308406829834,
      "learning_rate": 0.00045439620328789593,
      "loss": 2.4493,
      "mean_token_accuracy": 0.5377297826111317,
      "num_tokens": 17936.0,
      "step": 80
    },
    {
      "entropy": 2.3037476420402525,
      "epoch": 0.9549071618037135,
      "grad_norm": 2.418731451034546,
      "learning_rate": 0.0004352377813387398,
      "loss": 2.4503,
      "mean_token_accuracy": 0.5464393310248852,
      "num_tokens": 20214.0,
      "step": 90
    },
    {
      "epoch": 1.0,
      "eval_entropy": 2.167915307340168,
      "eval_loss": 2.4205760955810547,
      "eval_mean_token_accuracy": 0.5355801603623799,
      "eval_num_tokens": 21095.0,
      "eval_runtime": 18.0724,
      "eval_samples_per_second": 2.324,
      "eval_steps_per_second": 2.324,
      "step": 95
    },
    {
      "entropy": 2.109248090434719,
      "epoch": 1.0530503978779842,
      "grad_norm": 1.6356357336044312,
      "learning_rate": 0.0004132932107384442,
      "loss": 2.0461,
      "mean_token_accuracy": 0.5923033171408886,
      "num_tokens": 22568.0,
      "step": 100
    },
    {
      "entropy": 1.6169874876737595,
      "epoch": 1.1591511936339522,
      "grad_norm": 3.5763280391693115,
      "learning_rate": 0.00038889255825490053,
      "loss": 1.6699,
      "mean_token_accuracy": 0.6313242256641388,
      "num_tokens": 24985.0,
      "step": 110
    },
    {
      "entropy": 1.4064562141895294,
      "epoch": 1.2652519893899203,
      "grad_norm": 2.4848248958587646,
      "learning_rate": 0.0003624028324136517,
      "loss": 1.3988,
      "mean_token_accuracy": 0.6858954817056656,
      "num_tokens": 27037.0,
      "step": 120
    },
    {
      "entropy": 1.5654033362865447,
      "epoch": 1.3713527851458887,
      "grad_norm": 2.5782299041748047,
      "learning_rate": 0.00033422246334805503,
      "loss": 1.6258,
      "mean_token_accuracy": 0.6557254463434219,
      "num_tokens": 29468.0,
      "step": 130
    },
    {
      "entropy": 1.4926183179020882,
      "epoch": 1.4774535809018567,
      "grad_norm": 2.629681348800659,
      "learning_rate": 0.0003047753100392174,
      "loss": 1.5027,
      "mean_token_accuracy": 0.6912050604820251,
      "num_tokens": 31213.0,
      "step": 140
    },
    {
      "entropy": 1.383179245889187,
      "epoch": 1.5835543766578248,
      "grad_norm": 2.4897210597991943,
      "learning_rate": 0.0002745042850823902,
      "loss": 1.5551,
      "mean_token_accuracy": 0.6832514323294163,
      "num_tokens": 33509.0,
      "step": 150
    },
    {
      "entropy": 1.6858038201928138,
      "epoch": 1.6896551724137931,
      "grad_norm": 2.9458775520324707,
      "learning_rate": 0.00024386469286927196,
      "loss": 1.6896,
      "mean_token_accuracy": 0.651208619773388,
      "num_tokens": 35740.0,
      "step": 160
    },
    {
      "entropy": 1.3946410089731216,
      "epoch": 1.7957559681697612,
      "grad_norm": 2.711728572845459,
      "learning_rate": 0.00021331738138615958,
      "loss": 1.3121,
      "mean_token_accuracy": 0.6985804051160812,
      "num_tokens": 37788.0,
      "step": 170
    },
    {
      "entropy": 1.492286352813244,
      "epoch": 1.9018567639257293,
      "grad_norm": 2.2474756240844727,
      "learning_rate": 0.00018332181063127542,
      "loss": 1.5417,
      "mean_token_accuracy": 0.6719372659921646,
      "num_tokens": 40106.0,
      "step": 180
    },
    {
      "entropy": 1.45025778461147,
      "epoch": 2.0,
      "grad_norm": 4.529512405395508,
      "learning_rate": 0.00015432914190872756,
      "loss": 1.5463,
      "mean_token_accuracy": 0.6596984573312707,
      "num_tokens": 42190.0,
      "step": 190
    },
    {
      "epoch": 2.0,
      "eval_entropy": 1.8587926966803414,
      "eval_loss": 2.3478477001190186,
      "eval_mean_token_accuracy": 0.5531174611477625,
      "eval_num_tokens": 42190.0,
      "eval_runtime": 18.1194,
      "eval_samples_per_second": 2.318,
      "eval_steps_per_second": 2.318,
      "step": 190
    },
    {
      "entropy": 0.9160950664430857,
      "epoch": 2.1061007957559683,
      "grad_norm": 2.8789069652557373,
      "learning_rate": 0.00012677545194255402,
      "loss": 0.7246,
      "mean_token_accuracy": 0.8295891240239144,
      "num_tokens": 44864.0,
      "step": 200
    },
    {
      "entropy": 0.7227096475660801,
      "epoch": 2.212201591511936,
      "grad_norm": 2.2743287086486816,
      "learning_rate": 0.00010107517387689166,
      "loss": 0.7039,
      "mean_token_accuracy": 0.8215794518589974,
      "num_tokens": 47199.0,
      "step": 210
    },
    {
      "entropy": 0.6997399874031544,
      "epoch": 2.3183023872679045,
      "grad_norm": 2.3004889488220215,
      "learning_rate": 7.761486381573326e-05,
      "loss": 0.6166,
      "mean_token_accuracy": 0.8439405784010887,
      "num_tokens": 49392.0,
      "step": 220
    },
    {
      "entropy": 0.6859173461794853,
      "epoch": 2.424403183023873,
      "grad_norm": 1.3413803577423096,
      "learning_rate": 5.6747386659315755e-05,
      "loss": 0.6388,
      "mean_token_accuracy": 0.8292661786079407,
      "num_tokens": 51499.0,
      "step": 230
    },
    {
      "entropy": 0.6497876241803169,
      "epoch": 2.5305039787798407,
      "grad_norm": 3.4095118045806885,
      "learning_rate": 3.878660868757322e-05,
      "loss": 0.7073,
      "mean_token_accuracy": 0.827797320485115,
      "num_tokens": 53730.0,
      "step": 240
    },
    {
      "entropy": 0.6810318753123283,
      "epoch": 2.636604774535809,
      "grad_norm": 1.5059446096420288,
      "learning_rate": 2.4002676719139166e-05,
      "loss": 0.6202,
      "mean_token_accuracy": 0.8435622423887252,
      "num_tokens": 55788.0,
      "step": 250
    },
    {
      "entropy": 0.6908333510160446,
      "epoch": 2.7427055702917773,
      "grad_norm": 2.3477425575256348,
      "learning_rate": 1.2617954851740832e-05,
      "loss": 0.5653,
      "mean_token_accuracy": 0.8467133089900016,
      "num_tokens": 57735.0,
      "step": 260
    },
    {
      "entropy": 0.6482014119625091,
      "epoch": 2.8488063660477456,
      "grad_norm": 1.8979235887527466,
      "learning_rate": 4.803679899192393e-06,
      "loss": 0.6111,
      "mean_token_accuracy": 0.8416899383068085,
      "num_tokens": 60064.0,
      "step": 270
    },
    {
      "entropy": 0.6196089655160903,
      "epoch": 2.9549071618037135,
      "grad_norm": 2.2915163040161133,
      "learning_rate": 6.773858303274482e-07,
      "loss": 0.6357,
      "mean_token_accuracy": 0.8439681366086006,
      "num_tokens": 62293.0,
      "step": 280
    },
    {
      "epoch": 3.0,
      "eval_entropy": 1.2214261406943911,
      "eval_loss": 2.704627752304077,
      "eval_mean_token_accuracy": 0.554165651400884,
      "eval_num_tokens": 63285.0,
      "eval_runtime": 18.1229,
      "eval_samples_per_second": 2.318,
      "eval_steps_per_second": 2.318,
      "step": 285
    }
  ],
  "logging_steps": 10,
  "max_steps": 285,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1429848321131520.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}