{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0025662959794697,
  "eval_steps": 147,
  "global_step": 293,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003421727972626176,
      "eval_loss": 4.355165481567383,
      "eval_runtime": 52.6403,
      "eval_samples_per_second": 18.712,
      "eval_steps_per_second": 1.178,
      "step": 1
    },
    {
      "epoch": 0.03421727972626176,
      "grad_norm": 4.47415018081665,
      "learning_rate": 2e-05,
      "loss": 4.2505,
      "step": 10
    },
    {
      "epoch": 0.06843455945252352,
      "grad_norm": 1.6644771099090576,
      "learning_rate": 4e-05,
      "loss": 3.3993,
      "step": 20
    },
    {
      "epoch": 0.10265183917878529,
      "grad_norm": 1.3752275705337524,
      "learning_rate": 6e-05,
      "loss": 2.8647,
      "step": 30
    },
    {
      "epoch": 0.13686911890504705,
      "grad_norm": 1.3559530973434448,
      "learning_rate": 8e-05,
      "loss": 2.6811,
      "step": 40
    },
    {
      "epoch": 0.1710863986313088,
      "grad_norm": 1.0810866355895996,
      "learning_rate": 0.0001,
      "loss": 2.5526,
      "step": 50
    },
    {
      "epoch": 0.20530367835757057,
      "grad_norm": 1.128179669380188,
      "learning_rate": 0.00012,
      "loss": 2.5003,
      "step": 60
    },
    {
      "epoch": 0.23952095808383234,
      "grad_norm": 1.2589479684829712,
      "learning_rate": 0.00014,
      "loss": 2.421,
      "step": 70
    },
    {
      "epoch": 0.2737382378100941,
      "grad_norm": 1.1778737306594849,
      "learning_rate": 0.00016,
      "loss": 2.3997,
      "step": 80
    },
    {
      "epoch": 0.30795551753635586,
      "grad_norm": 1.136731743812561,
      "learning_rate": 0.00018,
      "loss": 2.3794,
      "step": 90
    },
    {
      "epoch": 0.3421727972626176,
      "grad_norm": 1.1050846576690674,
      "learning_rate": 0.0002,
      "loss": 2.339,
      "step": 100
    },
    {
      "epoch": 0.3763900769888794,
      "grad_norm": 1.1404505968093872,
      "learning_rate": 0.0002,
      "loss": 2.3492,
      "step": 110
    },
    {
      "epoch": 0.41060735671514115,
      "grad_norm": 1.0500866174697876,
      "learning_rate": 0.0002,
      "loss": 2.2997,
      "step": 120
    },
    {
      "epoch": 0.4448246364414029,
      "grad_norm": 1.2117984294891357,
      "learning_rate": 0.0002,
      "loss": 2.325,
      "step": 130
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 1.127273440361023,
      "learning_rate": 0.0002,
      "loss": 2.2696,
      "step": 140
    },
    {
      "epoch": 0.5029940119760479,
      "eval_loss": 2.2770884037017822,
      "eval_runtime": 53.1905,
      "eval_samples_per_second": 18.518,
      "eval_steps_per_second": 1.166,
      "step": 147
    },
    {
      "epoch": 0.5132591958939264,
      "grad_norm": 1.189348578453064,
      "learning_rate": 0.0002,
      "loss": 2.3341,
      "step": 150
    },
    {
      "epoch": 0.5474764756201882,
      "grad_norm": 1.0971434116363525,
      "learning_rate": 0.0002,
      "loss": 2.2627,
      "step": 160
    },
    {
      "epoch": 0.58169375534645,
      "grad_norm": 1.0656123161315918,
      "learning_rate": 0.0002,
      "loss": 2.2739,
      "step": 170
    },
    {
      "epoch": 0.6159110350727117,
      "grad_norm": 1.1617597341537476,
      "learning_rate": 0.0002,
      "loss": 2.2446,
      "step": 180
    },
    {
      "epoch": 0.6501283147989735,
      "grad_norm": 1.1609177589416504,
      "learning_rate": 0.0002,
      "loss": 2.2269,
      "step": 190
    },
    {
      "epoch": 0.6843455945252352,
      "grad_norm": 1.0725802183151245,
      "learning_rate": 0.0002,
      "loss": 2.2471,
      "step": 200
    },
    {
      "epoch": 0.718562874251497,
      "grad_norm": 1.1435920000076294,
      "learning_rate": 0.0002,
      "loss": 2.2892,
      "step": 210
    },
    {
      "epoch": 0.7527801539777588,
      "grad_norm": 1.1242313385009766,
      "learning_rate": 0.0002,
      "loss": 2.2795,
      "step": 220
    },
    {
      "epoch": 0.7869974337040205,
      "grad_norm": 1.0565266609191895,
      "learning_rate": 0.0002,
      "loss": 2.2515,
      "step": 230
    },
    {
      "epoch": 0.8212147134302823,
      "grad_norm": 1.1212342977523804,
      "learning_rate": 0.0002,
      "loss": 2.2805,
      "step": 240
    },
    {
      "epoch": 0.8554319931565441,
      "grad_norm": 1.124230146408081,
      "learning_rate": 0.0002,
      "loss": 2.2378,
      "step": 250
    },
    {
      "epoch": 0.8896492728828058,
      "grad_norm": 1.0999276638031006,
      "learning_rate": 0.0002,
      "loss": 2.222,
      "step": 260
    },
    {
      "epoch": 0.9238665526090676,
      "grad_norm": 1.1538196802139282,
      "learning_rate": 0.0002,
      "loss": 2.2267,
      "step": 270
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 1.2899906635284424,
      "learning_rate": 0.0002,
      "loss": 2.2384,
      "step": 280
    },
    {
      "epoch": 0.9923011120615911,
      "grad_norm": 1.3248995542526245,
      "learning_rate": 0.0002,
      "loss": 2.236,
      "step": 290
    }
  ],
  "logging_steps": 10,
  "max_steps": 293,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 147,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.826112117351383e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}