{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01756543123133673, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003513086246267346, "grad_norm": 13.209701538085938, "learning_rate": 2e-05, "loss": 15.7011, "step": 1 }, { "epoch": 0.0003513086246267346, "eval_loss": 3.0710694789886475, "eval_runtime": 35.205, "eval_samples_per_second": 34.058, "eval_steps_per_second": 17.043, "step": 1 }, { "epoch": 0.0007026172492534692, "grad_norm": 7.19278621673584, "learning_rate": 4e-05, "loss": 10.6803, "step": 2 }, { "epoch": 0.0010539258738802037, "grad_norm": 9.121259689331055, "learning_rate": 6e-05, "loss": 12.6325, "step": 3 }, { "epoch": 0.0014052344985069384, "grad_norm": 9.981005668640137, "learning_rate": 8e-05, "loss": 12.9636, "step": 4 }, { "epoch": 0.001756543123133673, "grad_norm": 8.011913299560547, "learning_rate": 0.0001, "loss": 10.0169, "step": 5 }, { "epoch": 0.0021078517477604074, "grad_norm": 9.863301277160645, "learning_rate": 0.00012, "loss": 13.4675, "step": 6 }, { "epoch": 0.002459160372387142, "grad_norm": 11.878751754760742, "learning_rate": 0.00014, "loss": 11.43, "step": 7 }, { "epoch": 0.002810468997013877, "grad_norm": 12.1919584274292, "learning_rate": 0.00016, "loss": 10.2991, "step": 8 }, { "epoch": 0.003161777621640611, "grad_norm": 21.708755493164062, "learning_rate": 0.00018, "loss": 14.5634, "step": 9 }, { "epoch": 0.003513086246267346, "grad_norm": 17.5557918548584, "learning_rate": 0.0002, "loss": 9.602, "step": 10 }, { "epoch": 0.0038643948708940805, "grad_norm": 8.21631145477295, "learning_rate": 0.0001996917333733128, "loss": 8.3579, "step": 11 }, { "epoch": 0.004215703495520815, "grad_norm": 11.151665687561035, "learning_rate": 0.00019876883405951377, "loss": 9.0388, "step": 12 }, { "epoch": 0.0045670121201475495, "grad_norm": 9.590779304504395, "learning_rate": 0.00019723699203976766, "loss": 9.0038, "step": 13 }, { "epoch": 0.0045670121201475495, "eval_loss": 1.7963718175888062, "eval_runtime": 35.2425, "eval_samples_per_second": 34.021, "eval_steps_per_second": 17.025, "step": 13 }, { "epoch": 0.004918320744774284, "grad_norm": 9.898582458496094, "learning_rate": 0.00019510565162951537, "loss": 8.8708, "step": 14 }, { "epoch": 0.005269629369401019, "grad_norm": 8.537947654724121, "learning_rate": 0.0001923879532511287, "loss": 7.3514, "step": 15 }, { "epoch": 0.005620937994027754, "grad_norm": 9.073677062988281, "learning_rate": 0.0001891006524188368, "loss": 7.0456, "step": 16 }, { "epoch": 0.005972246618654488, "grad_norm": 8.133380889892578, "learning_rate": 0.00018526401643540922, "loss": 5.0461, "step": 17 }, { "epoch": 0.006323555243281222, "grad_norm": 6.910238265991211, "learning_rate": 0.00018090169943749476, "loss": 6.6536, "step": 18 }, { "epoch": 0.006674863867907957, "grad_norm": 6.8287739753723145, "learning_rate": 0.0001760405965600031, "loss": 5.3598, "step": 19 }, { "epoch": 0.007026172492534692, "grad_norm": 9.288922309875488, "learning_rate": 0.00017071067811865476, "loss": 8.0084, "step": 20 }, { "epoch": 0.007377481117161426, "grad_norm": 9.96381664276123, "learning_rate": 0.00016494480483301836, "loss": 6.747, "step": 21 }, { "epoch": 0.007728789741788161, "grad_norm": 6.746928691864014, "learning_rate": 0.00015877852522924732, "loss": 5.2333, "step": 22 }, { "epoch": 0.008080098366414896, "grad_norm": 7.395442008972168, "learning_rate": 0.0001522498564715949, "loss": 5.7944, "step": 23 }, { "epoch": 0.00843140699104163, "grad_norm": 7.33457612991333, "learning_rate": 0.00014539904997395468, "loss": 5.6282, "step": 24 }, { "epoch": 0.008782715615668365, "grad_norm": 8.562402725219727, "learning_rate": 0.000138268343236509, "loss": 7.6235, "step": 25 }, { "epoch": 0.009134024240295099, "grad_norm": 12.655267715454102, "learning_rate": 0.00013090169943749476, "loss": 6.9623, "step": 26 }, { "epoch": 0.009134024240295099, "eval_loss": 1.356201171875, "eval_runtime": 35.2994, "eval_samples_per_second": 33.967, "eval_steps_per_second": 16.997, "step": 26 }, { "epoch": 0.009485332864921835, "grad_norm": 8.467972755432129, "learning_rate": 0.00012334453638559057, "loss": 6.425, "step": 27 }, { "epoch": 0.009836641489548568, "grad_norm": 8.868040084838867, "learning_rate": 0.0001156434465040231, "loss": 5.7143, "step": 28 }, { "epoch": 0.010187950114175302, "grad_norm": 10.619840621948242, "learning_rate": 0.0001078459095727845, "loss": 6.4041, "step": 29 }, { "epoch": 0.010539258738802038, "grad_norm": 7.026447296142578, "learning_rate": 0.0001, "loss": 4.5627, "step": 30 }, { "epoch": 0.010890567363428772, "grad_norm": 7.874845027923584, "learning_rate": 9.215409042721552e-05, "loss": 5.0585, "step": 31 }, { "epoch": 0.011241875988055507, "grad_norm": 9.07471752166748, "learning_rate": 8.435655349597689e-05, "loss": 5.7875, "step": 32 }, { "epoch": 0.011593184612682241, "grad_norm": 6.390406131744385, "learning_rate": 7.66554636144095e-05, "loss": 3.3577, "step": 33 }, { "epoch": 0.011944493237308977, "grad_norm": 6.924739837646484, "learning_rate": 6.909830056250527e-05, "loss": 5.4426, "step": 34 }, { "epoch": 0.01229580186193571, "grad_norm": 6.957950592041016, "learning_rate": 6.173165676349103e-05, "loss": 5.5254, "step": 35 }, { "epoch": 0.012647110486562444, "grad_norm": 7.8153228759765625, "learning_rate": 5.4600950026045326e-05, "loss": 5.865, "step": 36 }, { "epoch": 0.01299841911118918, "grad_norm": 7.004741668701172, "learning_rate": 4.7750143528405126e-05, "loss": 5.6366, "step": 37 }, { "epoch": 0.013349727735815914, "grad_norm": 6.407052040100098, "learning_rate": 4.12214747707527e-05, "loss": 4.4729, "step": 38 }, { "epoch": 0.01370103636044265, "grad_norm": 6.333763599395752, "learning_rate": 3.5055195166981645e-05, "loss": 5.7312, "step": 39 }, { "epoch": 0.01370103636044265, "eval_loss": 1.3001857995986938, "eval_runtime": 35.3021, "eval_samples_per_second": 33.964, "eval_steps_per_second": 16.996, "step": 39 }, { "epoch": 0.014052344985069383, "grad_norm": 9.61929988861084, "learning_rate": 2.9289321881345254e-05, "loss": 5.2785, "step": 40 }, { "epoch": 0.014403653609696119, "grad_norm": 13.948427200317383, "learning_rate": 2.3959403439996907e-05, "loss": 5.2793, "step": 41 }, { "epoch": 0.014754962234322853, "grad_norm": 7.098333358764648, "learning_rate": 1.9098300562505266e-05, "loss": 4.5026, "step": 42 }, { "epoch": 0.015106270858949587, "grad_norm": 7.2549662590026855, "learning_rate": 1.4735983564590783e-05, "loss": 5.5954, "step": 43 }, { "epoch": 0.015457579483576322, "grad_norm": 7.662211894989014, "learning_rate": 1.0899347581163221e-05, "loss": 5.1637, "step": 44 }, { "epoch": 0.015808888108203058, "grad_norm": 6.767326354980469, "learning_rate": 7.612046748871327e-06, "loss": 5.5622, "step": 45 }, { "epoch": 0.01616019673282979, "grad_norm": 8.104708671569824, "learning_rate": 4.8943483704846475e-06, "loss": 5.4812, "step": 46 }, { "epoch": 0.016511505357456525, "grad_norm": 6.239536285400391, "learning_rate": 2.7630079602323442e-06, "loss": 4.0382, "step": 47 }, { "epoch": 0.01686281398208326, "grad_norm": 7.268672943115234, "learning_rate": 1.231165940486234e-06, "loss": 5.3905, "step": 48 }, { "epoch": 0.017214122606709993, "grad_norm": 8.06124210357666, "learning_rate": 3.0826662668720364e-07, "loss": 4.5163, "step": 49 }, { "epoch": 0.01756543123133673, "grad_norm": 6.5516676902771, "learning_rate": 0.0, "loss": 6.1173, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4589766888652800.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }