| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9937888198757764, | |
| "eval_steps": 500, | |
| "global_step": 40, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.024844720496894408, | |
| "grad_norm": 0.22360646249577132, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3155, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.049689440993788817, | |
| "grad_norm": 1.6354713380395525, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4894, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.07453416149068323, | |
| "grad_norm": 21.686343056691, | |
| "learning_rate": 0.0001, | |
| "loss": 5.1274, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.09937888198757763, | |
| "grad_norm": 14.478086289344557, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6788, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.12422360248447205, | |
| "grad_norm": 10.40331474659814, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7381, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.14906832298136646, | |
| "grad_norm": 14.828952914633255, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0551, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 1.0287292990470505, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4616, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.19875776397515527, | |
| "grad_norm": 4.424881730530812, | |
| "learning_rate": 0.0001, | |
| "loss": 0.7389, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.2236024844720497, | |
| "grad_norm": 1.5788138903378481, | |
| "learning_rate": 0.0001, | |
| "loss": 0.5233, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.2484472049689441, | |
| "grad_norm": 1.6155451789191035, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4922, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.2732919254658385, | |
| "grad_norm": 53.16385537396071, | |
| "learning_rate": 0.0001, | |
| "loss": 1.0959, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.2981366459627329, | |
| "grad_norm": 2.0002784882087092, | |
| "learning_rate": 0.0001, | |
| "loss": 0.6879, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.32298136645962733, | |
| "grad_norm": 1.1683199154602835, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4369, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.34782608695652173, | |
| "grad_norm": 1.033205019062668, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3917, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.37267080745341613, | |
| "grad_norm": 0.4412023447127328, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3448, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.39751552795031053, | |
| "grad_norm": 0.9578202521179224, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3459, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.422360248447205, | |
| "grad_norm": 2.289275193987449, | |
| "learning_rate": 0.0001, | |
| "loss": 0.534, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.4472049689440994, | |
| "grad_norm": 0.6666094964330221, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3535, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.4720496894409938, | |
| "grad_norm": 0.7930206978533701, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3496, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.4968944099378882, | |
| "grad_norm": 0.3055608278925833, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3339, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.5217391304347826, | |
| "grad_norm": 0.29784796389686363, | |
| "learning_rate": 0.0001, | |
| "loss": 0.31, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.546583850931677, | |
| "grad_norm": 0.21503207187750886, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2922, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.15247931417135197, | |
| "learning_rate": 0.0001, | |
| "loss": 0.292, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.5962732919254659, | |
| "grad_norm": 0.19200002221931217, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2937, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.6211180124223602, | |
| "grad_norm": 0.12937354448998015, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2701, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.6459627329192547, | |
| "grad_norm": 0.7041267128561529, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2806, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.6708074534161491, | |
| "grad_norm": 0.1703017837650162, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2733, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.6956521739130435, | |
| "grad_norm": 0.1670950080367148, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2691, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.7204968944099379, | |
| "grad_norm": 0.15191029267235273, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2611, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.7453416149068323, | |
| "grad_norm": 0.10352109095733353, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2764, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.7701863354037267, | |
| "grad_norm": 0.11621572352545091, | |
| "learning_rate": 0.0001, | |
| "loss": 0.259, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.7950310559006211, | |
| "grad_norm": 0.1047835528239364, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2507, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.8198757763975155, | |
| "grad_norm": 0.10515270423855971, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2743, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.84472049689441, | |
| "grad_norm": 0.05663550910043551, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2565, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 0.09940447058540708, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2558, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.8944099378881988, | |
| "grad_norm": 0.09817461399434704, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2483, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.9192546583850931, | |
| "grad_norm": 0.06027821970777435, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2516, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.9440993788819876, | |
| "grad_norm": 0.08188481055174467, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2305, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.968944099378882, | |
| "grad_norm": 0.09993015738582685, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2559, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.9937888198757764, | |
| "grad_norm": 0.09361624235588561, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2556, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.9937888198757764, | |
| "step": 40, | |
| "total_flos": 55331389440000.0, | |
| "train_loss": 0.5212820250540972, | |
| "train_runtime": 2252.6119, | |
| "train_samples_per_second": 3.42, | |
| "train_steps_per_second": 0.018 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 40, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 55331389440000.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |