{
  "best_metric": 7.342555999755859,
  "best_model_checkpoint": "/data1/attanasiog/babylm/roberta-tiny-8l-10M/checkpoint-700",
  "epoch": 17.698779704560053,
  "global_step": 850,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.21,
      "learning_rate": 8e-05,
      "loss": 10.2998,
      "step": 10
    },
    {
      "epoch": 0.41,
      "learning_rate": 0.00016,
      "loss": 8.9979,
      "step": 20
    },
    {
      "epoch": 0.62,
      "learning_rate": 0.00024,
      "loss": 7.8015,
      "step": 30
    },
    {
      "epoch": 0.82,
      "learning_rate": 0.00032,
      "loss": 7.3376,
      "step": 40
    },
    {
      "epoch": 1.04,
      "learning_rate": 0.0004,
      "loss": 7.8102,
      "step": 50
    },
    {
      "epoch": 1.04,
      "eval_accuracy": 0.05136765891155645,
      "eval_loss": 7.374657154083252,
      "eval_runtime": 180.8913,
      "eval_samples_per_second": 132.98,
      "eval_steps_per_second": 4.157,
      "step": 50
    },
    {
      "epoch": 1.25,
      "learning_rate": 0.000399995625676045,
      "loss": 7.3419,
      "step": 60
    },
    {
      "epoch": 1.45,
      "learning_rate": 0.0003999825028955268,
      "loss": 8.1652,
      "step": 70
    },
    {
      "epoch": 1.66,
      "learning_rate": 0.0003999606322324786,
      "loss": 8.4182,
      "step": 80
    },
    {
      "epoch": 1.86,
      "learning_rate": 0.0003999300146435939,
      "loss": 7.3249,
      "step": 90
    },
    {
      "epoch": 2.08,
      "learning_rate": 0.00039989065146818525,
      "loss": 7.805,
      "step": 100
    },
    {
      "epoch": 2.08,
      "eval_accuracy": 0.051684268514440884,
      "eval_loss": 7.369903087615967,
      "eval_runtime": 181.0798,
      "eval_samples_per_second": 132.842,
      "eval_steps_per_second": 4.153,
      "step": 100
    },
    {
      "epoch": 2.29,
      "learning_rate": 0.0003998425444281255,
      "loss": 7.3101,
      "step": 110
    },
    {
      "epoch": 2.49,
      "learning_rate": 0.00039978569562777234,
      "loss": 7.3232,
      "step": 120
    },
    {
      "epoch": 2.7,
      "learning_rate": 0.0003997201075538765,
      "loss": 7.3073,
      "step": 130
    },
    {
      "epoch": 2.9,
      "learning_rate": 0.0003996457830754729,
      "loss": 7.3236,
      "step": 140
    },
    {
      "epoch": 3.12,
      "learning_rate": 0.00039956272544375493,
      "loss": 7.7907,
      "step": 150
    },
    {
      "epoch": 3.12,
      "eval_accuracy": 0.05174263561361906,
      "eval_loss": 7.35952091217041,
      "eval_runtime": 180.7769,
      "eval_samples_per_second": 133.065,
      "eval_steps_per_second": 4.16,
      "step": 150
    },
    {
      "epoch": 3.33,
      "learning_rate": 0.00039947093829193245,
      "loss": 7.2981,
      "step": 160
    },
    {
      "epoch": 3.53,
      "learning_rate": 0.00039937042563507283,
      "loss": 7.3259,
      "step": 170
    },
    {
      "epoch": 3.74,
      "learning_rate": 0.00039926119186992537,
      "loss": 7.3352,
      "step": 180
    },
    {
      "epoch": 3.95,
      "learning_rate": 0.0003991432417747288,
      "loss": 7.3069,
      "step": 190
    },
    {
      "epoch": 4.16,
      "learning_rate": 0.0003990165805090023,
      "loss": 7.7838,
      "step": 200
    },
    {
      "epoch": 4.16,
      "eval_accuracy": 0.05138188801155976,
      "eval_loss": 7.361721992492676,
      "eval_runtime": 180.6907,
      "eval_samples_per_second": 133.128,
      "eval_steps_per_second": 4.162,
      "step": 200
    },
    {
      "epoch": 4.37,
      "learning_rate": 0.00039888121361332003,
      "loss": 7.3066,
      "step": 210
    },
    {
      "epoch": 4.58,
      "learning_rate": 0.0003987371470090686,
      "loss": 7.3237,
      "step": 220
    },
    {
      "epoch": 4.78,
      "learning_rate": 0.00039858438699818784,
      "loss": 7.3209,
      "step": 230
    },
    {
      "epoch": 4.99,
      "learning_rate": 0.0003984229402628956,
      "loss": 7.3024,
      "step": 240
    },
    {
      "epoch": 5.21,
      "learning_rate": 0.00039825281386539503,
      "loss": 7.7706,
      "step": 250
    },
    {
      "epoch": 5.21,
      "eval_accuracy": 0.05140231728427503,
      "eval_loss": 7.358623504638672,
      "eval_runtime": 180.7786,
      "eval_samples_per_second": 133.063,
      "eval_steps_per_second": 4.16,
      "step": 250
    },
    {
      "epoch": 5.41,
      "learning_rate": 0.000398074015247566,
      "loss": 7.3135,
      "step": 260
    },
    {
      "epoch": 5.62,
      "learning_rate": 0.0003978865522306392,
      "loss": 7.3003,
      "step": 270
    },
    {
      "epoch": 5.82,
      "learning_rate": 0.0003976904330148543,
      "loss": 7.3159,
      "step": 280
    },
    {
      "epoch": 6.04,
      "learning_rate": 0.00039748566617910113,
      "loss": 7.7967,
      "step": 290
    },
    {
      "epoch": 6.25,
      "learning_rate": 0.0003972722606805445,
      "loss": 7.2933,
      "step": 300
    },
    {
      "epoch": 6.25,
      "eval_accuracy": 0.05126180317018771,
      "eval_loss": 7.356584548950195,
      "eval_runtime": 180.7497,
      "eval_samples_per_second": 133.085,
      "eval_steps_per_second": 4.16,
      "step": 300
    },
    {
      "epoch": 6.45,
      "learning_rate": 0.00039705022585423216,
      "loss": 7.3163,
      "step": 310
    },
    {
      "epoch": 6.66,
      "learning_rate": 0.0003968195714126868,
      "loss": 7.2904,
      "step": 320
    },
    {
      "epoch": 6.86,
      "learning_rate": 0.00039658030744548075,
      "loss": 7.3045,
      "step": 330
    },
    {
      "epoch": 7.08,
      "learning_rate": 0.0003963324444187952,
      "loss": 7.7849,
      "step": 340
    },
    {
      "epoch": 7.29,
      "learning_rate": 0.0003960759931749619,
      "loss": 7.2932,
      "step": 350
    },
    {
      "epoch": 7.29,
      "eval_accuracy": 0.05161072401384023,
      "eval_loss": 7.3526611328125,
      "eval_runtime": 180.6553,
      "eval_samples_per_second": 133.154,
      "eval_steps_per_second": 4.163,
      "step": 350
    },
    {
      "epoch": 7.49,
      "learning_rate": 0.00039581096493198893,
      "loss": 7.3057,
      "step": 360
    },
    {
      "epoch": 7.7,
      "learning_rate": 0.0003955373712830703,
      "loss": 7.3002,
      "step": 370
    },
    {
      "epoch": 7.9,
      "learning_rate": 0.00039525522419607854,
      "loss": 7.3029,
      "step": 380
    },
    {
      "epoch": 8.12,
      "learning_rate": 0.0003949645360130412,
      "loss": 7.7765,
      "step": 390
    },
    {
      "epoch": 8.33,
      "learning_rate": 0.0003946653194496012,
      "loss": 7.2986,
      "step": 400
    },
    {
      "epoch": 8.33,
      "eval_accuracy": 0.051572554180051966,
      "eval_loss": 7.356107234954834,
      "eval_runtime": 180.5938,
      "eval_samples_per_second": 133.199,
      "eval_steps_per_second": 4.164,
      "step": 400
    },
    {
      "epoch": 8.53,
      "learning_rate": 0.00039435758759446025,
      "loss": 7.3093,
      "step": 410
    },
    {
      "epoch": 8.74,
      "learning_rate": 0.00039404135390880664,
      "loss": 7.294,
      "step": 420
    },
    {
      "epoch": 8.95,
      "learning_rate": 0.0003937166322257262,
      "loss": 7.3083,
      "step": 430
    },
    {
      "epoch": 9.16,
      "learning_rate": 0.00039338343674959745,
      "loss": 7.7912,
      "step": 440
    },
    {
      "epoch": 9.37,
      "learning_rate": 0.00039304178205546976,
      "loss": 7.289,
      "step": 450
    },
    {
      "epoch": 9.37,
      "eval_accuracy": 0.05145224079666028,
      "eval_loss": 7.34950590133667,
      "eval_runtime": 180.7201,
      "eval_samples_per_second": 133.106,
      "eval_steps_per_second": 4.161,
      "step": 450
    },
    {
      "epoch": 9.58,
      "learning_rate": 0.00039269168308842634,
      "loss": 7.3004,
      "step": 460
    },
    {
      "epoch": 9.78,
      "learning_rate": 0.00039233315516293006,
      "loss": 7.2938,
      "step": 470
    },
    {
      "epoch": 9.99,
      "learning_rate": 0.00039196621396215403,
      "loss": 7.2897,
      "step": 480
    },
    {
      "epoch": 10.21,
      "learning_rate": 0.000391590875537295,
      "loss": 7.7652,
      "step": 490
    },
    {
      "epoch": 10.41,
      "learning_rate": 0.00039120715630687155,
      "loss": 7.2879,
      "step": 500
    },
    {
      "epoch": 10.41,
      "eval_accuracy": 0.05138556381472711,
      "eval_loss": 7.3455071449279785,
      "eval_runtime": 180.6339,
      "eval_samples_per_second": 133.17,
      "eval_steps_per_second": 4.163,
      "step": 500
    },
    {
      "epoch": 10.62,
      "learning_rate": 0.000390815073056006,
      "loss": 7.2942,
      "step": 510
    },
    {
      "epoch": 10.82,
      "learning_rate": 0.00039041464293568983,
      "loss": 7.306,
      "step": 520
    },
    {
      "epoch": 11.04,
      "learning_rate": 0.00039000588346203374,
      "loss": 7.7754,
      "step": 530
    },
    {
      "epoch": 11.25,
      "learning_rate": 0.0003895888125155014,
      "loss": 7.2912,
      "step": 540
    },
    {
      "epoch": 11.45,
      "learning_rate": 0.00038916344834012695,
      "loss": 7.276,
      "step": 550
    },
    {
      "epoch": 11.45,
      "eval_accuracy": 0.05130612004196204,
      "eval_loss": 7.347738265991211,
      "eval_runtime": 180.7636,
      "eval_samples_per_second": 133.074,
      "eval_steps_per_second": 4.16,
      "step": 550
    },
    {
      "epoch": 11.66,
      "learning_rate": 0.00038872980954271757,
      "loss": 7.3135,
      "step": 560
    },
    {
      "epoch": 11.86,
      "learning_rate": 0.00038828791509203895,
      "loss": 7.2859,
      "step": 570
    },
    {
      "epoch": 12.08,
      "learning_rate": 0.00038783778431798597,
      "loss": 7.7845,
      "step": 580
    },
    {
      "epoch": 12.29,
      "learning_rate": 0.0003873794369107369,
      "loss": 7.2966,
      "step": 590
    },
    {
      "epoch": 12.49,
      "learning_rate": 0.0003869128929198922,
      "loss": 7.3072,
      "step": 600
    },
    {
      "epoch": 12.49,
      "eval_accuracy": 0.051627819878485845,
      "eval_loss": 7.344621658325195,
      "eval_runtime": 180.6519,
      "eval_samples_per_second": 133.157,
      "eval_steps_per_second": 4.163,
      "step": 600
    },
    {
      "epoch": 12.7,
      "learning_rate": 0.0003864381727535973,
      "loss": 7.3026,
      "step": 610
    },
    {
      "epoch": 12.9,
      "learning_rate": 0.00038595529717765027,
      "loss": 7.2966,
      "step": 620
    },
    {
      "epoch": 13.12,
      "learning_rate": 0.0003854642873145931,
      "loss": 7.7848,
      "step": 630
    },
    {
      "epoch": 13.33,
      "learning_rate": 0.00038496516464278776,
      "loss": 7.2964,
      "step": 640
    },
    {
      "epoch": 13.53,
      "learning_rate": 0.00038445795099547697,
      "loss": 7.2978,
      "step": 650
    },
    {
      "epoch": 13.53,
      "eval_accuracy": 0.05143096217098587,
      "eval_loss": 7.346319198608398,
      "eval_runtime": 180.763,
      "eval_samples_per_second": 133.075,
      "eval_steps_per_second": 4.16,
      "step": 650
    },
    {
      "epoch": 13.74,
      "learning_rate": 0.0003839426685598287,
      "loss": 7.2919,
      "step": 660
    },
    {
      "epoch": 13.95,
      "learning_rate": 0.000383419339875966,
      "loss": 7.3006,
      "step": 670
    },
    {
      "epoch": 14.16,
      "learning_rate": 0.00038288798783598087,
      "loss": 7.7738,
      "step": 680
    },
    {
      "epoch": 14.37,
      "learning_rate": 0.0003823486356829329,
      "loss": 7.2839,
      "step": 690
    },
    {
      "epoch": 14.58,
      "learning_rate": 0.0003818013070098325,
      "loss": 7.2857,
      "step": 700
    },
    {
      "epoch": 14.58,
      "eval_accuracy": 0.05146984844436126,
      "eval_loss": 7.342555999755859,
      "eval_runtime": 180.8063,
      "eval_samples_per_second": 133.043,
      "eval_steps_per_second": 4.159,
      "step": 700
    },
    {
      "epoch": 14.78,
      "learning_rate": 0.0003812460257586089,
      "loss": 7.2949,
      "step": 710
    },
    {
      "epoch": 14.99,
      "learning_rate": 0.000380682816219063,
      "loss": 7.3249,
      "step": 720
    },
    {
      "epoch": 15.21,
      "learning_rate": 0.00038011170302780446,
      "loss": 7.7486,
      "step": 730
    },
    {
      "epoch": 15.41,
      "learning_rate": 0.00037953271116717444,
      "loss": 7.2879,
      "step": 740
    },
    {
      "epoch": 15.62,
      "learning_rate": 0.0003789458659641527,
      "loss": 7.2868,
      "step": 750
    },
    {
      "epoch": 15.62,
      "eval_accuracy": 0.05147383671825258,
      "eval_loss": 7.343778610229492,
      "eval_runtime": 180.8254,
      "eval_samples_per_second": 133.029,
      "eval_steps_per_second": 4.159,
      "step": 750
    },
    {
      "epoch": 15.82,
      "learning_rate": 0.0003783511930892495,
      "loss": 7.2986,
      "step": 760
    },
    {
      "epoch": 16.04,
      "learning_rate": 0.00037774871855538275,
      "loss": 7.7788,
      "step": 770
    },
    {
      "epoch": 16.25,
      "learning_rate": 0.00037713846871674045,
      "loss": 7.2858,
      "step": 780
    },
    {
      "epoch": 16.45,
      "learning_rate": 0.0003765204702676274,
      "loss": 7.2937,
      "step": 790
    },
    {
      "epoch": 16.66,
      "learning_rate": 0.0003758947502412978,
      "loss": 7.2973,
      "step": 800
    },
    {
      "epoch": 16.66,
      "eval_accuracy": 0.051658592501666364,
      "eval_loss": 7.344185829162598,
      "eval_runtime": 180.7375,
      "eval_samples_per_second": 133.094,
      "eval_steps_per_second": 4.161,
      "step": 800
    },
    {
      "epoch": 16.86,
      "learning_rate": 0.0003752613360087727,
      "loss": 7.3043,
      "step": 810
    },
    {
      "epoch": 17.08,
      "learning_rate": 0.00037462025527764265,
      "loss": 7.7616,
      "step": 820
    },
    {
      "epoch": 17.29,
      "learning_rate": 0.00037397153609085553,
      "loss": 7.2869,
      "step": 830
    },
    {
      "epoch": 17.49,
      "learning_rate": 0.0003733152068254901,
      "loss": 7.2798,
      "step": 840
    },
    {
      "epoch": 17.7,
      "learning_rate": 0.00037265129619151483,
      "loss": 7.2988,
      "step": 850
    },
    {
      "epoch": 17.7,
      "eval_accuracy": 0.051239018394020945,
      "eval_loss": 7.343734264373779,
      "eval_runtime": 180.5675,
      "eval_samples_per_second": 133.219,
      "eval_steps_per_second": 4.165,
      "step": 850
    },
    {
      "epoch": 17.7,
      "step": 850,
      "total_flos": 1.1524171581514752e+17,
      "train_loss": 7.482659651812385,
      "train_runtime": 11122.8848,
      "train_samples_per_second": 223.953,
      "train_steps_per_second": 0.432
    }
  ],
  "max_steps": 4800,
  "num_train_epochs": 100,
  "total_flos": 1.1524171581514752e+17,
  "trial_name": null,
  "trial_params": null
}