{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9714285714285715, "eval_steps": 500, "global_step": 104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01904761904761905, "grad_norm": 1710495.2036469786, "learning_rate": 0.0, "loss": 1.3566, "memory/device_reserved (GiB)": 126.71, "memory/max_active (GiB)": 124.13, "memory/max_allocated (GiB)": 122.77, "step": 1, "tokens_per_second_per_gpu": 3497.88 }, { "epoch": 0.0380952380952381, "grad_norm": 1558119.299961758, "learning_rate": 8e-07, "loss": 1.3538, "memory/device_reserved (GiB)": 126.73, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 2, "tokens_per_second_per_gpu": 3711.34 }, { "epoch": 0.05714285714285714, "grad_norm": 4186.518498313145, "learning_rate": 1.6e-06, "loss": 1.3529, "memory/device_reserved (GiB)": 126.73, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 3, "tokens_per_second_per_gpu": 3869.09 }, { "epoch": 0.0761904761904762, "grad_norm": 1230.1393406412694, "learning_rate": 2.4e-06, "loss": 1.3622, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.96, "memory/max_allocated (GiB)": 122.82, "step": 4, "tokens_per_second_per_gpu": 3629.53 }, { "epoch": 0.09523809523809523, "grad_norm": 1035.6723923215748, "learning_rate": 3.2e-06, "loss": 1.3487, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 5, "tokens_per_second_per_gpu": 3634.6 }, { "epoch": 0.11428571428571428, "grad_norm": 432.5460621726683, "learning_rate": 4e-06, "loss": 1.3432, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 6, "tokens_per_second_per_gpu": 3776.49 }, { "epoch": 0.13333333333333333, "grad_norm": 1638.0331848931094, "learning_rate": 4.8e-06, "loss": 1.3677, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 7, "tokens_per_second_per_gpu": 3647.88 }, { "epoch": 0.1523809523809524, "grad_norm": 2883.046779503214, "learning_rate": 5.6e-06, "loss": 1.3444, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 8, "tokens_per_second_per_gpu": 3677.14 }, { "epoch": 0.17142857142857143, "grad_norm": 478.1216745871938, "learning_rate": 6.4e-06, "loss": 1.3305, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 9, "tokens_per_second_per_gpu": 3739.63 }, { "epoch": 0.19047619047619047, "grad_norm": 1025.7505155071237, "learning_rate": 7.2e-06, "loss": 1.3362, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 10, "tokens_per_second_per_gpu": 3730.56 }, { "epoch": 0.20952380952380953, "grad_norm": 1209.6274892436668, "learning_rate": 8e-06, "loss": 1.3325, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 11, "tokens_per_second_per_gpu": 3661.21 }, { "epoch": 0.22857142857142856, "grad_norm": 1213.936189837833, "learning_rate": 7.997766254921018e-06, "loss": 1.3575, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 12, "tokens_per_second_per_gpu": 3716.18 }, { "epoch": 0.24761904761904763, "grad_norm": 942.8786617202861, "learning_rate": 7.991067514492613e-06, "loss": 1.3145, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 13, "tokens_per_second_per_gpu": 3587.32 }, { "epoch": 0.26666666666666666, "grad_norm": 3603.68277269405, "learning_rate": 7.979911260354016e-06, "loss": 1.3402, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 14, "tokens_per_second_per_gpu": 3712.99 }, { "epoch": 0.2857142857142857, "grad_norm": 2487.402838754216, "learning_rate": 7.96430995261912e-06, "loss": 1.2956, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 15, "tokens_per_second_per_gpu": 3762.61 }, { "epoch": 0.3047619047619048, "grad_norm": 667.5903281250161, "learning_rate": 7.944281015960114e-06, "loss": 1.2992, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 16, "tokens_per_second_per_gpu": 3358.5 }, { "epoch": 0.3238095238095238, "grad_norm": 167.9027323688511, "learning_rate": 7.919846820146347e-06, "loss": 1.3119, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 17, "tokens_per_second_per_gpu": 3675.6 }, { "epoch": 0.34285714285714286, "grad_norm": 47.46189855084341, "learning_rate": 7.891034655060149e-06, "loss": 1.302, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 18, "tokens_per_second_per_gpu": 3754.58 }, { "epoch": 0.3619047619047619, "grad_norm": 115.37054783431222, "learning_rate": 7.857876700217507e-06, "loss": 1.3066, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 19, "tokens_per_second_per_gpu": 3763.17 }, { "epoch": 0.38095238095238093, "grad_norm": 72.03472195336599, "learning_rate": 7.820409988827649e-06, "loss": 1.2876, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 20, "tokens_per_second_per_gpu": 3750.15 }, { "epoch": 0.4, "grad_norm": 123.88987560365385, "learning_rate": 7.778676366431674e-06, "loss": 1.2854, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 21, "tokens_per_second_per_gpu": 3556.91 }, { "epoch": 0.41904761904761906, "grad_norm": 36.51030416393311, "learning_rate": 7.73272244416641e-06, "loss": 1.2799, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 22, "tokens_per_second_per_gpu": 3627.03 }, { "epoch": 0.4380952380952381, "grad_norm": 37.445205147197846, "learning_rate": 7.682599546705715e-06, "loss": 1.2835, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 23, "tokens_per_second_per_gpu": 3604.91 }, { "epoch": 0.45714285714285713, "grad_norm": 39.93974794828826, "learning_rate": 7.628363654937363e-06, "loss": 1.2947, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 24, "tokens_per_second_per_gpu": 3782.86 }, { "epoch": 0.47619047619047616, "grad_norm": 59.41355630536809, "learning_rate": 7.570075343439524e-06, "loss": 1.2702, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 25, "tokens_per_second_per_gpu": 3694.52 }, { "epoch": 0.49523809523809526, "grad_norm": 34.32373819297229, "learning_rate": 7.507799712826686e-06, "loss": 1.2984, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 26, "tokens_per_second_per_gpu": 3613.01 }, { "epoch": 0.5142857142857142, "grad_norm": 21.68779916764309, "learning_rate": 7.441606317040558e-06, "loss": 1.2827, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 27, "tokens_per_second_per_gpu": 3616.18 }, { "epoch": 0.5333333333333333, "grad_norm": 30.472648556953168, "learning_rate": 7.371569085667188e-06, "loss": 1.2801, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 28, "tokens_per_second_per_gpu": 3754.99 }, { "epoch": 0.5523809523809524, "grad_norm": 19.319274693345776, "learning_rate": 7.297766241367041e-06, "loss": 1.2693, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 29, "tokens_per_second_per_gpu": 3677.68 }, { "epoch": 0.5714285714285714, "grad_norm": 34.31430237097932, "learning_rate": 7.220280212510252e-06, "loss": 1.2581, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 30, "tokens_per_second_per_gpu": 3730.31 }, { "epoch": 0.5904761904761905, "grad_norm": 82.8518096206661, "learning_rate": 7.139197541114644e-06, "loss": 1.2687, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 31, "tokens_per_second_per_gpu": 3650.37 }, { "epoch": 0.6095238095238096, "grad_norm": 36.99675013730897, "learning_rate": 7.0546087861893285e-06, "loss": 1.2809, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 32, "tokens_per_second_per_gpu": 3785.35 }, { "epoch": 0.6285714285714286, "grad_norm": 10.853195813384238, "learning_rate": 6.96660842259183e-06, "loss": 1.253, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 33, "tokens_per_second_per_gpu": 3666.64 }, { "epoch": 0.6476190476190476, "grad_norm": 27.05353511161411, "learning_rate": 6.875294735511717e-06, "loss": 1.2601, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 34, "tokens_per_second_per_gpu": 3808.86 }, { "epoch": 0.6666666666666666, "grad_norm": 11.079685605370564, "learning_rate": 6.780769710698569e-06, "loss": 1.2539, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 35, "tokens_per_second_per_gpu": 3708.96 }, { "epoch": 0.6857142857142857, "grad_norm": 35.34021537624741, "learning_rate": 6.683138920556894e-06, "loss": 1.2362, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 36, "tokens_per_second_per_gpu": 3819.32 }, { "epoch": 0.7047619047619048, "grad_norm": 47.246402607795154, "learning_rate": 6.582511406235209e-06, "loss": 1.2429, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 37, "tokens_per_second_per_gpu": 3762.22 }, { "epoch": 0.7238095238095238, "grad_norm": 35.65219209343969, "learning_rate": 6.4789995558409795e-06, "loss": 1.2535, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 38, "tokens_per_second_per_gpu": 3496.79 }, { "epoch": 0.7428571428571429, "grad_norm": 13.147263166038922, "learning_rate": 6.3727189789174205e-06, "loss": 1.2421, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 39, "tokens_per_second_per_gpu": 3471.55 }, { "epoch": 0.7619047619047619, "grad_norm": 8.92693366901581, "learning_rate": 6.263788377322381e-06, "loss": 1.2587, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 40, "tokens_per_second_per_gpu": 3700.61 }, { "epoch": 0.780952380952381, "grad_norm": 25.621463437533773, "learning_rate": 6.152329412653491e-06, "loss": 1.2535, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 41, "tokens_per_second_per_gpu": 3696.17 }, { "epoch": 0.8, "grad_norm": 21.356947105637357, "learning_rate": 6.038466570367669e-06, "loss": 1.2437, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 42, "tokens_per_second_per_gpu": 3679.52 }, { "epoch": 0.819047619047619, "grad_norm": 21.528748134497796, "learning_rate": 5.922327020746735e-06, "loss": 1.2243, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 43, "tokens_per_second_per_gpu": 3654.06 }, { "epoch": 0.8380952380952381, "grad_norm": 14.734257530424147, "learning_rate": 5.804040476864407e-06, "loss": 1.2326, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 44, "tokens_per_second_per_gpu": 3581.66 }, { "epoch": 0.8571428571428571, "grad_norm": 13.129280834101875, "learning_rate": 5.68373904971334e-06, "loss": 1.2442, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 45, "tokens_per_second_per_gpu": 3788.2 }, { "epoch": 0.8761904761904762, "grad_norm": 14.976302382446457, "learning_rate": 5.561557100653979e-06, "loss": 1.2486, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 46, "tokens_per_second_per_gpu": 3636.88 }, { "epoch": 0.8952380952380953, "grad_norm": 15.967232506668388, "learning_rate": 5.43763109135005e-06, "loss": 1.2338, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 47, "tokens_per_second_per_gpu": 3759.31 }, { "epoch": 0.9142857142857143, "grad_norm": 16.354797247719976, "learning_rate": 5.312099431358276e-06, "loss": 1.2413, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 48, "tokens_per_second_per_gpu": 3663.89 }, { "epoch": 0.9333333333333333, "grad_norm": 6.665663198954394, "learning_rate": 5.185102323542536e-06, "loss": 1.2395, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 49, "tokens_per_second_per_gpu": 3727.2 }, { "epoch": 0.9523809523809523, "grad_norm": 9.1334624753648, "learning_rate": 5.056781607485144e-06, "loss": 1.2268, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 50, "tokens_per_second_per_gpu": 3870.66 }, { "epoch": 0.9714285714285714, "grad_norm": 17.527340590112377, "learning_rate": 4.927280601070113e-06, "loss": 1.2248, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 51, "tokens_per_second_per_gpu": 3582.22 }, { "epoch": 0.9904761904761905, "grad_norm": 19.222165420352905, "learning_rate": 4.796743940415344e-06, "loss": 1.2254, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 52, "tokens_per_second_per_gpu": 3727.73 }, { "epoch": 1.0, "grad_norm": 16.84364160949164, "learning_rate": 4.66531741833252e-06, "loss": 1.242, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.17, "memory/max_allocated (GiB)": 122.81, "step": 53, "tokens_per_second_per_gpu": 3750.91 }, { "epoch": 1.019047619047619, "grad_norm": 25.10526965511846, "learning_rate": 4.533147821495116e-06, "loss": 1.2426, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 54, "tokens_per_second_per_gpu": 3667.97 }, { "epoch": 1.0380952380952382, "grad_norm": 24.822314802816855, "learning_rate": 4.400382766496394e-06, "loss": 1.2394, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 55, "tokens_per_second_per_gpu": 3712.75 }, { "epoch": 1.0571428571428572, "grad_norm": 19.222938204469422, "learning_rate": 4.267170534980487e-06, "loss": 1.2269, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 56, "tokens_per_second_per_gpu": 3874.53 }, { "epoch": 1.0761904761904761, "grad_norm": 14.962813195503772, "learning_rate": 4.133659908030698e-06, "loss": 1.233, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 57, "tokens_per_second_per_gpu": 3626.61 }, { "epoch": 1.0952380952380953, "grad_norm": 23.099619927044888, "learning_rate": 4e-06, "loss": 1.2353, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 58, "tokens_per_second_per_gpu": 3631.82 }, { "epoch": 1.1142857142857143, "grad_norm": 14.683578827379744, "learning_rate": 3.8663400919693026e-06, "loss": 1.2261, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 59, "tokens_per_second_per_gpu": 3778.88 }, { "epoch": 1.1333333333333333, "grad_norm": 1363.244724375689, "learning_rate": 3.7328294650195136e-06, "loss": 1.2448, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 60, "tokens_per_second_per_gpu": 3648.86 }, { "epoch": 1.1523809523809523, "grad_norm": 37.56736283967858, "learning_rate": 3.5996172335036064e-06, "loss": 1.2134, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 61, "tokens_per_second_per_gpu": 3680.45 }, { "epoch": 1.1714285714285715, "grad_norm": 24.14759116678243, "learning_rate": 3.4668521785048856e-06, "loss": 1.2201, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 62, "tokens_per_second_per_gpu": 3742.93 }, { "epoch": 1.1904761904761905, "grad_norm": 20.895518933622306, "learning_rate": 3.3346825816674796e-06, "loss": 1.2248, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 63, "tokens_per_second_per_gpu": 3729.87 }, { "epoch": 1.2095238095238094, "grad_norm": 20.07417789192824, "learning_rate": 3.2032560595846563e-06, "loss": 1.2253, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 64, "tokens_per_second_per_gpu": 3664.34 }, { "epoch": 1.2285714285714286, "grad_norm": 14.61511907498168, "learning_rate": 3.0727193989298864e-06, "loss": 1.241, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 65, "tokens_per_second_per_gpu": 3721.56 }, { "epoch": 1.2476190476190476, "grad_norm": 18.1080641996899, "learning_rate": 2.943218392514856e-06, "loss": 1.2027, "memory/device_reserved (GiB)": 127.34, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 66, "tokens_per_second_per_gpu": 3589.14 }, { "epoch": 1.2666666666666666, "grad_norm": 88.35410261817876, "learning_rate": 2.8148976764574643e-06, "loss": 1.221, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 67, "tokens_per_second_per_gpu": 3718.05 }, { "epoch": 1.2857142857142856, "grad_norm": 23.72041286077318, "learning_rate": 2.6879005686417232e-06, "loss": 1.2172, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 68, "tokens_per_second_per_gpu": 3764.91 }, { "epoch": 1.3047619047619048, "grad_norm": 43.54234028579835, "learning_rate": 2.5623689086499492e-06, "loss": 1.2326, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 69, "tokens_per_second_per_gpu": 3359.73 }, { "epoch": 1.3238095238095238, "grad_norm": 6.104685395227184, "learning_rate": 2.4384428993460207e-06, "loss": 1.2427, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 70, "tokens_per_second_per_gpu": 3681.16 }, { "epoch": 1.342857142857143, "grad_norm": 9.963394838549585, "learning_rate": 2.3162609502866607e-06, "loss": 1.2322, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 71, "tokens_per_second_per_gpu": 3753.07 }, { "epoch": 1.361904761904762, "grad_norm": 43.43949979845249, "learning_rate": 2.195959523135592e-06, "loss": 1.2383, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 72, "tokens_per_second_per_gpu": 3764.97 }, { "epoch": 1.380952380952381, "grad_norm": 14.107017331391786, "learning_rate": 2.077672979253265e-06, "loss": 1.2225, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 73, "tokens_per_second_per_gpu": 3751.34 }, { "epoch": 1.4, "grad_norm": 10.549323906590455, "learning_rate": 1.96153342963233e-06, "loss": 1.2214, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 74, "tokens_per_second_per_gpu": 3559.51 }, { "epoch": 1.4190476190476191, "grad_norm": 18.592940657981064, "learning_rate": 1.8476705873465096e-06, "loss": 1.2171, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 75, "tokens_per_second_per_gpu": 3629.78 }, { "epoch": 1.438095238095238, "grad_norm": 11.120257290964485, "learning_rate": 1.7362116226776187e-06, "loss": 1.2226, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 76, "tokens_per_second_per_gpu": 3603.12 }, { "epoch": 1.457142857142857, "grad_norm": 7.078043688121306, "learning_rate": 1.627281021082579e-06, "loss": 1.2345, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 77, "tokens_per_second_per_gpu": 3780.85 }, { "epoch": 1.4761904761904763, "grad_norm": 5.000285151965608, "learning_rate": 1.521000444159021e-06, "loss": 1.2116, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 78, "tokens_per_second_per_gpu": 3695.41 }, { "epoch": 1.4952380952380953, "grad_norm": 47.84624251792891, "learning_rate": 1.4174885937647903e-06, "loss": 1.2405, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 79, "tokens_per_second_per_gpu": 3605.95 }, { "epoch": 1.5142857142857142, "grad_norm": 12.461343395029726, "learning_rate": 1.316861079443107e-06, "loss": 1.2272, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 80, "tokens_per_second_per_gpu": 3613.63 }, { "epoch": 1.5333333333333332, "grad_norm": 7.656217867750634, "learning_rate": 1.2192302893014308e-06, "loss": 1.2265, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 81, "tokens_per_second_per_gpu": 3752.87 }, { "epoch": 1.5523809523809524, "grad_norm": 15.082668616044355, "learning_rate": 1.1247052644882832e-06, "loss": 1.2183, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 82, "tokens_per_second_per_gpu": 3677.86 }, { "epoch": 1.5714285714285714, "grad_norm": 16.44949015042616, "learning_rate": 1.0333915774081697e-06, "loss": 1.2099, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.83, "memory/max_allocated (GiB)": 122.82, "step": 83, "tokens_per_second_per_gpu": 3729.01 }, { "epoch": 1.5904761904761906, "grad_norm": 12.211227945509856, "learning_rate": 9.453912138106721e-07, "loss": 1.2231, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 84, "tokens_per_second_per_gpu": 3649.49 }, { "epoch": 1.6095238095238096, "grad_norm": 7.074192518964132, "learning_rate": 8.60802458885356e-07, "loss": 1.237, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 85, "tokens_per_second_per_gpu": 3783.87 }, { "epoch": 1.6285714285714286, "grad_norm": 13.131068251165631, "learning_rate": 7.797197874897485e-07, "loss": 1.2116, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 86, "tokens_per_second_per_gpu": 3671.48 }, { "epoch": 1.6476190476190475, "grad_norm": 15.417850715738988, "learning_rate": 7.022337586329596e-07, "loss": 1.2209, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 87, "tokens_per_second_per_gpu": 3805.45 }, { "epoch": 1.6666666666666665, "grad_norm": 24.13403904325753, "learning_rate": 6.28430914332812e-07, "loss": 1.217, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 88, "tokens_per_second_per_gpu": 3706.88 }, { "epoch": 1.6857142857142857, "grad_norm": 13.576166990616798, "learning_rate": 5.583936829594433e-07, "loss": 1.2017, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 89, "tokens_per_second_per_gpu": 3820.7 }, { "epoch": 1.704761904761905, "grad_norm": 8.573005189398867, "learning_rate": 4.92200287173314e-07, "loss": 1.2096, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 90, "tokens_per_second_per_gpu": 3759.45 }, { "epoch": 1.723809523809524, "grad_norm": 5.5800010726124025, "learning_rate": 4.299246565604755e-07, "loss": 1.2218, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 91, "tokens_per_second_per_gpu": 3499.8 }, { "epoch": 1.7428571428571429, "grad_norm": 6.765368030458938, "learning_rate": 3.716363450626372e-07, "loss": 1.2117, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 92, "tokens_per_second_per_gpu": 3468.37 }, { "epoch": 1.7619047619047619, "grad_norm": 7.504548685452772, "learning_rate": 3.174004532942844e-07, "loss": 1.2299, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 93, "tokens_per_second_per_gpu": 3700.98 }, { "epoch": 1.7809523809523808, "grad_norm": 8.649122371866438, "learning_rate": 2.672775558335898e-07, "loss": 1.2265, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 94, "tokens_per_second_per_gpu": 3700.55 }, { "epoch": 1.8, "grad_norm": 11.91832294221251, "learning_rate": 2.2132363356832528e-07, "loss": 1.2185, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 95, "tokens_per_second_per_gpu": 3680.96 }, { "epoch": 1.819047619047619, "grad_norm": 9.186156818821193, "learning_rate": 1.795900111723503e-07, "loss": 1.2008, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 96, "tokens_per_second_per_gpu": 3658.84 }, { "epoch": 1.8380952380952382, "grad_norm": 13.72977541399496, "learning_rate": 1.4212329978249415e-07, "loss": 1.2104, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.99, "memory/max_allocated (GiB)": 122.82, "step": 97, "tokens_per_second_per_gpu": 3581.27 }, { "epoch": 1.8571428571428572, "grad_norm": 6.290457692715211, "learning_rate": 1.0896534493985177e-07, "loss": 1.223, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 98, "tokens_per_second_per_gpu": 3791.37 }, { "epoch": 1.8761904761904762, "grad_norm": 9.702798624165407, "learning_rate": 8.0153179853653e-08, "loss": 1.2285, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 99, "tokens_per_second_per_gpu": 3639.09 }, { "epoch": 1.8952380952380952, "grad_norm": 11.005975725667684, "learning_rate": 5.571898403988573e-08, "loss": 1.2151, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 100, "tokens_per_second_per_gpu": 3757.98 }, { "epoch": 1.9142857142857141, "grad_norm": 8.44842365055977, "learning_rate": 3.569004738087988e-08, "loss": 1.2238, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 101, "tokens_per_second_per_gpu": 3661.88 }, { "epoch": 1.9333333333333333, "grad_norm": 4.816542675360639, "learning_rate": 2.0088739645983455e-08, "loss": 1.2232, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 102, "tokens_per_second_per_gpu": 3730.52 }, { "epoch": 1.9523809523809523, "grad_norm": 11.749396247795026, "learning_rate": 8.932485507387344e-09, "loss": 1.2118, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 103, "tokens_per_second_per_gpu": 3871.77 }, { "epoch": 1.9714285714285715, "grad_norm": 7.9532371124526104, "learning_rate": 2.2337450789815526e-09, "loss": 1.2109, "memory/device_reserved (GiB)": 127.42, "memory/max_active (GiB)": 124.18, "memory/max_allocated (GiB)": 122.82, "step": 104, "tokens_per_second_per_gpu": 3582.44 } ], "logging_steps": 1, "max_steps": 104, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1428859668922368.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }