| { | |
| "best_global_step": 210, | |
| "best_metric": 0.34583956003189087, | |
| "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_wsc_42_1760466772/checkpoint-210", | |
| "epoch": 30.0, | |
| "eval_steps": 42, | |
| "global_step": 840, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 175.3666534423828, | |
| "learning_rate": 3.1746031746031746e-06, | |
| "loss": 11.3214, | |
| "num_input_tokens_seen": 9216, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 145.5986785888672, | |
| "learning_rate": 7.142857142857143e-06, | |
| "loss": 6.8093, | |
| "num_input_tokens_seen": 18112, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.5357142857142857, | |
| "grad_norm": 58.49702453613281, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 2.1384, | |
| "num_input_tokens_seen": 26624, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 57.37192153930664, | |
| "learning_rate": 1.5079365079365079e-05, | |
| "loss": 0.7391, | |
| "num_input_tokens_seen": 35456, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.8928571428571429, | |
| "grad_norm": 20.545907974243164, | |
| "learning_rate": 1.9047619047619046e-05, | |
| "loss": 0.5556, | |
| "num_input_tokens_seen": 44096, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 25.21103286743164, | |
| "learning_rate": 2.3015873015873015e-05, | |
| "loss": 0.5487, | |
| "num_input_tokens_seen": 52192, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 8.62804126739502, | |
| "learning_rate": 2.6984126984126984e-05, | |
| "loss": 0.4509, | |
| "num_input_tokens_seen": 60896, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 23.560665130615234, | |
| "learning_rate": 3.095238095238095e-05, | |
| "loss": 0.4818, | |
| "num_input_tokens_seen": 69920, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "eval_loss": 0.3624400794506073, | |
| "eval_runtime": 1.118, | |
| "eval_samples_per_second": 99.287, | |
| "eval_steps_per_second": 12.523, | |
| "num_input_tokens_seen": 73824, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 1.6071428571428572, | |
| "grad_norm": 13.207117080688477, | |
| "learning_rate": 3.492063492063492e-05, | |
| "loss": 0.4103, | |
| "num_input_tokens_seen": 78560, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 1.7857142857142856, | |
| "grad_norm": 11.927769660949707, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 0.4405, | |
| "num_input_tokens_seen": 87136, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 1.9642857142857144, | |
| "grad_norm": 7.089166641235352, | |
| "learning_rate": 4.2857142857142856e-05, | |
| "loss": 0.408, | |
| "num_input_tokens_seen": 96480, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 2.959946393966675, | |
| "learning_rate": 4.682539682539683e-05, | |
| "loss": 0.3548, | |
| "num_input_tokens_seen": 104248, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 2.3214285714285716, | |
| "grad_norm": 1.298177719116211, | |
| "learning_rate": 5.0793650793650794e-05, | |
| "loss": 0.377, | |
| "num_input_tokens_seen": 113272, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 8.108621597290039, | |
| "learning_rate": 5.4761904761904766e-05, | |
| "loss": 0.3922, | |
| "num_input_tokens_seen": 122168, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 2.678571428571429, | |
| "grad_norm": 2.4724233150482178, | |
| "learning_rate": 5.873015873015873e-05, | |
| "loss": 0.4379, | |
| "num_input_tokens_seen": 131576, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 4.108138084411621, | |
| "learning_rate": 6.26984126984127e-05, | |
| "loss": 0.3847, | |
| "num_input_tokens_seen": 139832, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.3848850131034851, | |
| "eval_runtime": 1.1397, | |
| "eval_samples_per_second": 97.398, | |
| "eval_steps_per_second": 12.284, | |
| "num_input_tokens_seen": 146552, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 3.0357142857142856, | |
| "grad_norm": 0.5216990113258362, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 0.3722, | |
| "num_input_tokens_seen": 147832, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 3.2142857142857144, | |
| "grad_norm": 0.9133898615837097, | |
| "learning_rate": 7.063492063492065e-05, | |
| "loss": 0.4223, | |
| "num_input_tokens_seen": 157176, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 3.392857142857143, | |
| "grad_norm": 4.493597030639648, | |
| "learning_rate": 7.460317460317461e-05, | |
| "loss": 0.388, | |
| "num_input_tokens_seen": 166392, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 3.571428571428571, | |
| "grad_norm": 0.2193046361207962, | |
| "learning_rate": 7.857142857142858e-05, | |
| "loss": 0.3655, | |
| "num_input_tokens_seen": 174904, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 3.4654486179351807, | |
| "learning_rate": 8.253968253968255e-05, | |
| "loss": 0.4285, | |
| "num_input_tokens_seen": 184312, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 3.928571428571429, | |
| "grad_norm": 0.2327776700258255, | |
| "learning_rate": 8.650793650793651e-05, | |
| "loss": 0.3877, | |
| "num_input_tokens_seen": 193464, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 4.107142857142857, | |
| "grad_norm": 0.6309562921524048, | |
| "learning_rate": 9.047619047619048e-05, | |
| "loss": 0.3744, | |
| "num_input_tokens_seen": 201168, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 4.285714285714286, | |
| "grad_norm": 0.7549311518669128, | |
| "learning_rate": 9.444444444444444e-05, | |
| "loss": 0.4027, | |
| "num_input_tokens_seen": 210384, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 4.464285714285714, | |
| "grad_norm": 0.4532535672187805, | |
| "learning_rate": 9.841269841269841e-05, | |
| "loss": 0.3464, | |
| "num_input_tokens_seen": 219536, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "eval_loss": 0.34812429547309875, | |
| "eval_runtime": 1.0994, | |
| "eval_samples_per_second": 100.966, | |
| "eval_steps_per_second": 12.735, | |
| "num_input_tokens_seen": 221264, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 4.642857142857143, | |
| "grad_norm": 0.14779126644134521, | |
| "learning_rate": 9.999564408362054e-05, | |
| "loss": 0.333, | |
| "num_input_tokens_seen": 228432, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 4.821428571428571, | |
| "grad_norm": 0.35166576504707336, | |
| "learning_rate": 9.996902734308346e-05, | |
| "loss": 0.3557, | |
| "num_input_tokens_seen": 238032, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.6182653307914734, | |
| "learning_rate": 9.991822668185927e-05, | |
| "loss": 0.3749, | |
| "num_input_tokens_seen": 245376, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 5.178571428571429, | |
| "grad_norm": 0.5397220253944397, | |
| "learning_rate": 9.984326668636131e-05, | |
| "loss": 0.3563, | |
| "num_input_tokens_seen": 253952, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 5.357142857142857, | |
| "grad_norm": 0.11434465646743774, | |
| "learning_rate": 9.974418363559444e-05, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 263296, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 5.535714285714286, | |
| "grad_norm": 0.9336232542991638, | |
| "learning_rate": 9.96210254835968e-05, | |
| "loss": 0.3606, | |
| "num_input_tokens_seen": 271808, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 5.714285714285714, | |
| "grad_norm": 0.10893812775611877, | |
| "learning_rate": 9.947385183623098e-05, | |
| "loss": 0.3507, | |
| "num_input_tokens_seen": 280704, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 5.892857142857143, | |
| "grad_norm": 0.15622828900814056, | |
| "learning_rate": 9.930273392233624e-05, | |
| "loss": 0.3578, | |
| "num_input_tokens_seen": 289600, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.37425312399864197, | |
| "eval_runtime": 1.2693, | |
| "eval_samples_per_second": 87.447, | |
| "eval_steps_per_second": 11.029, | |
| "num_input_tokens_seen": 294256, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 6.071428571428571, | |
| "grad_norm": 0.267703652381897, | |
| "learning_rate": 9.910775455925518e-05, | |
| "loss": 0.3546, | |
| "num_input_tokens_seen": 297520, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 0.12959004938602448, | |
| "learning_rate": 9.888900811275204e-05, | |
| "loss": 0.3507, | |
| "num_input_tokens_seen": 306672, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 6.428571428571429, | |
| "grad_norm": 0.25042709708213806, | |
| "learning_rate": 9.864660045134165e-05, | |
| "loss": 0.3929, | |
| "num_input_tokens_seen": 315824, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 6.607142857142857, | |
| "grad_norm": 0.1262078881263733, | |
| "learning_rate": 9.838064889505141e-05, | |
| "loss": 0.35, | |
| "num_input_tokens_seen": 324464, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 6.785714285714286, | |
| "grad_norm": 0.2343859225511551, | |
| "learning_rate": 9.809128215864097e-05, | |
| "loss": 0.36, | |
| "num_input_tokens_seen": 332976, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 6.964285714285714, | |
| "grad_norm": 0.2336055040359497, | |
| "learning_rate": 9.777864028930705e-05, | |
| "loss": 0.3521, | |
| "num_input_tokens_seen": 341552, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 7.142857142857143, | |
| "grad_norm": 0.34513550996780396, | |
| "learning_rate": 9.744287459890368e-05, | |
| "loss": 0.352, | |
| "num_input_tokens_seen": 349584, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 7.321428571428571, | |
| "grad_norm": 0.5254360437393188, | |
| "learning_rate": 9.708414759071059e-05, | |
| "loss": 0.3669, | |
| "num_input_tokens_seen": 358672, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.04271647334098816, | |
| "learning_rate": 9.670263288078502e-05, | |
| "loss": 0.3492, | |
| "num_input_tokens_seen": 368144, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "eval_loss": 0.34583956003189087, | |
| "eval_runtime": 1.1811, | |
| "eval_samples_per_second": 93.983, | |
| "eval_steps_per_second": 11.854, | |
| "num_input_tokens_seen": 368144, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 7.678571428571429, | |
| "grad_norm": 0.2870471179485321, | |
| "learning_rate": 9.629851511393555e-05, | |
| "loss": 0.3575, | |
| "num_input_tokens_seen": 376464, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 7.857142857142857, | |
| "grad_norm": 0.16233104467391968, | |
| "learning_rate": 9.587198987435782e-05, | |
| "loss": 0.347, | |
| "num_input_tokens_seen": 385616, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 8.035714285714286, | |
| "grad_norm": 0.24467554688453674, | |
| "learning_rate": 9.542326359097619e-05, | |
| "loss": 0.3491, | |
| "num_input_tokens_seen": 393384, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 8.214285714285714, | |
| "grad_norm": 0.2640959918498993, | |
| "learning_rate": 9.495255343753657e-05, | |
| "loss": 0.3562, | |
| "num_input_tokens_seen": 402280, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 8.392857142857142, | |
| "grad_norm": 0.18736229836940765, | |
| "learning_rate": 9.446008722749905e-05, | |
| "loss": 0.3491, | |
| "num_input_tokens_seen": 410280, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 8.571428571428571, | |
| "grad_norm": 0.039574023336172104, | |
| "learning_rate": 9.394610330378124e-05, | |
| "loss": 0.3404, | |
| "num_input_tokens_seen": 418856, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 0.12487329542636871, | |
| "learning_rate": 9.341085042340532e-05, | |
| "loss": 0.3528, | |
| "num_input_tokens_seen": 427816, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 8.928571428571429, | |
| "grad_norm": 0.08902257680892944, | |
| "learning_rate": 9.285458763710524e-05, | |
| "loss": 0.3542, | |
| "num_input_tokens_seen": 437352, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.34945669770240784, | |
| "eval_runtime": 1.1316, | |
| "eval_samples_per_second": 98.092, | |
| "eval_steps_per_second": 12.372, | |
| "num_input_tokens_seen": 439768, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 9.107142857142858, | |
| "grad_norm": 0.12130332738161087, | |
| "learning_rate": 9.227758416395169e-05, | |
| "loss": 0.3435, | |
| "num_input_tokens_seen": 444504, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 9.285714285714286, | |
| "grad_norm": 0.14609932899475098, | |
| "learning_rate": 9.168011926105598e-05, | |
| "loss": 0.3565, | |
| "num_input_tokens_seen": 453144, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 9.464285714285714, | |
| "grad_norm": 0.025746047496795654, | |
| "learning_rate": 9.106248208841569e-05, | |
| "loss": 0.3499, | |
| "num_input_tokens_seen": 462040, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 9.642857142857142, | |
| "grad_norm": 0.10566066205501556, | |
| "learning_rate": 9.042497156896748e-05, | |
| "loss": 0.349, | |
| "num_input_tokens_seen": 471576, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 9.821428571428571, | |
| "grad_norm": 0.2756125032901764, | |
| "learning_rate": 8.976789624391498e-05, | |
| "loss": 0.3532, | |
| "num_input_tokens_seen": 480280, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.05616496875882149, | |
| "learning_rate": 8.90915741234015e-05, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 489096, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 10.178571428571429, | |
| "grad_norm": 0.23147229850292206, | |
| "learning_rate": 8.839633253260006e-05, | |
| "loss": 0.3475, | |
| "num_input_tokens_seen": 498952, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 10.357142857142858, | |
| "grad_norm": 0.17449620366096497, | |
| "learning_rate": 8.768250795329518e-05, | |
| "loss": 0.34, | |
| "num_input_tokens_seen": 507976, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 10.5, | |
| "eval_loss": 0.35035043954849243, | |
| "eval_runtime": 1.1011, | |
| "eval_samples_per_second": 100.804, | |
| "eval_steps_per_second": 12.714, | |
| "num_input_tokens_seen": 514888, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 10.535714285714286, | |
| "grad_norm": 0.09387495368719101, | |
| "learning_rate": 8.695044586103296e-05, | |
| "loss": 0.355, | |
| "num_input_tokens_seen": 516744, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 10.714285714285714, | |
| "grad_norm": 0.269589364528656, | |
| "learning_rate": 8.620050055791851e-05, | |
| "loss": 0.3472, | |
| "num_input_tokens_seen": 525960, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 10.892857142857142, | |
| "grad_norm": 0.045772165060043335, | |
| "learning_rate": 8.543303500114141e-05, | |
| "loss": 0.3496, | |
| "num_input_tokens_seen": 534344, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 11.071428571428571, | |
| "grad_norm": 0.23182272911071777, | |
| "learning_rate": 8.464842062731235e-05, | |
| "loss": 0.3481, | |
| "num_input_tokens_seen": 541856, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 11.25, | |
| "grad_norm": 0.34456300735473633, | |
| "learning_rate": 8.384703717269584e-05, | |
| "loss": 0.3448, | |
| "num_input_tokens_seen": 550176, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 11.428571428571429, | |
| "grad_norm": 0.04894077032804489, | |
| "learning_rate": 8.302927248942627e-05, | |
| "loss": 0.3405, | |
| "num_input_tokens_seen": 558368, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 11.607142857142858, | |
| "grad_norm": 0.33567559719085693, | |
| "learning_rate": 8.219552235779578e-05, | |
| "loss": 0.3466, | |
| "num_input_tokens_seen": 567392, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 11.785714285714286, | |
| "grad_norm": 0.2454521507024765, | |
| "learning_rate": 8.134619029470534e-05, | |
| "loss": 0.3594, | |
| "num_input_tokens_seen": 576864, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 11.964285714285714, | |
| "grad_norm": 0.41574931144714355, | |
| "learning_rate": 8.048168735837121e-05, | |
| "loss": 0.3578, | |
| "num_input_tokens_seen": 585376, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.3650008738040924, | |
| "eval_runtime": 1.1362, | |
| "eval_samples_per_second": 97.697, | |
| "eval_steps_per_second": 12.322, | |
| "num_input_tokens_seen": 586448, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 12.142857142857142, | |
| "grad_norm": 0.13403870165348053, | |
| "learning_rate": 7.960243194938192e-05, | |
| "loss": 0.3703, | |
| "num_input_tokens_seen": 593296, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 12.321428571428571, | |
| "grad_norm": 0.03592513129115105, | |
| "learning_rate": 7.87088496082013e-05, | |
| "loss": 0.3383, | |
| "num_input_tokens_seen": 602000, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 0.30938151478767395, | |
| "learning_rate": 7.780137280921636e-05, | |
| "loss": 0.3849, | |
| "num_input_tokens_seen": 610960, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 12.678571428571429, | |
| "grad_norm": 0.047836095094680786, | |
| "learning_rate": 7.688044075142887e-05, | |
| "loss": 0.3439, | |
| "num_input_tokens_seen": 619984, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 12.857142857142858, | |
| "grad_norm": 0.08763613551855087, | |
| "learning_rate": 7.594649914589287e-05, | |
| "loss": 0.3448, | |
| "num_input_tokens_seen": 629776, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 13.035714285714286, | |
| "grad_norm": 0.0415462851524353, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.348, | |
| "num_input_tokens_seen": 637888, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 13.214285714285714, | |
| "grad_norm": 0.12538637220859528, | |
| "learning_rate": 7.404140139871797e-05, | |
| "loss": 0.345, | |
| "num_input_tokens_seen": 648128, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 13.392857142857142, | |
| "grad_norm": 0.17965561151504517, | |
| "learning_rate": 7.307116728288727e-05, | |
| "loss": 0.3462, | |
| "num_input_tokens_seen": 656768, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 13.5, | |
| "eval_loss": 0.3538319170475006, | |
| "eval_runtime": 1.1141, | |
| "eval_samples_per_second": 99.628, | |
| "eval_steps_per_second": 12.566, | |
| "num_input_tokens_seen": 662016, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 13.571428571428571, | |
| "grad_norm": 0.22999395430088043, | |
| "learning_rate": 7.208976722468392e-05, | |
| "loss": 0.3606, | |
| "num_input_tokens_seen": 665472, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 13.75, | |
| "grad_norm": 0.0893191397190094, | |
| "learning_rate": 7.109767620035689e-05, | |
| "loss": 0.3501, | |
| "num_input_tokens_seen": 673664, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 13.928571428571429, | |
| "grad_norm": 0.09730294346809387, | |
| "learning_rate": 7.00953743603498e-05, | |
| "loss": 0.3461, | |
| "num_input_tokens_seen": 682688, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 14.107142857142858, | |
| "grad_norm": 0.05350238084793091, | |
| "learning_rate": 6.908334679691863e-05, | |
| "loss": 0.3522, | |
| "num_input_tokens_seen": 690936, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 14.285714285714286, | |
| "grad_norm": 0.04386021941900253, | |
| "learning_rate": 6.806208330935766e-05, | |
| "loss": 0.3553, | |
| "num_input_tokens_seen": 700536, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 14.464285714285714, | |
| "grad_norm": 0.0879250317811966, | |
| "learning_rate": 6.703207816694719e-05, | |
| "loss": 0.3382, | |
| "num_input_tokens_seen": 709560, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 14.642857142857142, | |
| "grad_norm": 0.31405237317085266, | |
| "learning_rate": 6.599382986973808e-05, | |
| "loss": 0.348, | |
| "num_input_tokens_seen": 718584, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 14.821428571428571, | |
| "grad_norm": 0.3567558228969574, | |
| "learning_rate": 6.494784090728852e-05, | |
| "loss": 0.3544, | |
| "num_input_tokens_seen": 727416, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 0.061139896512031555, | |
| "learning_rate": 6.389461751547008e-05, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 735680, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.35569673776626587, | |
| "eval_runtime": 1.1923, | |
| "eval_samples_per_second": 93.099, | |
| "eval_steps_per_second": 11.742, | |
| "num_input_tokens_seen": 735680, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 15.178571428571429, | |
| "grad_norm": 0.19610817730426788, | |
| "learning_rate": 6.283466943146053e-05, | |
| "loss": 0.3486, | |
| "num_input_tokens_seen": 744832, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 15.357142857142858, | |
| "grad_norm": 0.05017664283514023, | |
| "learning_rate": 6.176850964704213e-05, | |
| "loss": 0.3573, | |
| "num_input_tokens_seen": 753728, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 15.535714285714286, | |
| "grad_norm": 0.047355543822050095, | |
| "learning_rate": 6.069665416032487e-05, | |
| "loss": 0.3564, | |
| "num_input_tokens_seen": 762752, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 15.714285714285714, | |
| "grad_norm": 0.16390874981880188, | |
| "learning_rate": 5.961962172601458e-05, | |
| "loss": 0.3461, | |
| "num_input_tokens_seen": 771200, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 15.892857142857142, | |
| "grad_norm": 0.19481535255908966, | |
| "learning_rate": 5.853793360434687e-05, | |
| "loss": 0.3564, | |
| "num_input_tokens_seen": 779776, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 16.071428571428573, | |
| "grad_norm": 0.04299188032746315, | |
| "learning_rate": 5.745211330880872e-05, | |
| "loss": 0.3214, | |
| "num_input_tokens_seen": 788984, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 16.25, | |
| "grad_norm": 0.07021050900220871, | |
| "learning_rate": 5.636268635276918e-05, | |
| "loss": 0.355, | |
| "num_input_tokens_seen": 798456, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 16.428571428571427, | |
| "grad_norm": 0.2501037120819092, | |
| "learning_rate": 5.527017999514239e-05, | |
| "loss": 0.3489, | |
| "num_input_tokens_seen": 807352, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 16.5, | |
| "eval_loss": 0.35193607211112976, | |
| "eval_runtime": 1.1402, | |
| "eval_samples_per_second": 97.356, | |
| "eval_steps_per_second": 12.279, | |
| "num_input_tokens_seen": 810232, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 16.607142857142858, | |
| "grad_norm": 0.08860552310943604, | |
| "learning_rate": 5.417512298520585e-05, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 814712, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 16.785714285714285, | |
| "grad_norm": 0.08465074002742767, | |
| "learning_rate": 5.307804530669716e-05, | |
| "loss": 0.346, | |
| "num_input_tokens_seen": 823608, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 16.964285714285715, | |
| "grad_norm": 0.14993856847286224, | |
| "learning_rate": 5.197947792131348e-05, | |
| "loss": 0.3467, | |
| "num_input_tokens_seen": 832888, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 17.142857142857142, | |
| "grad_norm": 0.17248603701591492, | |
| "learning_rate": 5.0879952511737696e-05, | |
| "loss": 0.3452, | |
| "num_input_tokens_seen": 840464, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 17.321428571428573, | |
| "grad_norm": 0.10642991960048676, | |
| "learning_rate": 4.97800012243155e-05, | |
| "loss": 0.3402, | |
| "num_input_tokens_seen": 849488, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 17.5, | |
| "grad_norm": 0.07554838806390762, | |
| "learning_rate": 4.86801564115082e-05, | |
| "loss": 0.3428, | |
| "num_input_tokens_seen": 858448, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 17.678571428571427, | |
| "grad_norm": 0.1515018343925476, | |
| "learning_rate": 4.758095037424567e-05, | |
| "loss": 0.352, | |
| "num_input_tokens_seen": 867280, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 17.857142857142858, | |
| "grad_norm": 0.2787249684333801, | |
| "learning_rate": 4.648291510430438e-05, | |
| "loss": 0.3528, | |
| "num_input_tokens_seen": 876880, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.3557737469673157, | |
| "eval_runtime": 1.1986, | |
| "eval_samples_per_second": 92.609, | |
| "eval_steps_per_second": 11.68, | |
| "num_input_tokens_seen": 882920, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 18.035714285714285, | |
| "grad_norm": 0.21712534129619598, | |
| "learning_rate": 4.5386582026834906e-05, | |
| "loss": 0.3418, | |
| "num_input_tokens_seen": 885480, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 18.214285714285715, | |
| "grad_norm": 0.05514955148100853, | |
| "learning_rate": 4.4292481743163755e-05, | |
| "loss": 0.3441, | |
| "num_input_tokens_seen": 893864, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 18.392857142857142, | |
| "grad_norm": 0.06789080053567886, | |
| "learning_rate": 4.3201143773993865e-05, | |
| "loss": 0.3524, | |
| "num_input_tokens_seen": 901928, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 18.571428571428573, | |
| "grad_norm": 0.11630789190530777, | |
| "learning_rate": 4.2113096303128125e-05, | |
| "loss": 0.3495, | |
| "num_input_tokens_seen": 910696, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 18.75, | |
| "grad_norm": 0.13940396904945374, | |
| "learning_rate": 4.102886592183996e-05, | |
| "loss": 0.3378, | |
| "num_input_tokens_seen": 920104, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 18.928571428571427, | |
| "grad_norm": 0.051015470176935196, | |
| "learning_rate": 3.9948977374014544e-05, | |
| "loss": 0.3511, | |
| "num_input_tokens_seen": 928936, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 19.107142857142858, | |
| "grad_norm": 0.09734626859426498, | |
| "learning_rate": 3.887395330218429e-05, | |
| "loss": 0.3465, | |
| "num_input_tokens_seen": 937840, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 19.285714285714285, | |
| "grad_norm": 0.11513973772525787, | |
| "learning_rate": 3.780431399458114e-05, | |
| "loss": 0.3478, | |
| "num_input_tokens_seen": 947248, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 19.464285714285715, | |
| "grad_norm": 0.11863286793231964, | |
| "learning_rate": 3.6740577133328524e-05, | |
| "loss": 0.3408, | |
| "num_input_tokens_seen": 955568, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 19.5, | |
| "eval_loss": 0.3516767919063568, | |
| "eval_runtime": 1.1222, | |
| "eval_samples_per_second": 98.916, | |
| "eval_steps_per_second": 12.476, | |
| "num_input_tokens_seen": 957488, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 19.642857142857142, | |
| "grad_norm": 0.04675190523266792, | |
| "learning_rate": 3.568325754389438e-05, | |
| "loss": 0.3459, | |
| "num_input_tokens_seen": 964400, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 19.821428571428573, | |
| "grad_norm": 0.13177751004695892, | |
| "learning_rate": 3.4632866945926855e-05, | |
| "loss": 0.3473, | |
| "num_input_tokens_seen": 972720, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.05624629184603691, | |
| "learning_rate": 3.3589913705593235e-05, | |
| "loss": 0.3508, | |
| "num_input_tokens_seen": 980760, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 20.178571428571427, | |
| "grad_norm": 0.04487062245607376, | |
| "learning_rate": 3.255490258954167e-05, | |
| "loss": 0.3414, | |
| "num_input_tokens_seen": 989464, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 20.357142857142858, | |
| "grad_norm": 0.3037759065628052, | |
| "learning_rate": 3.152833452060522e-05, | |
| "loss": 0.3518, | |
| "num_input_tokens_seen": 998680, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 20.535714285714285, | |
| "grad_norm": 0.34507808089256287, | |
| "learning_rate": 3.0510706335366035e-05, | |
| "loss": 0.3479, | |
| "num_input_tokens_seen": 1007768, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 20.714285714285715, | |
| "grad_norm": 0.32024624943733215, | |
| "learning_rate": 2.9502510543697325e-05, | |
| "loss": 0.3549, | |
| "num_input_tokens_seen": 1016536, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 20.892857142857142, | |
| "grad_norm": 0.0999528020620346, | |
| "learning_rate": 2.850423509039928e-05, | |
| "loss": 0.3469, | |
| "num_input_tokens_seen": 1025048, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 0.35418227314949036, | |
| "eval_runtime": 1.282, | |
| "eval_samples_per_second": 86.583, | |
| "eval_steps_per_second": 10.92, | |
| "num_input_tokens_seen": 1029792, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 21.071428571428573, | |
| "grad_norm": 0.10501652210950851, | |
| "learning_rate": 2.751636311904444e-05, | |
| "loss": 0.343, | |
| "num_input_tokens_seen": 1032864, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 21.25, | |
| "grad_norm": 0.04894215986132622, | |
| "learning_rate": 2.6539372738146695e-05, | |
| "loss": 0.3426, | |
| "num_input_tokens_seen": 1041248, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 21.428571428571427, | |
| "grad_norm": 0.11162778735160828, | |
| "learning_rate": 2.5573736789767232e-05, | |
| "loss": 0.3358, | |
| "num_input_tokens_seen": 1050720, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 21.607142857142858, | |
| "grad_norm": 0.11263741552829742, | |
| "learning_rate": 2.4619922620669218e-05, | |
| "loss": 0.3549, | |
| "num_input_tokens_seen": 1060064, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 21.785714285714285, | |
| "grad_norm": 0.2273244857788086, | |
| "learning_rate": 2.3678391856132204e-05, | |
| "loss": 0.3524, | |
| "num_input_tokens_seen": 1068256, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 21.964285714285715, | |
| "grad_norm": 0.23367320001125336, | |
| "learning_rate": 2.2749600176535534e-05, | |
| "loss": 0.3474, | |
| "num_input_tokens_seen": 1077024, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 22.142857142857142, | |
| "grad_norm": 0.17969422042369843, | |
| "learning_rate": 2.1833997096818898e-05, | |
| "loss": 0.3425, | |
| "num_input_tokens_seen": 1086328, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 22.321428571428573, | |
| "grad_norm": 0.20163026452064514, | |
| "learning_rate": 2.0932025748927013e-05, | |
| "loss": 0.3438, | |
| "num_input_tokens_seen": 1094520, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 22.5, | |
| "grad_norm": 0.23486794531345367, | |
| "learning_rate": 2.0044122667343297e-05, | |
| "loss": 0.3488, | |
| "num_input_tokens_seen": 1103160, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 22.5, | |
| "eval_loss": 0.35537073016166687, | |
| "eval_runtime": 1.1326, | |
| "eval_samples_per_second": 98.004, | |
| "eval_steps_per_second": 12.361, | |
| "num_input_tokens_seen": 1103160, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 22.678571428571427, | |
| "grad_norm": 0.04255451634526253, | |
| "learning_rate": 1.917071757781679e-05, | |
| "loss": 0.3465, | |
| "num_input_tokens_seen": 1112376, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 22.857142857142858, | |
| "grad_norm": 0.07782665640115738, | |
| "learning_rate": 1.831223318938419e-05, | |
| "loss": 0.3449, | |
| "num_input_tokens_seen": 1121464, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 23.035714285714285, | |
| "grad_norm": 0.11463826894760132, | |
| "learning_rate": 1.746908498978791e-05, | |
| "loss": 0.3526, | |
| "num_input_tokens_seen": 1130152, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 23.214285714285715, | |
| "grad_norm": 0.07137856632471085, | |
| "learning_rate": 1.6641681044389014e-05, | |
| "loss": 0.3495, | |
| "num_input_tokens_seen": 1138664, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 23.392857142857142, | |
| "grad_norm": 0.08257535845041275, | |
| "learning_rate": 1.5830421798672568e-05, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 1146216, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 23.571428571428573, | |
| "grad_norm": 0.0766848772764206, | |
| "learning_rate": 1.5035699884440697e-05, | |
| "loss": 0.3509, | |
| "num_input_tokens_seen": 1155496, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 23.75, | |
| "grad_norm": 0.08535895496606827, | |
| "learning_rate": 1.4257899929787294e-05, | |
| "loss": 0.3515, | |
| "num_input_tokens_seen": 1164840, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 23.928571428571427, | |
| "grad_norm": 0.09435199946165085, | |
| "learning_rate": 1.3497398372946501e-05, | |
| "loss": 0.3402, | |
| "num_input_tokens_seen": 1173736, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.35293668508529663, | |
| "eval_runtime": 1.2061, | |
| "eval_samples_per_second": 92.034, | |
| "eval_steps_per_second": 11.608, | |
| "num_input_tokens_seen": 1176968, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 24.107142857142858, | |
| "grad_norm": 0.05331406742334366, | |
| "learning_rate": 1.2754563280104714e-05, | |
| "loss": 0.3404, | |
| "num_input_tokens_seen": 1182344, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 24.285714285714285, | |
| "grad_norm": 0.14441683888435364, | |
| "learning_rate": 1.202975416726464e-05, | |
| "loss": 0.357, | |
| "num_input_tokens_seen": 1191944, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 24.464285714285715, | |
| "grad_norm": 0.06084274500608444, | |
| "learning_rate": 1.1323321826247346e-05, | |
| "loss": 0.3355, | |
| "num_input_tokens_seen": 1200520, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 24.642857142857142, | |
| "grad_norm": 0.13699252903461456, | |
| "learning_rate": 1.0635608154916648e-05, | |
| "loss": 0.3433, | |
| "num_input_tokens_seen": 1209288, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 24.821428571428573, | |
| "grad_norm": 0.14677266776561737, | |
| "learning_rate": 9.966945991708005e-06, | |
| "loss": 0.3554, | |
| "num_input_tokens_seen": 1217544, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 0.23254753649234772, | |
| "learning_rate": 9.317658954541992e-06, | |
| "loss": 0.3439, | |
| "num_input_tokens_seen": 1224848, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 25.178571428571427, | |
| "grad_norm": 0.1202574372291565, | |
| "learning_rate": 8.688061284200266e-06, | |
| "loss": 0.3371, | |
| "num_input_tokens_seen": 1234064, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 25.357142857142858, | |
| "grad_norm": 0.055635951459407806, | |
| "learning_rate": 8.07845769223981e-06, | |
| "loss": 0.3422, | |
| "num_input_tokens_seen": 1242448, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 25.5, | |
| "eval_loss": 0.35582345724105835, | |
| "eval_runtime": 1.0722, | |
| "eval_samples_per_second": 103.524, | |
| "eval_steps_per_second": 13.057, | |
| "num_input_tokens_seen": 1250064, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 25.535714285714285, | |
| "grad_norm": 0.29359593987464905, | |
| "learning_rate": 7.489143213519301e-06, | |
| "loss": 0.352, | |
| "num_input_tokens_seen": 1251536, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 25.714285714285715, | |
| "grad_norm": 0.08035387843847275, | |
| "learning_rate": 6.920403063408526e-06, | |
| "loss": 0.3426, | |
| "num_input_tokens_seen": 1259792, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 25.892857142857142, | |
| "grad_norm": 0.21466954052448273, | |
| "learning_rate": 6.372512499750471e-06, | |
| "loss": 0.3496, | |
| "num_input_tokens_seen": 1268560, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 26.071428571428573, | |
| "grad_norm": 0.15136410295963287, | |
| "learning_rate": 5.845736689642472e-06, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 1276648, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 26.25, | |
| "grad_norm": 0.13561971485614777, | |
| "learning_rate": 5.3403305811010885e-06, | |
| "loss": 0.3406, | |
| "num_input_tokens_seen": 1287144, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 26.428571428571427, | |
| "grad_norm": 0.07142584770917892, | |
| "learning_rate": 4.8565387796728865e-06, | |
| "loss": 0.3451, | |
| "num_input_tokens_seen": 1294504, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 26.607142857142858, | |
| "grad_norm": 0.2093130499124527, | |
| "learning_rate": 4.394595430050613e-06, | |
| "loss": 0.3485, | |
| "num_input_tokens_seen": 1303336, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 26.785714285714285, | |
| "grad_norm": 0.07084821909666061, | |
| "learning_rate": 3.954724102752316e-06, | |
| "loss": 0.3451, | |
| "num_input_tokens_seen": 1312296, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 26.964285714285715, | |
| "grad_norm": 0.1166943907737732, | |
| "learning_rate": 3.537137685918074e-06, | |
| "loss": 0.3474, | |
| "num_input_tokens_seen": 1319912, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 0.3538145124912262, | |
| "eval_runtime": 1.1107, | |
| "eval_samples_per_second": 99.936, | |
| "eval_steps_per_second": 12.605, | |
| "num_input_tokens_seen": 1321408, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 27.142857142857142, | |
| "grad_norm": 0.21413478255271912, | |
| "learning_rate": 3.1420382822767323e-06, | |
| "loss": 0.3382, | |
| "num_input_tokens_seen": 1329024, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 27.321428571428573, | |
| "grad_norm": 0.08071761578321457, | |
| "learning_rate": 2.7696171113326396e-06, | |
| "loss": 0.3531, | |
| "num_input_tokens_seen": 1337984, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 27.5, | |
| "grad_norm": 0.20051950216293335, | |
| "learning_rate": 2.420054416819556e-06, | |
| "loss": 0.3338, | |
| "num_input_tokens_seen": 1346112, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 27.678571428571427, | |
| "grad_norm": 0.05100846663117409, | |
| "learning_rate": 2.093519379466602e-06, | |
| "loss": 0.342, | |
| "num_input_tokens_seen": 1355584, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 27.857142857142858, | |
| "grad_norm": 0.11974883079528809, | |
| "learning_rate": 1.7901700351184659e-06, | |
| "loss": 0.3414, | |
| "num_input_tokens_seen": 1364864, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 28.035714285714285, | |
| "grad_norm": 0.0698491781949997, | |
| "learning_rate": 1.5101531982495308e-06, | |
| "loss": 0.3553, | |
| "num_input_tokens_seen": 1373400, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 28.214285714285715, | |
| "grad_norm": 0.18634414672851562, | |
| "learning_rate": 1.2536043909088191e-06, | |
| "loss": 0.349, | |
| "num_input_tokens_seen": 1382744, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 28.392857142857142, | |
| "grad_norm": 0.05743042752146721, | |
| "learning_rate": 1.0206477771303236e-06, | |
| "loss": 0.3409, | |
| "num_input_tokens_seen": 1390424, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 28.5, | |
| "eval_loss": 0.35238173604011536, | |
| "eval_runtime": 1.1779, | |
| "eval_samples_per_second": 94.233, | |
| "eval_steps_per_second": 11.885, | |
| "num_input_tokens_seen": 1394904, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 28.571428571428573, | |
| "grad_norm": 0.0811547264456749, | |
| "learning_rate": 8.113961028402894e-07, | |
| "loss": 0.3513, | |
| "num_input_tokens_seen": 1399256, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 28.75, | |
| "grad_norm": 0.06173446401953697, | |
| "learning_rate": 6.259506412906402e-07, | |
| "loss": 0.35, | |
| "num_input_tokens_seen": 1407896, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 28.928571428571427, | |
| "grad_norm": 0.05312683433294296, | |
| "learning_rate": 4.6440114404492363e-07, | |
| "loss": 0.338, | |
| "num_input_tokens_seen": 1416472, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 29.107142857142858, | |
| "grad_norm": 0.07188601791858673, | |
| "learning_rate": 3.268257975405697e-07, | |
| "loss": 0.3438, | |
| "num_input_tokens_seen": 1425120, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 29.285714285714285, | |
| "grad_norm": 0.07120722532272339, | |
| "learning_rate": 2.1329118524827662e-07, | |
| "loss": 0.3419, | |
| "num_input_tokens_seen": 1433248, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 29.464285714285715, | |
| "grad_norm": 0.12804925441741943, | |
| "learning_rate": 1.238522554470989e-07, | |
| "loss": 0.3417, | |
| "num_input_tokens_seen": 1442592, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 29.642857142857142, | |
| "grad_norm": 0.26469874382019043, | |
| "learning_rate": 5.855229463068712e-08, | |
| "loss": 0.3451, | |
| "num_input_tokens_seen": 1451360, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 29.821428571428573, | |
| "grad_norm": 0.21112701296806335, | |
| "learning_rate": 1.742290655755707e-08, | |
| "loss": 0.3525, | |
| "num_input_tokens_seen": 1459808, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 0.2481113076210022, | |
| "learning_rate": 4.839969555581192e-10, | |
| "loss": 0.3344, | |
| "num_input_tokens_seen": 1468632, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 0.3523610234260559, | |
| "eval_runtime": 1.2257, | |
| "eval_samples_per_second": 90.557, | |
| "eval_steps_per_second": 11.422, | |
| "num_input_tokens_seen": 1468632, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "num_input_tokens_seen": 1468632, | |
| "step": 840, | |
| "total_flos": 6.613183518533222e+16, | |
| "train_loss": 0.47382661629290806, | |
| "train_runtime": 311.6966, | |
| "train_samples_per_second": 42.638, | |
| "train_steps_per_second": 2.695 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 840, | |
| "num_input_tokens_seen": 1468632, | |
| "num_train_epochs": 30, | |
| "save_steps": 42, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.613183518533222e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |