{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 0,
  "global_step": 308,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003246753246753247,
      "grad_norm": 0.59765625,
      "learning_rate": 1e-05,
      "loss": 2.0334,
      "step": 1
    },
    {
      "epoch": 0.006493506493506494,
      "grad_norm": 0.57421875,
      "learning_rate": 9.967532467532468e-06,
      "loss": 2.0243,
      "step": 2
    },
    {
      "epoch": 0.00974025974025974,
      "grad_norm": 0.54296875,
      "learning_rate": 9.935064935064936e-06,
      "loss": 2.0114,
      "step": 3
    },
    {
      "epoch": 0.012987012987012988,
      "grad_norm": 0.55859375,
      "learning_rate": 9.902597402597403e-06,
      "loss": 2.0151,
      "step": 4
    },
    {
      "epoch": 0.016233766233766232,
      "grad_norm": 0.5078125,
      "learning_rate": 9.87012987012987e-06,
      "loss": 2.0245,
      "step": 5
    },
    {
      "epoch": 0.01948051948051948,
      "grad_norm": 0.4765625,
      "learning_rate": 9.837662337662337e-06,
      "loss": 1.9562,
      "step": 6
    },
    {
      "epoch": 0.022727272727272728,
      "grad_norm": 0.466796875,
      "learning_rate": 9.805194805194806e-06,
      "loss": 1.9699,
      "step": 7
    },
    {
      "epoch": 0.025974025974025976,
      "grad_norm": 0.466796875,
      "learning_rate": 9.772727272727273e-06,
      "loss": 1.9238,
      "step": 8
    },
    {
      "epoch": 0.02922077922077922,
      "grad_norm": 0.44921875,
      "learning_rate": 9.740259740259742e-06,
      "loss": 2.0033,
      "step": 9
    },
    {
      "epoch": 0.032467532467532464,
      "grad_norm": 0.4140625,
      "learning_rate": 9.707792207792209e-06,
      "loss": 1.9087,
      "step": 10
    },
    {
      "epoch": 0.03571428571428571,
      "grad_norm": 0.41796875,
      "learning_rate": 9.675324675324677e-06,
      "loss": 1.9079,
      "step": 11
    },
    {
      "epoch": 0.03896103896103896,
      "grad_norm": 0.404296875,
      "learning_rate": 9.642857142857144e-06,
      "loss": 1.8503,
      "step": 12
    },
    {
      "epoch": 0.04220779220779221,
      "grad_norm": 0.408203125,
      "learning_rate": 9.610389610389611e-06,
      "loss": 1.8655,
      "step": 13
    },
    {
      "epoch": 0.045454545454545456,
      "grad_norm": 0.3828125,
      "learning_rate": 9.577922077922078e-06,
      "loss": 1.7169,
      "step": 14
    },
    {
      "epoch": 0.048701298701298704,
      "grad_norm": 0.4140625,
      "learning_rate": 9.545454545454547e-06,
      "loss": 1.7672,
      "step": 15
    },
    {
      "epoch": 0.05194805194805195,
      "grad_norm": 0.423828125,
      "learning_rate": 9.512987012987014e-06,
      "loss": 1.7569,
      "step": 16
    },
    {
      "epoch": 0.05519480519480519,
      "grad_norm": 0.41015625,
      "learning_rate": 9.48051948051948e-06,
      "loss": 1.7806,
      "step": 17
    },
    {
      "epoch": 0.05844155844155844,
      "grad_norm": 0.361328125,
      "learning_rate": 9.448051948051948e-06,
      "loss": 1.7375,
      "step": 18
    },
    {
      "epoch": 0.06168831168831169,
      "grad_norm": 0.333984375,
      "learning_rate": 9.415584415584416e-06,
      "loss": 1.7054,
      "step": 19
    },
    {
      "epoch": 0.06493506493506493,
      "grad_norm": 0.333984375,
      "learning_rate": 9.383116883116883e-06,
      "loss": 1.6782,
      "step": 20
    },
    {
      "epoch": 0.06818181818181818,
      "grad_norm": 0.330078125,
      "learning_rate": 9.350649350649352e-06,
      "loss": 1.6771,
      "step": 21
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 0.32421875,
      "learning_rate": 9.318181818181819e-06,
      "loss": 1.6215,
      "step": 22
    },
    {
      "epoch": 0.07467532467532467,
      "grad_norm": 0.32421875,
      "learning_rate": 9.285714285714288e-06,
      "loss": 1.6617,
      "step": 23
    },
    {
      "epoch": 0.07792207792207792,
      "grad_norm": 0.3046875,
      "learning_rate": 9.253246753246755e-06,
      "loss": 1.6159,
      "step": 24
    },
    {
      "epoch": 0.08116883116883117,
      "grad_norm": 0.33203125,
      "learning_rate": 9.220779220779221e-06,
      "loss": 1.6446,
      "step": 25
    },
    {
      "epoch": 0.08441558441558442,
      "grad_norm": 0.298828125,
      "learning_rate": 9.188311688311688e-06,
      "loss": 1.5763,
      "step": 26
    },
    {
      "epoch": 0.08766233766233766,
      "grad_norm": 0.306640625,
      "learning_rate": 9.155844155844157e-06,
      "loss": 1.5995,
      "step": 27
    },
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 0.2890625,
      "learning_rate": 9.123376623376624e-06,
      "loss": 1.5663,
      "step": 28
    },
    {
      "epoch": 0.09415584415584416,
      "grad_norm": 0.26953125,
      "learning_rate": 9.090909090909091e-06,
      "loss": 1.5084,
      "step": 29
    },
    {
      "epoch": 0.09740259740259741,
      "grad_norm": 0.28125,
      "learning_rate": 9.05844155844156e-06,
      "loss": 1.5185,
      "step": 30
    },
    {
      "epoch": 0.10064935064935066,
      "grad_norm": 0.275390625,
      "learning_rate": 9.025974025974027e-06,
      "loss": 1.4792,
      "step": 31
    },
    {
      "epoch": 0.1038961038961039,
      "grad_norm": 0.3203125,
      "learning_rate": 8.993506493506494e-06,
      "loss": 1.4729,
      "step": 32
    },
    {
      "epoch": 0.10714285714285714,
      "grad_norm": 0.271484375,
      "learning_rate": 8.96103896103896e-06,
      "loss": 1.4923,
      "step": 33
    },
    {
      "epoch": 0.11038961038961038,
      "grad_norm": 0.275390625,
      "learning_rate": 8.92857142857143e-06,
      "loss": 1.4766,
      "step": 34
    },
    {
      "epoch": 0.11363636363636363,
      "grad_norm": 0.255859375,
      "learning_rate": 8.896103896103896e-06,
      "loss": 1.4727,
      "step": 35
    },
    {
      "epoch": 0.11688311688311688,
      "grad_norm": 0.2490234375,
      "learning_rate": 8.863636363636365e-06,
      "loss": 1.4243,
      "step": 36
    },
    {
      "epoch": 0.12012987012987013,
      "grad_norm": 0.291015625,
      "learning_rate": 8.831168831168832e-06,
      "loss": 1.4864,
      "step": 37
    },
    {
      "epoch": 0.12337662337662338,
      "grad_norm": 0.263671875,
      "learning_rate": 8.7987012987013e-06,
      "loss": 1.3724,
      "step": 38
    },
    {
      "epoch": 0.1266233766233766,
      "grad_norm": 0.265625,
      "learning_rate": 8.766233766233767e-06,
      "loss": 1.4047,
      "step": 39
    },
    {
      "epoch": 0.12987012987012986,
      "grad_norm": 0.224609375,
      "learning_rate": 8.733766233766234e-06,
      "loss": 1.4163,
      "step": 40
    },
    {
      "epoch": 0.1331168831168831,
      "grad_norm": 0.2294921875,
      "learning_rate": 8.701298701298701e-06,
      "loss": 1.4129,
      "step": 41
    },
    {
      "epoch": 0.13636363636363635,
      "grad_norm": 0.236328125,
      "learning_rate": 8.66883116883117e-06,
      "loss": 1.4217,
      "step": 42
    },
    {
      "epoch": 0.1396103896103896,
      "grad_norm": 0.21484375,
      "learning_rate": 8.636363636363637e-06,
      "loss": 1.3776,
      "step": 43
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 0.25,
      "learning_rate": 8.603896103896104e-06,
      "loss": 1.3976,
      "step": 44
    },
    {
      "epoch": 0.1461038961038961,
      "grad_norm": 0.2236328125,
      "learning_rate": 8.571428571428571e-06,
      "loss": 1.288,
      "step": 45
    },
    {
      "epoch": 0.14935064935064934,
      "grad_norm": 0.22265625,
      "learning_rate": 8.53896103896104e-06,
      "loss": 1.3781,
      "step": 46
    },
    {
      "epoch": 0.1525974025974026,
      "grad_norm": 0.259765625,
      "learning_rate": 8.506493506493507e-06,
      "loss": 1.3561,
      "step": 47
    },
    {
      "epoch": 0.15584415584415584,
      "grad_norm": 0.19921875,
      "learning_rate": 8.474025974025975e-06,
      "loss": 1.353,
      "step": 48
    },
    {
      "epoch": 0.1590909090909091,
      "grad_norm": 0.232421875,
      "learning_rate": 8.441558441558442e-06,
      "loss": 1.3719,
      "step": 49
    },
    {
      "epoch": 0.16233766233766234,
      "grad_norm": 0.208984375,
      "learning_rate": 8.40909090909091e-06,
      "loss": 1.2951,
      "step": 50
    },
    {
      "epoch": 0.16558441558441558,
      "grad_norm": 0.208984375,
      "learning_rate": 8.376623376623378e-06,
      "loss": 1.33,
      "step": 51
    },
    {
      "epoch": 0.16883116883116883,
      "grad_norm": 0.2265625,
      "learning_rate": 8.344155844155845e-06,
      "loss": 1.3565,
      "step": 52
    },
    {
      "epoch": 0.17207792207792208,
      "grad_norm": 0.2001953125,
      "learning_rate": 8.311688311688313e-06,
      "loss": 1.3314,
      "step": 53
    },
    {
      "epoch": 0.17532467532467533,
      "grad_norm": 0.1875,
      "learning_rate": 8.27922077922078e-06,
      "loss": 1.2724,
      "step": 54
    },
    {
      "epoch": 0.17857142857142858,
      "grad_norm": 0.1982421875,
      "learning_rate": 8.246753246753247e-06,
      "loss": 1.3488,
      "step": 55
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 0.2119140625,
      "learning_rate": 8.214285714285714e-06,
      "loss": 1.2717,
      "step": 56
    },
    {
      "epoch": 0.18506493506493507,
      "grad_norm": 0.2060546875,
      "learning_rate": 8.181818181818183e-06,
      "loss": 1.3365,
      "step": 57
    },
    {
      "epoch": 0.18831168831168832,
      "grad_norm": 0.578125,
      "learning_rate": 8.14935064935065e-06,
      "loss": 1.2659,
      "step": 58
    },
    {
      "epoch": 0.19155844155844157,
      "grad_norm": 0.20703125,
      "learning_rate": 8.116883116883117e-06,
      "loss": 1.2779,
      "step": 59
    },
    {
      "epoch": 0.19480519480519481,
      "grad_norm": 0.1845703125,
      "learning_rate": 8.084415584415586e-06,
      "loss": 1.2385,
      "step": 60
    },
    {
      "epoch": 0.19805194805194806,
      "grad_norm": 0.1884765625,
      "learning_rate": 8.051948051948052e-06,
      "loss": 1.2865,
      "step": 61
    },
    {
      "epoch": 0.2012987012987013,
      "grad_norm": 0.1923828125,
      "learning_rate": 8.019480519480521e-06,
      "loss": 1.2365,
      "step": 62
    },
    {
      "epoch": 0.20454545454545456,
      "grad_norm": 0.19921875,
      "learning_rate": 7.987012987012988e-06,
      "loss": 1.2773,
      "step": 63
    },
    {
      "epoch": 0.2077922077922078,
      "grad_norm": 0.193359375,
      "learning_rate": 7.954545454545455e-06,
      "loss": 1.2287,
      "step": 64
    },
    {
      "epoch": 0.21103896103896103,
      "grad_norm": 0.25390625,
      "learning_rate": 7.922077922077924e-06,
      "loss": 1.3063,
      "step": 65
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 0.1943359375,
      "learning_rate": 7.88961038961039e-06,
      "loss": 1.2461,
      "step": 66
    },
    {
      "epoch": 0.21753246753246752,
      "grad_norm": 0.173828125,
      "learning_rate": 7.857142857142858e-06,
      "loss": 1.2213,
      "step": 67
    },
    {
      "epoch": 0.22077922077922077,
      "grad_norm": 0.1630859375,
      "learning_rate": 7.824675324675325e-06,
      "loss": 1.2129,
      "step": 68
    },
    {
      "epoch": 0.22402597402597402,
      "grad_norm": 0.185546875,
      "learning_rate": 7.792207792207793e-06,
      "loss": 1.2403,
      "step": 69
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 0.203125,
      "learning_rate": 7.75974025974026e-06,
      "loss": 1.2936,
      "step": 70
    },
    {
      "epoch": 0.2305194805194805,
      "grad_norm": 0.185546875,
      "learning_rate": 7.727272727272727e-06,
      "loss": 1.2668,
      "step": 71
    },
    {
      "epoch": 0.23376623376623376,
      "grad_norm": 0.1796875,
      "learning_rate": 7.694805194805194e-06,
      "loss": 1.2494,
      "step": 72
    },
    {
      "epoch": 0.237012987012987,
      "grad_norm": 0.1953125,
      "learning_rate": 7.662337662337663e-06,
      "loss": 1.2483,
      "step": 73
    },
    {
      "epoch": 0.24025974025974026,
      "grad_norm": 0.1787109375,
      "learning_rate": 7.62987012987013e-06,
      "loss": 1.216,
      "step": 74
    },
    {
      "epoch": 0.2435064935064935,
      "grad_norm": 0.1982421875,
      "learning_rate": 7.597402597402598e-06,
      "loss": 1.2482,
      "step": 75
    },
    {
      "epoch": 0.24675324675324675,
      "grad_norm": 0.2021484375,
      "learning_rate": 7.564935064935065e-06,
      "loss": 1.2248,
      "step": 76
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.1826171875,
      "learning_rate": 7.532467532467533e-06,
      "loss": 1.2248,
      "step": 77
    },
    {
      "epoch": 0.2532467532467532,
      "grad_norm": 0.17578125,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.2317,
      "step": 78
    },
    {
      "epoch": 0.2564935064935065,
      "grad_norm": 0.1953125,
      "learning_rate": 7.467532467532468e-06,
      "loss": 1.1618,
      "step": 79
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 0.216796875,
      "learning_rate": 7.435064935064936e-06,
      "loss": 1.2338,
      "step": 80
    },
    {
      "epoch": 0.262987012987013,
      "grad_norm": 0.1923828125,
      "learning_rate": 7.402597402597404e-06,
      "loss": 1.1754,
      "step": 81
    },
    {
      "epoch": 0.2662337662337662,
      "grad_norm": 0.1806640625,
      "learning_rate": 7.370129870129871e-06,
      "loss": 1.1861,
      "step": 82
    },
    {
      "epoch": 0.2694805194805195,
      "grad_norm": 0.2001953125,
      "learning_rate": 7.3376623376623375e-06,
      "loss": 1.2176,
      "step": 83
    },
    {
      "epoch": 0.2727272727272727,
      "grad_norm": 0.166015625,
      "learning_rate": 7.305194805194806e-06,
      "loss": 1.2186,
      "step": 84
    },
    {
      "epoch": 0.275974025974026,
      "grad_norm": 0.298828125,
      "learning_rate": 7.272727272727273e-06,
      "loss": 1.196,
      "step": 85
    },
    {
      "epoch": 0.2792207792207792,
      "grad_norm": 0.173828125,
      "learning_rate": 7.240259740259741e-06,
      "loss": 1.1587,
      "step": 86
    },
    {
      "epoch": 0.2824675324675325,
      "grad_norm": 0.1982421875,
      "learning_rate": 7.207792207792208e-06,
      "loss": 1.1749,
      "step": 87
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 0.189453125,
      "learning_rate": 7.175324675324677e-06,
      "loss": 1.2069,
      "step": 88
    },
    {
      "epoch": 0.288961038961039,
      "grad_norm": 0.1611328125,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 1.1633,
      "step": 89
    },
    {
      "epoch": 0.2922077922077922,
      "grad_norm": 0.23046875,
      "learning_rate": 7.1103896103896105e-06,
      "loss": 1.1764,
      "step": 90
    },
    {
      "epoch": 0.29545454545454547,
      "grad_norm": 0.2001953125,
      "learning_rate": 7.077922077922078e-06,
      "loss": 1.2464,
      "step": 91
    },
    {
      "epoch": 0.2987012987012987,
      "grad_norm": 0.1826171875,
      "learning_rate": 7.045454545454546e-06,
      "loss": 1.1447,
      "step": 92
    },
    {
      "epoch": 0.30194805194805197,
      "grad_norm": 0.185546875,
      "learning_rate": 7.012987012987014e-06,
      "loss": 1.1718,
      "step": 93
    },
    {
      "epoch": 0.3051948051948052,
      "grad_norm": 0.1748046875,
      "learning_rate": 6.980519480519481e-06,
      "loss": 1.2126,
      "step": 94
    },
    {
      "epoch": 0.30844155844155846,
      "grad_norm": 0.2294921875,
      "learning_rate": 6.948051948051948e-06,
      "loss": 1.1897,
      "step": 95
    },
    {
      "epoch": 0.3116883116883117,
      "grad_norm": 0.169921875,
      "learning_rate": 6.9155844155844165e-06,
      "loss": 1.1618,
      "step": 96
    },
    {
      "epoch": 0.31493506493506496,
      "grad_norm": 0.1806640625,
      "learning_rate": 6.8831168831168835e-06,
      "loss": 1.1754,
      "step": 97
    },
    {
      "epoch": 0.3181818181818182,
      "grad_norm": 0.169921875,
      "learning_rate": 6.850649350649351e-06,
      "loss": 1.1434,
      "step": 98
    },
    {
      "epoch": 0.32142857142857145,
      "grad_norm": 0.25390625,
      "learning_rate": 6.818181818181818e-06,
      "loss": 1.2513,
      "step": 99
    },
    {
      "epoch": 0.3246753246753247,
      "grad_norm": 0.1923828125,
      "learning_rate": 6.785714285714287e-06,
      "loss": 1.1497,
      "step": 100
    },
    {
      "epoch": 0.32792207792207795,
      "grad_norm": 0.2275390625,
      "learning_rate": 6.753246753246754e-06,
      "loss": 1.1554,
      "step": 101
    },
    {
      "epoch": 0.33116883116883117,
      "grad_norm": 0.166015625,
      "learning_rate": 6.720779220779221e-06,
      "loss": 1.158,
      "step": 102
    },
    {
      "epoch": 0.3344155844155844,
      "grad_norm": 0.1796875,
      "learning_rate": 6.688311688311689e-06,
      "loss": 1.1723,
      "step": 103
    },
    {
      "epoch": 0.33766233766233766,
      "grad_norm": 0.17578125,
      "learning_rate": 6.6558441558441565e-06,
      "loss": 1.1684,
      "step": 104
    },
    {
      "epoch": 0.3409090909090909,
      "grad_norm": 0.169921875,
      "learning_rate": 6.623376623376624e-06,
      "loss": 1.177,
      "step": 105
    },
    {
      "epoch": 0.34415584415584416,
      "grad_norm": 0.2333984375,
      "learning_rate": 6.590909090909091e-06,
      "loss": 1.1523,
      "step": 106
    },
    {
      "epoch": 0.3474025974025974,
      "grad_norm": 0.1748046875,
      "learning_rate": 6.55844155844156e-06,
      "loss": 1.1714,
      "step": 107
    },
    {
      "epoch": 0.35064935064935066,
      "grad_norm": 0.189453125,
      "learning_rate": 6.525974025974027e-06,
      "loss": 1.1279,
      "step": 108
    },
    {
      "epoch": 0.3538961038961039,
      "grad_norm": 0.1640625,
      "learning_rate": 6.493506493506494e-06,
      "loss": 1.1239,
      "step": 109
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.185546875,
      "learning_rate": 6.461038961038961e-06,
      "loss": 1.1359,
      "step": 110
    },
    {
      "epoch": 0.36038961038961037,
      "grad_norm": 0.1875,
      "learning_rate": 6.4285714285714295e-06,
      "loss": 1.1214,
      "step": 111
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 0.1943359375,
      "learning_rate": 6.3961038961038964e-06,
      "loss": 1.152,
      "step": 112
    },
    {
      "epoch": 0.36688311688311687,
      "grad_norm": 0.1826171875,
      "learning_rate": 6.363636363636364e-06,
      "loss": 1.1635,
      "step": 113
    },
    {
      "epoch": 0.37012987012987014,
      "grad_norm": 0.263671875,
      "learning_rate": 6.331168831168831e-06,
      "loss": 1.1336,
      "step": 114
    },
    {
      "epoch": 0.37337662337662336,
      "grad_norm": 0.1630859375,
      "learning_rate": 6.2987012987013e-06,
      "loss": 1.1422,
      "step": 115
    },
    {
      "epoch": 0.37662337662337664,
      "grad_norm": 0.244140625,
      "learning_rate": 6.266233766233767e-06,
      "loss": 1.1705,
      "step": 116
    },
    {
      "epoch": 0.37987012987012986,
      "grad_norm": 0.166015625,
      "learning_rate": 6.233766233766234e-06,
      "loss": 1.1355,
      "step": 117
    },
    {
      "epoch": 0.38311688311688313,
      "grad_norm": 0.1748046875,
      "learning_rate": 6.201298701298702e-06,
      "loss": 1.1619,
      "step": 118
    },
    {
      "epoch": 0.38636363636363635,
      "grad_norm": 0.1669921875,
      "learning_rate": 6.168831168831169e-06,
      "loss": 1.1479,
      "step": 119
    },
    {
      "epoch": 0.38961038961038963,
      "grad_norm": 0.171875,
      "learning_rate": 6.136363636363637e-06,
      "loss": 1.1311,
      "step": 120
    },
    {
      "epoch": 0.39285714285714285,
      "grad_norm": 0.205078125,
      "learning_rate": 6.103896103896104e-06,
      "loss": 1.1565,
      "step": 121
    },
    {
      "epoch": 0.3961038961038961,
      "grad_norm": 0.169921875,
      "learning_rate": 6.071428571428571e-06,
      "loss": 1.1161,
      "step": 122
    },
    {
      "epoch": 0.39935064935064934,
      "grad_norm": 0.1953125,
      "learning_rate": 6.03896103896104e-06,
      "loss": 1.0956,
      "step": 123
    },
    {
      "epoch": 0.4025974025974026,
      "grad_norm": 0.1689453125,
      "learning_rate": 6.006493506493507e-06,
      "loss": 1.1251,
      "step": 124
    },
    {
      "epoch": 0.40584415584415584,
      "grad_norm": 0.21875,
      "learning_rate": 5.9740259740259746e-06,
      "loss": 1.0747,
      "step": 125
    },
    {
      "epoch": 0.4090909090909091,
      "grad_norm": 0.173828125,
      "learning_rate": 5.9415584415584415e-06,
      "loss": 1.1191,
      "step": 126
    },
    {
      "epoch": 0.41233766233766234,
      "grad_norm": 0.244140625,
      "learning_rate": 5.90909090909091e-06,
      "loss": 1.2045,
      "step": 127
    },
    {
      "epoch": 0.4155844155844156,
      "grad_norm": 0.1904296875,
      "learning_rate": 5.876623376623377e-06,
      "loss": 1.069,
      "step": 128
    },
    {
      "epoch": 0.41883116883116883,
      "grad_norm": 0.203125,
      "learning_rate": 5.844155844155844e-06,
      "loss": 1.0879,
      "step": 129
    },
    {
      "epoch": 0.42207792207792205,
      "grad_norm": 0.212890625,
      "learning_rate": 5.811688311688313e-06,
      "loss": 1.1427,
      "step": 130
    },
    {
      "epoch": 0.4253246753246753,
      "grad_norm": 0.1865234375,
      "learning_rate": 5.77922077922078e-06,
      "loss": 1.1301,
      "step": 131
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.177734375,
      "learning_rate": 5.7467532467532475e-06,
      "loss": 1.1258,
      "step": 132
    },
    {
      "epoch": 0.4318181818181818,
      "grad_norm": 0.18359375,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 1.1495,
      "step": 133
    },
    {
      "epoch": 0.43506493506493504,
      "grad_norm": 0.2021484375,
      "learning_rate": 5.681818181818183e-06,
      "loss": 1.1496,
      "step": 134
    },
    {
      "epoch": 0.4383116883116883,
      "grad_norm": 0.2001953125,
      "learning_rate": 5.64935064935065e-06,
      "loss": 1.1028,
      "step": 135
    },
    {
      "epoch": 0.44155844155844154,
      "grad_norm": 0.1728515625,
      "learning_rate": 5.616883116883117e-06,
      "loss": 1.1168,
      "step": 136
    },
    {
      "epoch": 0.4448051948051948,
      "grad_norm": 0.1728515625,
      "learning_rate": 5.584415584415585e-06,
      "loss": 1.1267,
      "step": 137
    },
    {
      "epoch": 0.44805194805194803,
      "grad_norm": 0.1748046875,
      "learning_rate": 5.551948051948053e-06,
      "loss": 1.1316,
      "step": 138
    },
    {
      "epoch": 0.4512987012987013,
      "grad_norm": 0.2216796875,
      "learning_rate": 5.5194805194805205e-06,
      "loss": 1.1173,
      "step": 139
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.1767578125,
      "learning_rate": 5.4870129870129875e-06,
      "loss": 1.0974,
      "step": 140
    },
    {
      "epoch": 0.4577922077922078,
      "grad_norm": 0.203125,
      "learning_rate": 5.4545454545454545e-06,
      "loss": 1.0679,
      "step": 141
    },
    {
      "epoch": 0.461038961038961,
      "grad_norm": 0.2216796875,
      "learning_rate": 5.422077922077923e-06,
      "loss": 1.1091,
      "step": 142
    },
    {
      "epoch": 0.4642857142857143,
      "grad_norm": 0.287109375,
      "learning_rate": 5.38961038961039e-06,
      "loss": 1.0882,
      "step": 143
    },
    {
      "epoch": 0.4675324675324675,
      "grad_norm": 0.171875,
      "learning_rate": 5.357142857142857e-06,
      "loss": 1.1029,
      "step": 144
    },
    {
      "epoch": 0.4707792207792208,
      "grad_norm": 0.181640625,
      "learning_rate": 5.324675324675325e-06,
      "loss": 1.1069,
      "step": 145
    },
    {
      "epoch": 0.474025974025974,
      "grad_norm": 0.154296875,
      "learning_rate": 5.292207792207793e-06,
      "loss": 1.0676,
      "step": 146
    },
    {
      "epoch": 0.4772727272727273,
      "grad_norm": 0.20703125,
      "learning_rate": 5.2597402597402605e-06,
      "loss": 1.1534,
      "step": 147
    },
    {
      "epoch": 0.4805194805194805,
      "grad_norm": 0.1689453125,
      "learning_rate": 5.2272727272727274e-06,
      "loss": 1.0651,
      "step": 148
    },
    {
      "epoch": 0.4837662337662338,
      "grad_norm": 0.1708984375,
      "learning_rate": 5.194805194805194e-06,
      "loss": 1.0842,
      "step": 149
    },
    {
      "epoch": 0.487012987012987,
      "grad_norm": 0.193359375,
      "learning_rate": 5.162337662337663e-06,
      "loss": 1.1023,
      "step": 150
    },
    {
      "epoch": 0.4902597402597403,
      "grad_norm": 0.1767578125,
      "learning_rate": 5.12987012987013e-06,
      "loss": 1.101,
      "step": 151
    },
    {
      "epoch": 0.4935064935064935,
      "grad_norm": 0.1767578125,
      "learning_rate": 5.097402597402598e-06,
      "loss": 1.1247,
      "step": 152
    },
    {
      "epoch": 0.4967532467532468,
      "grad_norm": 0.1884765625,
      "learning_rate": 5.064935064935065e-06,
      "loss": 1.0927,
      "step": 153
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.1669921875,
      "learning_rate": 5.0324675324675334e-06,
      "loss": 1.0582,
      "step": 154
    },
    {
      "epoch": 0.5032467532467533,
      "grad_norm": 0.2021484375,
      "learning_rate": 5e-06,
      "loss": 1.0917,
      "step": 155
    },
    {
      "epoch": 0.5064935064935064,
      "grad_norm": 0.1826171875,
      "learning_rate": 4.967532467532468e-06,
      "loss": 1.0752,
      "step": 156
    },
    {
      "epoch": 0.5097402597402597,
      "grad_norm": 0.1669921875,
      "learning_rate": 4.935064935064935e-06,
      "loss": 1.0834,
      "step": 157
    },
    {
      "epoch": 0.512987012987013,
      "grad_norm": 0.1787109375,
      "learning_rate": 4.902597402597403e-06,
      "loss": 1.1084,
      "step": 158
    },
    {
      "epoch": 0.5162337662337663,
      "grad_norm": 0.216796875,
      "learning_rate": 4.870129870129871e-06,
      "loss": 1.133,
      "step": 159
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 0.1650390625,
      "learning_rate": 4.837662337662339e-06,
      "loss": 1.0941,
      "step": 160
    },
    {
      "epoch": 0.5227272727272727,
      "grad_norm": 0.1923828125,
      "learning_rate": 4.805194805194806e-06,
      "loss": 1.1157,
      "step": 161
    },
    {
      "epoch": 0.525974025974026,
      "grad_norm": 0.1669921875,
      "learning_rate": 4.772727272727273e-06,
      "loss": 1.0944,
      "step": 162
    },
    {
      "epoch": 0.5292207792207793,
      "grad_norm": 0.1787109375,
      "learning_rate": 4.74025974025974e-06,
      "loss": 1.1143,
      "step": 163
    },
    {
      "epoch": 0.5324675324675324,
      "grad_norm": 0.158203125,
      "learning_rate": 4.707792207792208e-06,
      "loss": 1.0905,
      "step": 164
    },
    {
      "epoch": 0.5357142857142857,
      "grad_norm": 0.20703125,
      "learning_rate": 4.675324675324676e-06,
      "loss": 1.0939,
      "step": 165
    },
    {
      "epoch": 0.538961038961039,
      "grad_norm": 0.181640625,
      "learning_rate": 4.642857142857144e-06,
      "loss": 1.1021,
      "step": 166
    },
    {
      "epoch": 0.5422077922077922,
      "grad_norm": 0.1650390625,
      "learning_rate": 4.610389610389611e-06,
      "loss": 1.0904,
      "step": 167
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 0.1884765625,
      "learning_rate": 4.5779220779220786e-06,
      "loss": 1.0697,
      "step": 168
    },
    {
      "epoch": 0.5487012987012987,
      "grad_norm": 0.2177734375,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 1.1283,
      "step": 169
    },
    {
      "epoch": 0.551948051948052,
      "grad_norm": 0.193359375,
      "learning_rate": 4.512987012987013e-06,
      "loss": 1.1235,
      "step": 170
    },
    {
      "epoch": 0.5551948051948052,
      "grad_norm": 0.2099609375,
      "learning_rate": 4.48051948051948e-06,
      "loss": 1.0842,
      "step": 171
    },
    {
      "epoch": 0.5584415584415584,
      "grad_norm": 0.21875,
      "learning_rate": 4.448051948051948e-06,
      "loss": 1.1124,
      "step": 172
    },
    {
      "epoch": 0.5616883116883117,
      "grad_norm": 0.1943359375,
      "learning_rate": 4.415584415584416e-06,
      "loss": 1.0675,
      "step": 173
    },
    {
      "epoch": 0.564935064935065,
      "grad_norm": 0.1982421875,
      "learning_rate": 4.383116883116884e-06,
      "loss": 1.1342,
      "step": 174
    },
    {
      "epoch": 0.5681818181818182,
      "grad_norm": 0.201171875,
      "learning_rate": 4.350649350649351e-06,
      "loss": 1.0794,
      "step": 175
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.169921875,
      "learning_rate": 4.3181818181818185e-06,
      "loss": 1.0948,
      "step": 176
    },
    {
      "epoch": 0.5746753246753247,
      "grad_norm": 0.1728515625,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 1.1007,
      "step": 177
    },
    {
      "epoch": 0.577922077922078,
      "grad_norm": 0.1748046875,
      "learning_rate": 4.253246753246753e-06,
      "loss": 1.0799,
      "step": 178
    },
    {
      "epoch": 0.5811688311688312,
      "grad_norm": 0.1796875,
      "learning_rate": 4.220779220779221e-06,
      "loss": 1.0965,
      "step": 179
    },
    {
      "epoch": 0.5844155844155844,
      "grad_norm": 0.1787109375,
      "learning_rate": 4.188311688311689e-06,
      "loss": 1.0923,
      "step": 180
    },
    {
      "epoch": 0.5876623376623377,
      "grad_norm": 0.201171875,
      "learning_rate": 4.155844155844157e-06,
      "loss": 1.123,
      "step": 181
    },
    {
      "epoch": 0.5909090909090909,
      "grad_norm": 0.16796875,
      "learning_rate": 4.123376623376624e-06,
      "loss": 1.0988,
      "step": 182
    },
    {
      "epoch": 0.5941558441558441,
      "grad_norm": 0.166015625,
      "learning_rate": 4.0909090909090915e-06,
      "loss": 1.0861,
      "step": 183
    },
    {
      "epoch": 0.5974025974025974,
      "grad_norm": 0.2021484375,
      "learning_rate": 4.0584415584415584e-06,
      "loss": 1.0714,
      "step": 184
    },
    {
      "epoch": 0.6006493506493507,
      "grad_norm": 0.216796875,
      "learning_rate": 4.025974025974026e-06,
      "loss": 1.1091,
      "step": 185
    },
    {
      "epoch": 0.6038961038961039,
      "grad_norm": 0.1875,
      "learning_rate": 3.993506493506494e-06,
      "loss": 1.1353,
      "step": 186
    },
    {
      "epoch": 0.6071428571428571,
      "grad_norm": 0.1904296875,
      "learning_rate": 3.961038961038962e-06,
      "loss": 1.0776,
      "step": 187
    },
    {
      "epoch": 0.6103896103896104,
      "grad_norm": 0.189453125,
      "learning_rate": 3.928571428571429e-06,
      "loss": 1.1316,
      "step": 188
    },
    {
      "epoch": 0.6136363636363636,
      "grad_norm": 0.1904296875,
      "learning_rate": 3.896103896103897e-06,
      "loss": 1.1487,
      "step": 189
    },
    {
      "epoch": 0.6168831168831169,
      "grad_norm": 0.1943359375,
      "learning_rate": 3.863636363636364e-06,
      "loss": 1.1101,
      "step": 190
    },
    {
      "epoch": 0.6201298701298701,
      "grad_norm": 0.2041015625,
      "learning_rate": 3.831168831168831e-06,
      "loss": 1.0881,
      "step": 191
    },
    {
      "epoch": 0.6233766233766234,
      "grad_norm": 0.1708984375,
      "learning_rate": 3.798701298701299e-06,
      "loss": 1.0814,
      "step": 192
    },
    {
      "epoch": 0.6266233766233766,
      "grad_norm": 0.1884765625,
      "learning_rate": 3.7662337662337666e-06,
      "loss": 1.1335,
      "step": 193
    },
    {
      "epoch": 0.6298701298701299,
      "grad_norm": 0.166015625,
      "learning_rate": 3.733766233766234e-06,
      "loss": 1.0839,
      "step": 194
    },
    {
      "epoch": 0.6331168831168831,
      "grad_norm": 0.2216796875,
      "learning_rate": 3.701298701298702e-06,
      "loss": 1.0913,
      "step": 195
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 0.1787109375,
      "learning_rate": 3.6688311688311688e-06,
      "loss": 1.0911,
      "step": 196
    },
    {
      "epoch": 0.6396103896103896,
      "grad_norm": 0.1787109375,
      "learning_rate": 3.6363636363636366e-06,
      "loss": 1.0778,
      "step": 197
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 0.17578125,
      "learning_rate": 3.603896103896104e-06,
      "loss": 1.0945,
      "step": 198
    },
    {
      "epoch": 0.6461038961038961,
      "grad_norm": 0.255859375,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 1.061,
      "step": 199
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 0.23828125,
      "learning_rate": 3.538961038961039e-06,
      "loss": 1.0516,
      "step": 200
    },
    {
      "epoch": 0.6525974025974026,
      "grad_norm": 0.24609375,
      "learning_rate": 3.506493506493507e-06,
      "loss": 1.0928,
      "step": 201
    },
    {
      "epoch": 0.6558441558441559,
      "grad_norm": 0.21484375,
      "learning_rate": 3.474025974025974e-06,
      "loss": 1.0794,
      "step": 202
    },
    {
      "epoch": 0.6590909090909091,
      "grad_norm": 0.1875,
      "learning_rate": 3.4415584415584418e-06,
      "loss": 1.1191,
      "step": 203
    },
    {
      "epoch": 0.6623376623376623,
      "grad_norm": 0.1943359375,
      "learning_rate": 3.409090909090909e-06,
      "loss": 1.0733,
      "step": 204
    },
    {
      "epoch": 0.6655844155844156,
      "grad_norm": 0.18359375,
      "learning_rate": 3.376623376623377e-06,
      "loss": 1.0669,
      "step": 205
    },
    {
      "epoch": 0.6688311688311688,
      "grad_norm": 0.2001953125,
      "learning_rate": 3.3441558441558443e-06,
      "loss": 1.08,
      "step": 206
    },
    {
      "epoch": 0.672077922077922,
      "grad_norm": 0.1884765625,
      "learning_rate": 3.311688311688312e-06,
      "loss": 1.0983,
      "step": 207
    },
    {
      "epoch": 0.6753246753246753,
      "grad_norm": 0.2158203125,
      "learning_rate": 3.27922077922078e-06,
      "loss": 1.112,
      "step": 208
    },
    {
      "epoch": 0.6785714285714286,
      "grad_norm": 0.1845703125,
      "learning_rate": 3.246753246753247e-06,
      "loss": 1.0543,
      "step": 209
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 0.1611328125,
      "learning_rate": 3.2142857142857147e-06,
      "loss": 1.0607,
      "step": 210
    },
    {
      "epoch": 0.685064935064935,
      "grad_norm": 0.17578125,
      "learning_rate": 3.181818181818182e-06,
      "loss": 1.0738,
      "step": 211
    },
    {
      "epoch": 0.6883116883116883,
      "grad_norm": 0.166015625,
      "learning_rate": 3.14935064935065e-06,
      "loss": 1.0407,
      "step": 212
    },
    {
      "epoch": 0.6915584415584416,
      "grad_norm": 0.16796875,
      "learning_rate": 3.116883116883117e-06,
      "loss": 1.0252,
      "step": 213
    },
    {
      "epoch": 0.6948051948051948,
      "grad_norm": 0.193359375,
      "learning_rate": 3.0844155844155847e-06,
      "loss": 1.0871,
      "step": 214
    },
    {
      "epoch": 0.698051948051948,
      "grad_norm": 0.1767578125,
      "learning_rate": 3.051948051948052e-06,
      "loss": 1.0775,
      "step": 215
    },
    {
      "epoch": 0.7012987012987013,
      "grad_norm": 0.1826171875,
      "learning_rate": 3.01948051948052e-06,
      "loss": 1.084,
      "step": 216
    },
    {
      "epoch": 0.7045454545454546,
      "grad_norm": 0.15625,
      "learning_rate": 2.9870129870129873e-06,
      "loss": 1.066,
      "step": 217
    },
    {
      "epoch": 0.7077922077922078,
      "grad_norm": 0.20703125,
      "learning_rate": 2.954545454545455e-06,
      "loss": 1.0846,
      "step": 218
    },
    {
      "epoch": 0.711038961038961,
      "grad_norm": 0.2138671875,
      "learning_rate": 2.922077922077922e-06,
      "loss": 1.0634,
      "step": 219
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.189453125,
      "learning_rate": 2.88961038961039e-06,
      "loss": 1.105,
      "step": 220
    },
    {
      "epoch": 0.7175324675324676,
      "grad_norm": 0.1787109375,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 1.0451,
      "step": 221
    },
    {
      "epoch": 0.7207792207792207,
      "grad_norm": 0.158203125,
      "learning_rate": 2.824675324675325e-06,
      "loss": 1.0576,
      "step": 222
    },
    {
      "epoch": 0.724025974025974,
      "grad_norm": 0.2080078125,
      "learning_rate": 2.7922077922077925e-06,
      "loss": 1.0944,
      "step": 223
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 0.185546875,
      "learning_rate": 2.7597402597402603e-06,
      "loss": 1.0996,
      "step": 224
    },
    {
      "epoch": 0.7305194805194806,
      "grad_norm": 0.1708984375,
      "learning_rate": 2.7272727272727272e-06,
      "loss": 1.041,
      "step": 225
    },
    {
      "epoch": 0.7337662337662337,
      "grad_norm": 0.203125,
      "learning_rate": 2.694805194805195e-06,
      "loss": 1.0958,
      "step": 226
    },
    {
      "epoch": 0.737012987012987,
      "grad_norm": 0.177734375,
      "learning_rate": 2.6623376623376624e-06,
      "loss": 1.0667,
      "step": 227
    },
    {
      "epoch": 0.7402597402597403,
      "grad_norm": 0.203125,
      "learning_rate": 2.6298701298701302e-06,
      "loss": 1.1002,
      "step": 228
    },
    {
      "epoch": 0.7435064935064936,
      "grad_norm": 0.19921875,
      "learning_rate": 2.597402597402597e-06,
      "loss": 1.0972,
      "step": 229
    },
    {
      "epoch": 0.7467532467532467,
      "grad_norm": 0.203125,
      "learning_rate": 2.564935064935065e-06,
      "loss": 1.0385,
      "step": 230
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.1826171875,
      "learning_rate": 2.5324675324675324e-06,
      "loss": 1.0863,
      "step": 231
    },
    {
      "epoch": 0.7532467532467533,
      "grad_norm": 0.1669921875,
      "learning_rate": 2.5e-06,
      "loss": 1.0833,
      "step": 232
    },
    {
      "epoch": 0.7564935064935064,
      "grad_norm": 0.1640625,
      "learning_rate": 2.4675324675324676e-06,
      "loss": 1.0874,
      "step": 233
    },
    {
      "epoch": 0.7597402597402597,
      "grad_norm": 0.1748046875,
      "learning_rate": 2.4350649350649354e-06,
      "loss": 1.0441,
      "step": 234
    },
    {
      "epoch": 0.762987012987013,
      "grad_norm": 0.185546875,
      "learning_rate": 2.402597402597403e-06,
      "loss": 1.0442,
      "step": 235
    },
    {
      "epoch": 0.7662337662337663,
      "grad_norm": 0.166015625,
      "learning_rate": 2.37012987012987e-06,
      "loss": 1.0727,
      "step": 236
    },
    {
      "epoch": 0.7694805194805194,
      "grad_norm": 0.2109375,
      "learning_rate": 2.337662337662338e-06,
      "loss": 1.0877,
      "step": 237
    },
    {
      "epoch": 0.7727272727272727,
      "grad_norm": 0.1904296875,
      "learning_rate": 2.3051948051948054e-06,
      "loss": 1.0565,
      "step": 238
    },
    {
      "epoch": 0.775974025974026,
      "grad_norm": 0.1728515625,
      "learning_rate": 2.2727272727272728e-06,
      "loss": 1.0583,
      "step": 239
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 0.1728515625,
      "learning_rate": 2.24025974025974e-06,
      "loss": 1.0821,
      "step": 240
    },
    {
      "epoch": 0.7824675324675324,
      "grad_norm": 0.1982421875,
      "learning_rate": 2.207792207792208e-06,
      "loss": 1.0765,
      "step": 241
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 0.20703125,
      "learning_rate": 2.1753246753246753e-06,
      "loss": 1.0854,
      "step": 242
    },
    {
      "epoch": 0.788961038961039,
      "grad_norm": 0.2236328125,
      "learning_rate": 2.1428571428571427e-06,
      "loss": 1.0565,
      "step": 243
    },
    {
      "epoch": 0.7922077922077922,
      "grad_norm": 0.220703125,
      "learning_rate": 2.1103896103896105e-06,
      "loss": 1.0505,
      "step": 244
    },
    {
      "epoch": 0.7954545454545454,
      "grad_norm": 0.203125,
      "learning_rate": 2.0779220779220784e-06,
      "loss": 1.0483,
      "step": 245
    },
    {
      "epoch": 0.7987012987012987,
      "grad_norm": 0.1962890625,
      "learning_rate": 2.0454545454545457e-06,
      "loss": 1.0696,
      "step": 246
    },
    {
      "epoch": 0.801948051948052,
      "grad_norm": 0.173828125,
      "learning_rate": 2.012987012987013e-06,
      "loss": 1.0451,
      "step": 247
    },
    {
      "epoch": 0.8051948051948052,
      "grad_norm": 0.26953125,
      "learning_rate": 1.980519480519481e-06,
      "loss": 1.106,
      "step": 248
    },
    {
      "epoch": 0.8084415584415584,
      "grad_norm": 0.1953125,
      "learning_rate": 1.9480519480519483e-06,
      "loss": 1.0546,
      "step": 249
    },
    {
      "epoch": 0.8116883116883117,
      "grad_norm": 0.1689453125,
      "learning_rate": 1.9155844155844157e-06,
      "loss": 1.0658,
      "step": 250
    },
    {
      "epoch": 0.814935064935065,
      "grad_norm": 0.2265625,
      "learning_rate": 1.8831168831168833e-06,
      "loss": 1.0873,
      "step": 251
    },
    {
      "epoch": 0.8181818181818182,
      "grad_norm": 0.1767578125,
      "learning_rate": 1.850649350649351e-06,
      "loss": 1.1135,
      "step": 252
    },
    {
      "epoch": 0.8214285714285714,
      "grad_norm": 0.162109375,
      "learning_rate": 1.8181818181818183e-06,
      "loss": 1.0737,
      "step": 253
    },
    {
      "epoch": 0.8246753246753247,
      "grad_norm": 0.1689453125,
      "learning_rate": 1.7857142857142859e-06,
      "loss": 1.0543,
      "step": 254
    },
    {
      "epoch": 0.827922077922078,
      "grad_norm": 0.1884765625,
      "learning_rate": 1.7532467532467535e-06,
      "loss": 1.071,
      "step": 255
    },
    {
      "epoch": 0.8311688311688312,
      "grad_norm": 0.16796875,
      "learning_rate": 1.7207792207792209e-06,
      "loss": 1.0642,
      "step": 256
    },
    {
      "epoch": 0.8344155844155844,
      "grad_norm": 0.16796875,
      "learning_rate": 1.6883116883116885e-06,
      "loss": 1.0647,
      "step": 257
    },
    {
      "epoch": 0.8376623376623377,
      "grad_norm": 0.189453125,
      "learning_rate": 1.655844155844156e-06,
      "loss": 1.0704,
      "step": 258
    },
    {
      "epoch": 0.8409090909090909,
      "grad_norm": 0.16796875,
      "learning_rate": 1.6233766233766235e-06,
      "loss": 1.0509,
      "step": 259
    },
    {
      "epoch": 0.8441558441558441,
      "grad_norm": 0.359375,
      "learning_rate": 1.590909090909091e-06,
      "loss": 1.1797,
      "step": 260
    },
    {
      "epoch": 0.8474025974025974,
      "grad_norm": 0.2470703125,
      "learning_rate": 1.5584415584415584e-06,
      "loss": 1.1052,
      "step": 261
    },
    {
      "epoch": 0.8506493506493507,
      "grad_norm": 0.16796875,
      "learning_rate": 1.525974025974026e-06,
      "loss": 1.0638,
      "step": 262
    },
    {
      "epoch": 0.8538961038961039,
      "grad_norm": 0.1689453125,
      "learning_rate": 1.4935064935064936e-06,
      "loss": 1.0769,
      "step": 263
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.1796875,
      "learning_rate": 1.461038961038961e-06,
      "loss": 1.0803,
      "step": 264
    },
    {
      "epoch": 0.8603896103896104,
      "grad_norm": 0.205078125,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 1.0887,
      "step": 265
    },
    {
      "epoch": 0.8636363636363636,
      "grad_norm": 0.240234375,
      "learning_rate": 1.3961038961038962e-06,
      "loss": 1.145,
      "step": 266
    },
    {
      "epoch": 0.8668831168831169,
      "grad_norm": 0.169921875,
      "learning_rate": 1.3636363636363636e-06,
      "loss": 1.0514,
      "step": 267
    },
    {
      "epoch": 0.8701298701298701,
      "grad_norm": 0.17578125,
      "learning_rate": 1.3311688311688312e-06,
      "loss": 1.0708,
      "step": 268
    },
    {
      "epoch": 0.8733766233766234,
      "grad_norm": 0.1904296875,
      "learning_rate": 1.2987012987012986e-06,
      "loss": 1.0822,
      "step": 269
    },
    {
      "epoch": 0.8766233766233766,
      "grad_norm": 0.169921875,
      "learning_rate": 1.2662337662337662e-06,
      "loss": 1.0638,
      "step": 270
    },
    {
      "epoch": 0.8798701298701299,
      "grad_norm": 0.1845703125,
      "learning_rate": 1.2337662337662338e-06,
      "loss": 1.0288,
      "step": 271
    },
    {
      "epoch": 0.8831168831168831,
      "grad_norm": 0.185546875,
      "learning_rate": 1.2012987012987014e-06,
      "loss": 1.075,
      "step": 272
    },
    {
      "epoch": 0.8863636363636364,
      "grad_norm": 0.28125,
      "learning_rate": 1.168831168831169e-06,
      "loss": 1.0293,
      "step": 273
    },
    {
      "epoch": 0.8896103896103896,
      "grad_norm": 0.1630859375,
      "learning_rate": 1.1363636363636364e-06,
      "loss": 1.0665,
      "step": 274
    },
    {
      "epoch": 0.8928571428571429,
      "grad_norm": 0.1650390625,
      "learning_rate": 1.103896103896104e-06,
      "loss": 1.0715,
      "step": 275
    },
    {
      "epoch": 0.8961038961038961,
      "grad_norm": 0.25390625,
      "learning_rate": 1.0714285714285714e-06,
      "loss": 1.0327,
      "step": 276
    },
    {
      "epoch": 0.8993506493506493,
      "grad_norm": 0.2080078125,
      "learning_rate": 1.0389610389610392e-06,
      "loss": 1.0354,
      "step": 277
    },
    {
      "epoch": 0.9025974025974026,
      "grad_norm": 0.17578125,
      "learning_rate": 1.0064935064935066e-06,
      "loss": 1.0808,
      "step": 278
    },
    {
      "epoch": 0.9058441558441559,
      "grad_norm": 0.1865234375,
      "learning_rate": 9.740259740259742e-07,
      "loss": 1.0577,
      "step": 279
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.166015625,
      "learning_rate": 9.415584415584417e-07,
      "loss": 1.0595,
      "step": 280
    },
    {
      "epoch": 0.9123376623376623,
      "grad_norm": 0.205078125,
      "learning_rate": 9.090909090909091e-07,
      "loss": 1.0604,
      "step": 281
    },
    {
      "epoch": 0.9155844155844156,
      "grad_norm": 0.169921875,
      "learning_rate": 8.766233766233767e-07,
      "loss": 1.0966,
      "step": 282
    },
    {
      "epoch": 0.9188311688311688,
      "grad_norm": 0.2216796875,
      "learning_rate": 8.441558441558442e-07,
      "loss": 1.1049,
      "step": 283
    },
    {
      "epoch": 0.922077922077922,
      "grad_norm": 0.1767578125,
      "learning_rate": 8.116883116883117e-07,
      "loss": 1.0724,
      "step": 284
    },
    {
      "epoch": 0.9253246753246753,
      "grad_norm": 0.193359375,
      "learning_rate": 7.792207792207792e-07,
      "loss": 1.0585,
      "step": 285
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 0.16796875,
      "learning_rate": 7.467532467532468e-07,
      "loss": 1.0773,
      "step": 286
    },
    {
      "epoch": 0.9318181818181818,
      "grad_norm": 0.2119140625,
      "learning_rate": 7.142857142857143e-07,
      "loss": 1.0684,
      "step": 287
    },
    {
      "epoch": 0.935064935064935,
      "grad_norm": 0.2353515625,
      "learning_rate": 6.818181818181818e-07,
      "loss": 1.0557,
      "step": 288
    },
    {
      "epoch": 0.9383116883116883,
      "grad_norm": 0.1728515625,
      "learning_rate": 6.493506493506493e-07,
      "loss": 1.0877,
      "step": 289
    },
    {
      "epoch": 0.9415584415584416,
      "grad_norm": 0.169921875,
      "learning_rate": 6.168831168831169e-07,
      "loss": 1.054,
      "step": 290
    },
    {
      "epoch": 0.9448051948051948,
      "grad_norm": 0.1806640625,
      "learning_rate": 5.844155844155845e-07,
      "loss": 1.0529,
      "step": 291
    },
    {
      "epoch": 0.948051948051948,
      "grad_norm": 0.2236328125,
      "learning_rate": 5.51948051948052e-07,
      "loss": 1.0316,
      "step": 292
    },
    {
      "epoch": 0.9512987012987013,
      "grad_norm": 0.1787109375,
      "learning_rate": 5.194805194805196e-07,
      "loss": 1.0949,
      "step": 293
    },
    {
      "epoch": 0.9545454545454546,
      "grad_norm": 0.2138671875,
      "learning_rate": 4.870129870129871e-07,
      "loss": 1.0432,
      "step": 294
    },
    {
      "epoch": 0.9577922077922078,
      "grad_norm": 0.1640625,
      "learning_rate": 4.5454545454545457e-07,
      "loss": 1.0662,
      "step": 295
    },
    {
      "epoch": 0.961038961038961,
      "grad_norm": 0.166015625,
      "learning_rate": 4.220779220779221e-07,
      "loss": 1.0412,
      "step": 296
    },
    {
      "epoch": 0.9642857142857143,
      "grad_norm": 0.1630859375,
      "learning_rate": 3.896103896103896e-07,
      "loss": 1.0525,
      "step": 297
    },
    {
      "epoch": 0.9675324675324676,
      "grad_norm": 0.216796875,
      "learning_rate": 3.5714285714285716e-07,
      "loss": 1.0804,
      "step": 298
    },
    {
      "epoch": 0.9707792207792207,
      "grad_norm": 0.1708984375,
      "learning_rate": 3.2467532467532465e-07,
      "loss": 1.0486,
      "step": 299
    },
    {
      "epoch": 0.974025974025974,
      "grad_norm": 0.1708984375,
      "learning_rate": 2.9220779220779225e-07,
      "loss": 1.0435,
      "step": 300
    },
    {
      "epoch": 0.9772727272727273,
      "grad_norm": 0.1904296875,
      "learning_rate": 2.597402597402598e-07,
      "loss": 1.0199,
      "step": 301
    },
    {
      "epoch": 0.9805194805194806,
      "grad_norm": 0.1728515625,
      "learning_rate": 2.2727272727272729e-07,
      "loss": 1.0881,
      "step": 302
    },
    {
      "epoch": 0.9837662337662337,
      "grad_norm": 0.1669921875,
      "learning_rate": 1.948051948051948e-07,
      "loss": 1.0522,
      "step": 303
    },
    {
      "epoch": 0.987012987012987,
      "grad_norm": 0.1640625,
      "learning_rate": 1.6233766233766232e-07,
      "loss": 1.0565,
      "step": 304
    },
    {
      "epoch": 0.9902597402597403,
      "grad_norm": 0.1689453125,
      "learning_rate": 1.298701298701299e-07,
      "loss": 1.0653,
      "step": 305
    },
    {
      "epoch": 0.9935064935064936,
      "grad_norm": 0.2158203125,
      "learning_rate": 9.74025974025974e-08,
      "loss": 1.0748,
      "step": 306
    },
    {
      "epoch": 0.9967532467532467,
      "grad_norm": 0.1748046875,
      "learning_rate": 6.493506493506495e-08,
      "loss": 1.0708,
      "step": 307
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.1591796875,
      "learning_rate": 3.2467532467532474e-08,
      "loss": 1.0435,
      "step": 308
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 308,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.669683963924316e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}