{
  "best_metric": 0.6509745717048645,
  "best_model_checkpoint": "miner_id_24/checkpoint-600",
  "epoch": 0.31286664059444663,
  "eval_steps": 150,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0005214444009907443, "grad_norm": 2.7253143787384033, "learning_rate": 3.0000000000000004e-07, "loss": 1.1883, "step": 1 },
    { "epoch": 0.0005214444009907443, "eval_loss": 3.5160233974456787, "eval_runtime": 325.6176, "eval_samples_per_second": 19.839, "eval_steps_per_second": 4.96, "step": 1 },
    { "epoch": 0.0010428888019814887, "grad_norm": 3.386953115463257, "learning_rate": 6.000000000000001e-07, "loss": 1.6895, "step": 2 },
    { "epoch": 0.001564333202972233, "grad_norm": 3.0215859413146973, "learning_rate": 9.000000000000001e-07, "loss": 1.221, "step": 3 },
    { "epoch": 0.0020857776039629773, "grad_norm": 3.2928130626678467, "learning_rate": 1.2000000000000002e-06, "loss": 1.1251, "step": 4 },
    { "epoch": 0.0026072220049537216, "grad_norm": 3.2181572914123535, "learning_rate": 1.5e-06, "loss": 1.3883, "step": 5 },
    { "epoch": 0.003128666405944466, "grad_norm": 3.965681552886963, "learning_rate": 1.8000000000000001e-06, "loss": 1.9077, "step": 6 },
    { "epoch": 0.0036501108069352107, "grad_norm": 3.617943048477173, "learning_rate": 2.1000000000000002e-06, "loss": 1.883, "step": 7 },
    { "epoch": 0.004171555207925955, "grad_norm": 3.563110113143921, "learning_rate": 2.4000000000000003e-06, "loss": 2.3753, "step": 8 },
    { "epoch": 0.004692999608916699, "grad_norm": 4.999038219451904, "learning_rate": 2.7e-06, "loss": 2.9592, "step": 9 },
    { "epoch": 0.005214444009907443, "grad_norm": 4.850180625915527, "learning_rate": 3e-06, "loss": 3.0989, "step": 10 },
    { "epoch": 0.005735888410898188, "grad_norm": 4.822148323059082, "learning_rate": 3.3e-06, "loss": 2.5808, "step": 11 },
    { "epoch": 0.006257332811888932, "grad_norm": 4.977088451385498, "learning_rate": 3.6000000000000003e-06, "loss": 3.3742, "step": 12 },
    { "epoch": 0.006778777212879676, "grad_norm": 4.140382766723633, "learning_rate": 3.900000000000001e-06, "loss": 2.5006, "step": 13 },
    { "epoch": 0.0073002216138704215, "grad_norm": 5.024703502655029, "learning_rate": 4.2000000000000004e-06, "loss": 2.7223, "step": 14 },
    { "epoch": 0.007821666014861166, "grad_norm": 4.6218976974487305, "learning_rate": 4.5e-06, "loss": 2.9669, "step": 15 },
    { "epoch": 0.00834311041585191, "grad_norm": 4.481057643890381, "learning_rate": 4.800000000000001e-06, "loss": 2.8808, "step": 16 },
    { "epoch": 0.008864554816842654, "grad_norm": 5.046372413635254, "learning_rate": 5.1e-06, "loss": 3.3685, "step": 17 },
    { "epoch": 0.009385999217833398, "grad_norm": 5.197815895080566, "learning_rate": 5.4e-06, "loss": 3.3641, "step": 18 },
    { "epoch": 0.009907443618824143, "grad_norm": 5.589998245239258, "learning_rate": 5.7000000000000005e-06, "loss": 3.2387, "step": 19 },
    { "epoch": 0.010428888019814887, "grad_norm": 4.643375873565674, "learning_rate": 6e-06, "loss": 2.847, "step": 20 },
    { "epoch": 0.010950332420805632, "grad_norm": 5.727187156677246, "learning_rate": 6.300000000000001e-06, "loss": 3.5249, "step": 21 },
    { "epoch": 0.011471776821796375, "grad_norm": 5.25192928314209, "learning_rate": 6.6e-06, "loss": 3.4112, "step": 22 },
    { "epoch": 0.01199322122278712, "grad_norm": 5.9937238693237305, "learning_rate": 6.9e-06, "loss": 3.3331, "step": 23 },
    { "epoch": 0.012514665623777864, "grad_norm": 5.3217244148254395, "learning_rate": 7.2000000000000005e-06, "loss": 3.3351, "step": 24 },
    { "epoch": 0.013036110024768609, "grad_norm": 5.732251167297363, "learning_rate": 7.5e-06, "loss": 3.4039, "step": 25 },
    { "epoch": 0.013557554425759353, "grad_norm": 5.008046627044678, "learning_rate": 7.800000000000002e-06, "loss": 2.9577, "step": 26 },
    { "epoch": 0.014078998826750098, "grad_norm": 5.3130106925964355, "learning_rate": 8.1e-06, "loss": 3.1343, "step": 27 },
    { "epoch": 0.014600443227740843, "grad_norm": 4.684622764587402, "learning_rate": 8.400000000000001e-06, "loss": 3.0004, "step": 28 },
    { "epoch": 0.015121887628731586, "grad_norm": 4.972151279449463, "learning_rate": 8.7e-06, "loss": 2.9132, "step": 29 },
    { "epoch": 0.01564333202972233, "grad_norm": 5.852766036987305, "learning_rate": 9e-06, "loss": 3.3844, "step": 30 },
    { "epoch": 0.016164776430713075, "grad_norm": 5.366592884063721, "learning_rate": 9.3e-06, "loss": 2.9811, "step": 31 },
    { "epoch": 0.01668622083170382, "grad_norm": 4.914961338043213, "learning_rate": 9.600000000000001e-06, "loss": 2.819, "step": 32 },
    { "epoch": 0.017207665232694565, "grad_norm": 4.8333210945129395, "learning_rate": 9.9e-06, "loss": 2.8641, "step": 33 },
    { "epoch": 0.01772910963368531, "grad_norm": 5.034109115600586, "learning_rate": 1.02e-05, "loss": 2.8783, "step": 34 },
    { "epoch": 0.018250554034676052, "grad_norm": 4.528682231903076, "learning_rate": 1.0500000000000001e-05, "loss": 2.6515, "step": 35 },
    { "epoch": 0.018771998435666796, "grad_norm": 4.572699069976807, "learning_rate": 1.08e-05, "loss": 2.6785, "step": 36 },
    { "epoch": 0.019293442836657543, "grad_norm": 6.095496654510498, "learning_rate": 1.11e-05, "loss": 2.5944, "step": 37 },
    { "epoch": 0.019814887237648286, "grad_norm": 4.699917316436768, "learning_rate": 1.1400000000000001e-05, "loss": 2.4965, "step": 38 },
    { "epoch": 0.02033633163863903, "grad_norm": 4.598426818847656, "learning_rate": 1.1700000000000001e-05, "loss": 2.1656, "step": 39 },
    { "epoch": 0.020857776039629773, "grad_norm": 4.476649284362793, "learning_rate": 1.2e-05, "loss": 2.0975, "step": 40 },
    { "epoch": 0.02137922044062052, "grad_norm": 5.0952653884887695, "learning_rate": 1.23e-05, "loss": 2.349, "step": 41 },
    { "epoch": 0.021900664841611264, "grad_norm": 4.58736515045166, "learning_rate": 1.2600000000000001e-05, "loss": 2.0085, "step": 42 },
    { "epoch": 0.022422109242602007, "grad_norm": 5.128681659698486, "learning_rate": 1.2900000000000002e-05, "loss": 2.0787, "step": 43 },
    { "epoch": 0.02294355364359275, "grad_norm": 4.277550220489502, "learning_rate": 1.32e-05, "loss": 1.9801, "step": 44 },
    { "epoch": 0.023464998044583497, "grad_norm": 4.380857944488525, "learning_rate": 1.3500000000000001e-05, "loss": 1.8278, "step": 45 },
    { "epoch": 0.02398644244557424, "grad_norm": 4.337310314178467, "learning_rate": 1.38e-05, "loss": 1.8447, "step": 46 },
    { "epoch": 0.024507886846564984, "grad_norm": 4.444604396820068, "learning_rate": 1.4100000000000002e-05, "loss": 1.7982, "step": 47 },
    { "epoch": 0.025029331247555728, "grad_norm": 3.821458339691162, "learning_rate": 1.4400000000000001e-05, "loss": 1.5049, "step": 48 },
    { "epoch": 0.025550775648546475, "grad_norm": 4.789224624633789, "learning_rate": 1.47e-05, "loss": 1.7945, "step": 49 },
    { "epoch": 0.026072220049537218, "grad_norm": 4.952455520629883, "learning_rate": 1.5e-05, "loss": 1.9581, "step": 50 },
    { "epoch": 0.02659366445052796, "grad_norm": 1.6376649141311646, "learning_rate": 1.5300000000000003e-05, "loss": 0.6677, "step": 51 },
    { "epoch": 0.027115108851518705, "grad_norm": 1.824238657951355, "learning_rate": 1.5600000000000003e-05, "loss": 0.7885, "step": 52 },
    { "epoch": 0.027636553252509452, "grad_norm": 1.5552136898040771, "learning_rate": 1.59e-05, "loss": 0.4647, "step": 53 },
    { "epoch": 0.028157997653500196, "grad_norm": 1.6044234037399292, "learning_rate": 1.62e-05, "loss": 0.4944, "step": 54 },
    { "epoch": 0.02867944205449094, "grad_norm": 1.4595534801483154, "learning_rate": 1.65e-05, "loss": 0.5307, "step": 55 },
    { "epoch": 0.029200886455481686, "grad_norm": 1.7148545980453491, "learning_rate": 1.6800000000000002e-05, "loss": 0.7493, "step": 56 },
    { "epoch": 0.02972233085647243, "grad_norm": 1.7278859615325928, "learning_rate": 1.7100000000000002e-05, "loss": 0.7113, "step": 57 },
    { "epoch": 0.030243775257463173, "grad_norm": 2.3812315464019775, "learning_rate": 1.74e-05, "loss": 0.9793, "step": 58 },
    { "epoch": 0.030765219658453916, "grad_norm": 2.3127753734588623, "learning_rate": 1.77e-05, "loss": 1.1053, "step": 59 },
    { "epoch": 0.03128666405944466, "grad_norm": 1.7662042379379272, "learning_rate": 1.8e-05, "loss": 0.7896, "step": 60 },
    { "epoch": 0.03180810846043541, "grad_norm": 2.224780797958374, "learning_rate": 1.83e-05, "loss": 0.9967, "step": 61 },
    { "epoch": 0.03232955286142615, "grad_norm": 2.4859564304351807, "learning_rate": 1.86e-05, "loss": 1.0838, "step": 62 },
    { "epoch": 0.032850997262416894, "grad_norm": 1.620069146156311, "learning_rate": 1.8900000000000002e-05, "loss": 0.7138, "step": 63 },
    { "epoch": 0.03337244166340764, "grad_norm": 2.2818684577941895, "learning_rate": 1.9200000000000003e-05, "loss": 0.9581, "step": 64 },
    { "epoch": 0.03389388606439838, "grad_norm": 1.9163058996200562, "learning_rate": 1.9500000000000003e-05, "loss": 0.916, "step": 65 },
    { "epoch": 0.03441533046538913, "grad_norm": 1.9157963991165161, "learning_rate": 1.98e-05, "loss": 0.8967, "step": 66 },
    { "epoch": 0.034936774866379874, "grad_norm": 2.1430623531341553, "learning_rate": 2.01e-05, "loss": 1.025, "step": 67 },
    { "epoch": 0.03545821926737062, "grad_norm": 2.3148794174194336, "learning_rate": 2.04e-05, "loss": 1.0084, "step": 68 },
    { "epoch": 0.03597966366836136, "grad_norm": 1.9496533870697021, "learning_rate": 2.0700000000000002e-05, "loss": 0.9664, "step": 69 },
    { "epoch": 0.036501108069352105, "grad_norm": 2.164203405380249, "learning_rate": 2.1000000000000002e-05, "loss": 1.0243, "step": 70 },
    { "epoch": 0.03702255247034285, "grad_norm": 2.3370673656463623, "learning_rate": 2.1300000000000003e-05, "loss": 0.9201, "step": 71 },
    { "epoch": 0.03754399687133359, "grad_norm": 2.2651474475860596, "learning_rate": 2.16e-05, "loss": 1.0019, "step": 72 },
    { "epoch": 0.038065441272324335, "grad_norm": 2.0069406032562256, "learning_rate": 2.1900000000000004e-05, "loss": 0.8822, "step": 73 },
    { "epoch": 0.038586885673315086, "grad_norm": 1.717893123626709, "learning_rate": 2.22e-05, "loss": 0.8106, "step": 74 },
    { "epoch": 0.03910833007430583, "grad_norm": 1.975976586341858, "learning_rate": 2.25e-05, "loss": 0.8286, "step": 75 },
    { "epoch": 0.03962977447529657, "grad_norm": 1.7399530410766602, "learning_rate": 2.2800000000000002e-05, "loss": 0.7989, "step": 76 },
    { "epoch": 0.040151218876287316, "grad_norm": 1.8633719682693481, "learning_rate": 2.31e-05, "loss": 0.8008, "step": 77 },
    { "epoch": 0.04067266327727806, "grad_norm": 1.90866219997406, "learning_rate": 2.3400000000000003e-05, "loss": 0.9197, "step": 78 },
    { "epoch": 0.0411941076782688, "grad_norm": 2.1896774768829346, "learning_rate": 2.37e-05, "loss": 1.1467, "step": 79 },
    { "epoch": 0.041715552079259546, "grad_norm": 1.8817816972732544, "learning_rate": 2.4e-05, "loss": 0.8939, "step": 80 },
    { "epoch": 0.0422369964802503, "grad_norm": 2.845698118209839, "learning_rate": 2.4300000000000005e-05, "loss": 1.0723, "step": 81 },
    { "epoch": 0.04275844088124104, "grad_norm": 2.0808112621307373, "learning_rate": 2.46e-05, "loss": 0.8704, "step": 82 },
    { "epoch": 0.043279885282231784, "grad_norm": 1.8603535890579224, "learning_rate": 2.4900000000000002e-05, "loss": 0.9147, "step": 83 },
    { "epoch": 0.04380132968322253, "grad_norm": 2.1939947605133057, "learning_rate": 2.5200000000000003e-05, "loss": 0.9729, "step": 84 },
    { "epoch": 0.04432277408421327, "grad_norm": 2.237332344055176, "learning_rate": 2.55e-05, "loss": 0.8685, "step": 85 },
    { "epoch": 0.044844218485204014, "grad_norm": 1.8912482261657715, "learning_rate": 2.5800000000000004e-05, "loss": 0.9319, "step": 86 },
    { "epoch": 0.04536566288619476, "grad_norm": 2.4605658054351807, "learning_rate": 2.61e-05, "loss": 0.9852, "step": 87 },
    { "epoch": 0.0458871072871855, "grad_norm": 2.3050081729888916, "learning_rate": 2.64e-05, "loss": 0.9648, "step": 88 },
    { "epoch": 0.04640855168817625, "grad_norm": 2.2562782764434814, "learning_rate": 2.6700000000000005e-05, "loss": 0.9906, "step": 89 },
    { "epoch": 0.046929996089166995, "grad_norm": 2.136019468307495, "learning_rate": 2.7000000000000002e-05, "loss": 0.9215, "step": 90 },
    { "epoch": 0.04745144049015774, "grad_norm": 2.065573215484619, "learning_rate": 2.7300000000000003e-05, "loss": 1.0027, "step": 91 },
    { "epoch": 0.04797288489114848, "grad_norm": 2.3429982662200928, "learning_rate": 2.76e-05, "loss": 1.0135, "step": 92 },
    { "epoch": 0.048494329292139225, "grad_norm": 2.2213521003723145, "learning_rate": 2.79e-05, "loss": 0.9772, "step": 93 },
    { "epoch": 0.04901577369312997, "grad_norm": 2.0035593509674072, "learning_rate": 2.8200000000000004e-05, "loss": 0.8933, "step": 94 },
    { "epoch": 0.04953721809412071, "grad_norm": 2.8717164993286133, "learning_rate": 2.85e-05, "loss": 1.164, "step": 95 },
    { "epoch": 0.050058662495111456, "grad_norm": 2.441497325897217, "learning_rate": 2.8800000000000002e-05, "loss": 1.0731, "step": 96 },
    { "epoch": 0.050580106896102206, "grad_norm": 2.3798351287841797, "learning_rate": 2.91e-05, "loss": 1.0519, "step": 97 },
    { "epoch": 0.05110155129709295, "grad_norm": 2.8075406551361084, "learning_rate": 2.94e-05, "loss": 1.1309, "step": 98 },
    { "epoch": 0.05162299569808369, "grad_norm": 3.4404821395874023, "learning_rate": 2.9700000000000004e-05, "loss": 1.1844, "step": 99 },
    { "epoch": 0.052144440099074436, "grad_norm": 3.207118272781372, "learning_rate": 3e-05, "loss": 1.0556, "step": 100 },
    { "epoch": 0.05266588450006518, "grad_norm": 1.896147608757019, "learning_rate": 3.03e-05, "loss": 0.5913, "step": 101 },
    { "epoch": 0.05318732890105592, "grad_norm": 1.304902195930481, "learning_rate": 3.0600000000000005e-05, "loss": 0.4833, "step": 102 },
    { "epoch": 0.05370877330204667, "grad_norm": 0.9159135222434998, "learning_rate": 3.09e-05, "loss": 0.2707, "step": 103 },
    { "epoch": 0.05423021770303741, "grad_norm": 0.8160455822944641, "learning_rate": 3.1200000000000006e-05, "loss": 0.252, "step": 104 },
    { "epoch": 0.05475166210402816, "grad_norm": 0.8945296406745911, "learning_rate": 3.15e-05, "loss": 0.3627, "step": 105 },
    { "epoch": 0.055273106505018904, "grad_norm": 1.5228551626205444, "learning_rate": 3.18e-05, "loss": 0.5862, "step": 106 },
    { "epoch": 0.05579455090600965, "grad_norm": 1.3707317113876343, "learning_rate": 3.21e-05, "loss": 0.5722, "step": 107 },
    { "epoch": 0.05631599530700039, "grad_norm": 1.5994707345962524, "learning_rate": 3.24e-05, "loss": 0.7169, "step": 108 },
    { "epoch": 0.056837439707991134, "grad_norm": 1.4748954772949219, "learning_rate": 3.27e-05, "loss": 0.6509, "step": 109 },
    { "epoch": 0.05735888410898188, "grad_norm": 1.50614595413208, "learning_rate": 3.3e-05, "loss": 0.7559, "step": 110 },
    { "epoch": 0.05788032850997262, "grad_norm": 1.55193293094635, "learning_rate": 3.33e-05, "loss": 0.824, "step": 111 },
    { "epoch": 0.05840177291096337, "grad_norm": 1.3997102975845337, "learning_rate": 3.3600000000000004e-05, "loss": 0.6819, "step": 112 },
    { "epoch": 0.058923217311954115, "grad_norm": 1.7437316179275513, "learning_rate": 3.39e-05, "loss": 0.7038, "step": 113 },
    { "epoch": 0.05944466171294486, "grad_norm": 1.4148025512695312, "learning_rate": 3.4200000000000005e-05, "loss": 0.7393, "step": 114 },
    { "epoch": 0.0599661061139356, "grad_norm": 1.8449006080627441, "learning_rate": 3.4500000000000005e-05, "loss": 0.834, "step": 115 },
    { "epoch": 0.060487550514926346, "grad_norm": 2.5175585746765137, "learning_rate": 3.48e-05, "loss": 0.689, "step": 116 },
    { "epoch": 0.06100899491591709, "grad_norm": 1.7341601848602295, "learning_rate": 3.5100000000000006e-05, "loss": 0.767, "step": 117 },
    { "epoch": 0.06153043931690783, "grad_norm": 1.4200981855392456, "learning_rate": 3.54e-05, "loss": 0.6371, "step": 118 },
    { "epoch": 0.062051883717898576, "grad_norm": 1.6563234329223633, "learning_rate": 3.57e-05, "loss": 0.7777, "step": 119 },
    { "epoch": 0.06257332811888933, "grad_norm": 1.6099658012390137, "learning_rate": 3.6e-05, "loss": 0.6769, "step": 120 },
    { "epoch": 0.06309477251988006, "grad_norm": 1.732650637626648, "learning_rate": 3.63e-05, "loss": 0.9144, "step": 121 },
    { "epoch": 0.06361621692087081, "grad_norm": 1.4773802757263184, "learning_rate": 3.66e-05, "loss": 0.6909, "step": 122 },
    { "epoch": 0.06413766132186155, "grad_norm": 1.7683815956115723, "learning_rate": 3.69e-05, "loss": 0.8182, "step": 123 },
    { "epoch": 0.0646591057228523, "grad_norm": 1.900134563446045, "learning_rate": 3.72e-05, "loss": 0.834, "step": 124 },
    { "epoch": 0.06518055012384305, "grad_norm": 2.2528598308563232, "learning_rate": 3.7500000000000003e-05, "loss": 0.9006, "step": 125 },
    { "epoch": 0.06570199452483379, "grad_norm": 1.7022476196289062, "learning_rate": 3.7800000000000004e-05, "loss": 0.763, "step": 126 },
    { "epoch": 0.06622343892582454, "grad_norm": 2.2806150913238525, "learning_rate": 3.8100000000000005e-05, "loss": 0.968, "step": 127 },
    { "epoch": 0.06674488332681527, "grad_norm": 1.5035136938095093, "learning_rate": 3.8400000000000005e-05, "loss": 0.7268, "step": 128 },
    { "epoch": 0.06726632772780602, "grad_norm": 1.94576096534729, "learning_rate": 3.87e-05, "loss": 0.7615, "step": 129 },
    { "epoch": 0.06778777212879676, "grad_norm": 1.336945652961731, "learning_rate": 3.9000000000000006e-05, "loss": 0.6875, "step": 130 },
    { "epoch": 0.06830921652978751, "grad_norm": 1.500450849533081, "learning_rate": 3.93e-05, "loss": 0.7817, "step": 131 },
    { "epoch": 0.06883066093077826, "grad_norm": 1.852668046951294, "learning_rate": 3.96e-05, "loss": 0.8267, "step": 132 },
    { "epoch": 0.069352105331769, "grad_norm": 1.9518615007400513, "learning_rate": 3.990000000000001e-05, "loss": 0.897, "step": 133 },
    { "epoch": 0.06987354973275975, "grad_norm": 1.9608622789382935, "learning_rate": 4.02e-05, "loss": 0.956, "step": 134 },
    { "epoch": 0.07039499413375049, "grad_norm": 1.6532974243164062, "learning_rate": 4.05e-05, "loss": 0.8116, "step": 135 },
    { "epoch": 0.07091643853474124, "grad_norm": 1.9795348644256592, "learning_rate": 4.08e-05, "loss": 0.9266, "step": 136 },
    { "epoch": 0.07143788293573197, "grad_norm": 1.6830356121063232, "learning_rate": 4.11e-05, "loss": 0.7564, "step": 137 },
    { "epoch": 0.07195932733672272, "grad_norm": 2.111184597015381, "learning_rate": 4.1400000000000003e-05, "loss": 0.9741, "step": 138 },
    { "epoch": 0.07248077173771347, "grad_norm": 2.1398720741271973, "learning_rate": 4.1700000000000004e-05, "loss": 0.9081, "step": 139 },
    { "epoch": 0.07300221613870421, "grad_norm": 1.9623899459838867, "learning_rate": 4.2000000000000004e-05, "loss": 0.8844, "step": 140 },
    { "epoch": 0.07352366053969496, "grad_norm": 2.0776987075805664, "learning_rate": 4.23e-05, "loss": 0.9306, "step": 141 },
    { "epoch": 0.0740451049406857, "grad_norm": 2.0103209018707275, "learning_rate": 4.2600000000000005e-05, "loss": 0.8967, "step": 142 },
    { "epoch": 0.07456654934167645, "grad_norm": 1.9392935037612915, "learning_rate": 4.2900000000000006e-05, "loss": 0.8938, "step": 143 },
    { "epoch": 0.07508799374266718, "grad_norm": 2.041646957397461, "learning_rate": 4.32e-05, "loss": 0.9007, "step": 144 },
    { "epoch": 0.07560943814365793, "grad_norm": 2.416090726852417, "learning_rate": 4.35e-05, "loss": 1.096, "step": 145 },
    { "epoch": 0.07613088254464867, "grad_norm": 2.0825910568237305, "learning_rate": 4.380000000000001e-05, "loss": 1.0315, "step": 146 },
    { "epoch": 0.07665232694563942, "grad_norm": 3.348813533782959, "learning_rate": 4.41e-05, "loss": 0.9972, "step": 147 },
    { "epoch": 0.07717377134663017, "grad_norm": 3.285196304321289, "learning_rate": 4.44e-05, "loss": 1.051, "step": 148 },
    { "epoch": 0.07769521574762091, "grad_norm": 2.2454581260681152, "learning_rate": 4.47e-05, "loss": 0.9835, "step": 149 },
    { "epoch": 0.07821666014861166, "grad_norm": 2.6450259685516357, "learning_rate": 4.5e-05, "loss": 1.0683, "step": 150 },
    { "epoch": 0.07821666014861166, "eval_loss": 0.798732578754425, "eval_runtime": 326.8195, "eval_samples_per_second": 19.766, "eval_steps_per_second": 4.942, "step": 150 },
    { "epoch": 0.0787381045496024, "grad_norm": 1.49418306350708, "learning_rate": 4.5299999999999997e-05, "loss": 0.4889, "step": 151 },
    { "epoch": 0.07925954895059314, "grad_norm": 1.3698493242263794, "learning_rate": 4.5600000000000004e-05, "loss": 0.5986, "step": 152 },
    { "epoch": 0.07978099335158388, "grad_norm": 0.803355872631073, "learning_rate": 4.5900000000000004e-05, "loss": 0.2633, "step": 153 },
    { "epoch": 0.08030243775257463, "grad_norm": 0.6260421276092529, "learning_rate": 4.62e-05, "loss": 0.242, "step": 154 },
    { "epoch": 0.08082388215356538, "grad_norm": 0.6748026013374329, "learning_rate": 4.6500000000000005e-05, "loss": 0.3132, "step": 155 },
    { "epoch": 0.08134532655455612, "grad_norm": 1.0539573431015015, "learning_rate": 4.6800000000000006e-05, "loss": 0.4188, "step": 156 },
    { "epoch": 0.08186677095554687, "grad_norm": 1.2083957195281982, "learning_rate": 4.71e-05, "loss": 0.4227, "step": 157 },
    { "epoch": 0.0823882153565376, "grad_norm": 1.3534812927246094, "learning_rate": 4.74e-05, "loss": 0.7787, "step": 158 },
    { "epoch": 0.08290965975752836, "grad_norm": 1.4358162879943848, "learning_rate": 4.770000000000001e-05, "loss": 0.6399, "step": 159 },
    { "epoch": 0.08343110415851909, "grad_norm": 1.3026161193847656, "learning_rate": 4.8e-05, "loss": 0.7389, "step": 160 },
    { "epoch": 0.08395254855950984, "grad_norm": 1.1461225748062134, "learning_rate": 4.83e-05, "loss": 0.6116, "step": 161 },
    { "epoch": 0.0844739929605006, "grad_norm": 1.6268322467803955, "learning_rate": 4.860000000000001e-05, "loss": 0.873, "step": 162 },
    { "epoch": 0.08499543736149133, "grad_norm": 1.3255441188812256, "learning_rate": 4.89e-05, "loss": 0.6859, "step": 163 },
    { "epoch": 0.08551688176248208, "grad_norm": 1.3837733268737793, "learning_rate": 4.92e-05, "loss": 0.7706, "step": 164 },
    { "epoch": 0.08603832616347282, "grad_norm": 1.6087336540222168, "learning_rate": 4.9500000000000004e-05, "loss": 0.757, "step": 165 },
    { "epoch": 0.08655977056446357, "grad_norm": 1.297447681427002, "learning_rate": 4.9800000000000004e-05, "loss": 0.6959, "step": 166 },
    { "epoch": 0.0870812149654543, "grad_norm": 1.6905202865600586, "learning_rate": 5.01e-05, "loss": 0.7988, "step": 167 },
    { "epoch": 0.08760265936644505, "grad_norm": 1.275194764137268, "learning_rate": 5.0400000000000005e-05, "loss": 0.6706, "step": 168 },
    { "epoch": 0.08812410376743579, "grad_norm": 1.92887282371521, "learning_rate": 5.0700000000000006e-05, "loss": 0.9816, "step": 169 },
    { "epoch": 0.08864554816842654, "grad_norm": 1.59506356716156, "learning_rate": 5.1e-05, "loss": 0.7933, "step": 170 },
    { "epoch": 0.08916699256941729, "grad_norm": 1.4300225973129272, "learning_rate": 5.13e-05, "loss": 0.672, "step": 171 },
    { "epoch": 0.08968843697040803, "grad_norm": 1.4769190549850464, "learning_rate": 5.160000000000001e-05, "loss": 0.7425, "step": 172 },
    { "epoch": 0.09020988137139878, "grad_norm": 1.579256296157837, "learning_rate": 5.19e-05, "loss": 0.8161, "step": 173 },
    { "epoch": 0.09073132577238952, "grad_norm": 1.493124008178711, "learning_rate": 5.22e-05, "loss": 0.7736, "step": 174 },
    { "epoch": 0.09125277017338027, "grad_norm": 1.3196746110916138, "learning_rate": 5.250000000000001e-05, "loss": 0.7727, "step": 175 },
    { "epoch": 0.091774214574371, "grad_norm": 1.5161240100860596, "learning_rate": 5.28e-05, "loss": 0.8462, "step": 176 },
    { "epoch": 0.09229565897536175, "grad_norm": 1.5585005283355713, "learning_rate": 5.31e-05, "loss": 0.7596, "step": 177 },
    { "epoch": 0.0928171033763525, "grad_norm": 1.3152117729187012, "learning_rate": 5.340000000000001e-05, "loss": 0.6694, "step": 178 },
    { "epoch": 0.09333854777734324, "grad_norm": 1.369708776473999, "learning_rate": 5.3700000000000004e-05, "loss": 0.744, "step": 179 },
    { "epoch": 0.09385999217833399, "grad_norm": 1.481123924255371, "learning_rate": 5.4000000000000005e-05, "loss": 0.8273, "step": 180 },
    { "epoch": 0.09438143657932473, "grad_norm": 1.6673870086669922, "learning_rate": 5.4300000000000005e-05, "loss": 0.9154, "step": 181 },
    { "epoch": 0.09490288098031548, "grad_norm": 1.506094217300415, "learning_rate": 5.4600000000000006e-05, "loss": 0.7489, "step": 182 },
    { "epoch": 0.09542432538130621, "grad_norm": 1.2892239093780518, "learning_rate": 5.49e-05, "loss": 0.6577, "step": 183 },
    { "epoch": 0.09594576978229696, "grad_norm": 1.4567745923995972, "learning_rate": 5.52e-05, "loss": 0.6866, "step": 184 },
    { "epoch": 0.0964672141832877, "grad_norm": 1.7683115005493164, "learning_rate": 5.550000000000001e-05, "loss": 0.8814, "step": 185 },
    { "epoch": 0.09698865858427845, "grad_norm": 1.5696678161621094, "learning_rate": 5.58e-05, "loss": 0.7922, "step": 186 },
    { "epoch": 0.0975101029852692, "grad_norm": 1.6422466039657593, "learning_rate": 5.61e-05, "loss": 0.856, "step": 187 },
    { "epoch": 0.09803154738625994, "grad_norm": 1.8678178787231445, "learning_rate": 5.640000000000001e-05, "loss": 0.9425, "step": 188 },
    { "epoch": 0.09855299178725069, "grad_norm": 1.8227742910385132, "learning_rate": 5.67e-05, "loss": 0.9673, "step": 189 },
    { "epoch": 0.09907443618824142, "grad_norm": 2.0066490173339844, "learning_rate": 5.7e-05, "loss": 0.8352, "step": 190 },
    { "epoch": 0.09959588058923217, "grad_norm": 2.3402273654937744, "learning_rate": 5.730000000000001e-05, "loss": 1.0273, "step": 191 },
    { "epoch": 0.10011732499022291, "grad_norm": 1.6430705785751343, "learning_rate": 5.7600000000000004e-05, "loss": 0.9703, "step": 192 },
    { "epoch": 0.10063876939121366, "grad_norm": 1.6317139863967896, "learning_rate": 5.7900000000000005e-05, "loss": 0.9602, "step": 193 },
    { "epoch": 0.10116021379220441, "grad_norm": 2.0906260013580322, "learning_rate": 5.82e-05, "loss": 1.0714, "step": 194 },
    { "epoch": 0.10168165819319515, "grad_norm": 1.8493003845214844, "learning_rate": 5.8500000000000006e-05, "loss": 0.8996, "step": 195 },
    { "epoch": 0.1022031025941859, "grad_norm": 1.9314156770706177, "learning_rate": 5.88e-05, "loss": 0.8453, "step": 196 },
    { "epoch": 0.10272454699517664, "grad_norm": 2.586416244506836, "learning_rate": 5.91e-05, "loss": 0.9768, "step": 197 },
    { "epoch": 0.10324599139616739, "grad_norm": 2.3081066608428955, "learning_rate": 5.940000000000001e-05, "loss": 1.1049, "step": 198 },
    { "epoch": 0.10376743579715812, "grad_norm": 1.8214553594589233, "learning_rate": 5.97e-05, "loss": 0.9555, "step": 199 },
    { "epoch": 0.10428888019814887, "grad_norm": 2.6920111179351807, "learning_rate": 6e-05, "loss": 1.1088, "step": 200 },
    { "epoch": 0.10481032459913962, "grad_norm": 1.3093756437301636, "learning_rate": 6.030000000000001e-05, "loss": 0.4705, "step": 201 },
    { "epoch": 0.10533176900013036, "grad_norm": 1.7705165147781372, "learning_rate": 6.06e-05, "loss": 0.701, "step": 202 },
    { "epoch": 0.10585321340112111, "grad_norm": 0.6416582465171814, "learning_rate": 6.09e-05, "loss": 0.3082, "step": 203 },
    { "epoch": 0.10637465780211185, "grad_norm": 0.7174233794212341, "learning_rate": 6.120000000000001e-05, "loss": 0.2373, "step": 204 },
    { "epoch": 0.1068961022031026, "grad_norm": 0.8111220002174377, "learning_rate": 6.15e-05, "loss": 0.2997, "step": 205 },
    { "epoch": 0.10741754660409333, "grad_norm": 0.9542537331581116, "learning_rate": 6.18e-05, "loss": 0.4703, "step": 206 },
    { "epoch": 0.10793899100508408, "grad_norm": 1.1576972007751465, "learning_rate": 6.21e-05, "loss": 0.5591, "step": 207 },
    { "epoch": 0.10846043540607482, "grad_norm": 1.1840004920959473, "learning_rate": 6.240000000000001e-05, "loss": 0.6656, "step": 208 },
    { "epoch": 0.10898187980706557, "grad_norm": 1.0436705350875854, "learning_rate": 6.27e-05, "loss": 0.5518, "step": 209 },
    { "epoch": 0.10950332420805632, "grad_norm": 1.2548532485961914, "learning_rate": 6.3e-05, "loss": 0.6519, "step": 210 },
    { "epoch": 0.11002476860904706, "grad_norm": 1.2066162824630737, "learning_rate": 6.330000000000001e-05, "loss": 0.6456, "step": 211 },
    { "epoch": 0.11054621301003781, "grad_norm": 1.4366339445114136, "learning_rate": 6.36e-05, "loss": 0.6932, "step": 212 },
    { "epoch": 0.11106765741102854, "grad_norm": 1.3649702072143555, "learning_rate": 6.39e-05, "loss": 0.8679, "step": 213 },
    { "epoch": 0.1115891018120193, "grad_norm": 1.5654956102371216, "learning_rate": 6.42e-05, "loss": 0.756, "step": 214 },
    { "epoch": 0.11211054621301003, "grad_norm": 1.8356409072875977, "learning_rate": 6.450000000000001e-05, "loss": 0.812, "step": 215 },
    { "epoch": 0.11263199061400078, "grad_norm": 1.1078040599822998, "learning_rate": 6.48e-05, "loss": 0.6616, "step": 216 },
    { "epoch": 0.11315343501499153, "grad_norm": 1.1788630485534668, "learning_rate": 6.510000000000001e-05, "loss": 0.6678, "step": 217 },
    { "epoch": 0.11367487941598227, "grad_norm": 1.3031374216079712, "learning_rate": 6.54e-05, "loss": 0.8129, "step": 218 },
    { "epoch": 0.11419632381697302, "grad_norm": 1.143462896347046, "learning_rate": 6.57e-05, "loss": 0.6625, "step": 219 },
    { "epoch": 0.11471776821796376, "grad_norm": 1.7338825464248657, "learning_rate": 6.6e-05, "loss": 0.6778, "step": 220 },
    { "epoch": 0.1152392126189545, "grad_norm": 1.1185352802276611, "learning_rate": 6.630000000000001e-05, "loss": 0.6101, "step": 221 },
    { "epoch": 0.11576065701994524, "grad_norm": 1.209920883178711, "learning_rate": 6.66e-05, "loss": 0.655, "step": 222 },
    { "epoch": 0.116282101420936, "grad_norm": 1.3095269203186035, "learning_rate": 6.69e-05, "loss": 0.7194, "step": 223 },
    { "epoch": 0.11680354582192674, "grad_norm": 1.419487476348877, "learning_rate": 6.720000000000001e-05, "loss": 0.7749, "step": 224 },
    { "epoch": 0.11732499022291748, "grad_norm": 1.5805034637451172, "learning_rate": 6.75e-05, "loss": 0.8271, "step": 225 },
    { "epoch": 0.11784643462390823, "grad_norm": 1.448123812675476, "learning_rate": 6.78e-05, "loss": 0.821, "step": 226 },
    { "epoch": 0.11836787902489897, "grad_norm": 1.3406933546066284, "learning_rate": 6.81e-05, "loss": 0.7887, "step": 227 },
    { "epoch": 0.11888932342588972, "grad_norm": 1.478799819946289, "learning_rate": 6.840000000000001e-05, "loss": 0.8718, "step": 228 },
    { "epoch": 0.11941076782688045, "grad_norm": 1.4787107706069946, "learning_rate": 6.87e-05, "loss": 0.7741, "step": 229 },
    { "epoch": 0.1199322122278712, "grad_norm": 1.3887667655944824, "learning_rate": 6.900000000000001e-05, "loss": 0.7922, "step": 230 },
    { "epoch": 0.12045365662886194, "grad_norm": 1.1378411054611206, "learning_rate": 6.93e-05, "loss": 0.7046, "step": 231 },
    { "epoch": 0.12097510102985269, "grad_norm": 1.34485924243927, "learning_rate": 6.96e-05, "loss": 0.7417, "step": 232 },
    { "epoch": 0.12149654543084344, "grad_norm": 1.3890823125839233, "learning_rate": 6.99e-05, "loss": 0.7552, "step": 233 },
    { "epoch": 0.12201798983183418, "grad_norm": 1.3711472749710083, "learning_rate": 7.020000000000001e-05, "loss": 0.6796, "step": 234 },
    { "epoch": 0.12253943423282493, "grad_norm": 1.1514718532562256, "learning_rate": 7.05e-05, "loss": 0.6841, "step": 235 },
    { "epoch": 0.12306087863381567, "grad_norm": 1.24899423122406, "learning_rate": 7.08e-05, "loss": 0.6578, "step": 236 },
    { "epoch": 0.12358232303480642, "grad_norm": 1.4886633157730103, "learning_rate": 7.110000000000001e-05, "loss": 0.753, "step": 237 },
    { "epoch": 0.12410376743579715, "grad_norm": 1.5798087120056152, "learning_rate": 7.14e-05, "loss": 0.9315, "step": 238 },
    { "epoch": 0.1246252118367879, "grad_norm": 1.6789343357086182, "learning_rate": 7.170000000000001e-05, "loss": 0.9131, "step": 239 },
    { "epoch": 0.12514665623777865, "grad_norm": 1.7360730171203613, "learning_rate": 7.2e-05, "loss": 0.7529, "step": 240 },
    { "epoch": 0.1256681006387694, "grad_norm": 1.584789752960205, "learning_rate": 7.230000000000001e-05, "loss": 0.8746, "step": 241 },
    { "epoch": 0.12618954503976013, "grad_norm": 1.3675271272659302, "learning_rate": 7.26e-05, "loss": 0.8015, "step": 242 },
    { "epoch": 0.12671098944075088, "grad_norm": 1.3785395622253418, "learning_rate": 7.290000000000001e-05, "loss": 0.8389, "step": 243 },
    { "epoch": 0.12723243384174163, "grad_norm": 1.5574450492858887, "learning_rate": 7.32e-05, "loss": 0.9246, "step": 244 },
    { "epoch": 0.12775387824273238, "grad_norm": 1.614886999130249, "learning_rate": 7.35e-05, "loss": 0.919, "step": 245 },
    { "epoch": 0.1282753226437231, "grad_norm": 2.001286506652832, "learning_rate": 7.38e-05, "loss": 1.0433, "step": 246 },
    { "epoch": 0.12879676704471385, "grad_norm": 1.5525065660476685, "learning_rate": 7.410000000000001e-05, "loss": 0.8605, "step": 247 },
    { "epoch": 0.1293182114457046, "grad_norm": 2.074171304702759, "learning_rate": 7.44e-05, "loss": 1.0522, "step": 248 },
    { "epoch": 0.12983965584669535, "grad_norm": 1.8862979412078857, "learning_rate": 7.47e-05, "loss": 1.0324, "step": 249 },
    { "epoch": 0.1303611002476861, "grad_norm": 2.677232265472412, "learning_rate": 7.500000000000001e-05, "loss": 0.9479, "step": 250 },
    { "epoch": 0.13088254464867682, "grad_norm": 0.9016095995903015, "learning_rate": 7.53e-05, "loss": 0.4123, "step": 251 },
    { "epoch": 0.13140398904966757, "grad_norm": 1.1711385250091553, "learning_rate": 7.560000000000001e-05, "loss": 0.611, "step": 252 },
    { "epoch": 0.13192543345065832, "grad_norm": 0.638198971748352, "learning_rate": 7.590000000000002e-05, "loss": 0.3055, "step": 253 },
    { "epoch": 0.13244687785164908, "grad_norm": 0.5108641982078552, "learning_rate": 7.620000000000001e-05, "loss": 0.2203, "step": 254 },
    { "epoch": 0.13296832225263983, "grad_norm": 0.6586244106292725, "learning_rate": 7.65e-05, "loss": 0.2306, "step": 255 },
    { "epoch": 0.13348976665363055, "grad_norm": 0.6634657382965088, "learning_rate": 7.680000000000001e-05, "loss": 0.2932, "step": 256 },
    { "epoch": 0.1340112110546213, "grad_norm": 0.8705341219902039, "learning_rate": 7.71e-05, "loss": 0.4632, "step": 257 },
    { "epoch": 0.13453265545561205, "grad_norm": 0.8588047027587891, "learning_rate": 7.74e-05, "loss": 0.5355, "step": 258 },
    { "epoch": 0.1350540998566028, "grad_norm": 0.929277241230011, "learning_rate": 7.77e-05, "loss": 0.5483, "step": 259 },
    { "epoch": 0.13557554425759352, "grad_norm": 0.911803126335144, "learning_rate": 7.800000000000001e-05, "loss": 0.4796, "step": 260 },
    { "epoch": 0.13609698865858427, "grad_norm": 1.0001978874206543, "learning_rate": 7.83e-05, "loss": 0.5725, "step": 261 },
    { "epoch": 0.13661843305957502, "grad_norm": 1.283967137336731, "learning_rate": 7.86e-05, "loss": 0.7197, "step": 262 },
    { "epoch": 0.13713987746056577, "grad_norm": 1.0027309656143188, "learning_rate": 7.890000000000001e-05, "loss": 0.494, "step": 263 },
    { "epoch": 0.13766132186155652, "grad_norm": 1.1058956384658813, "learning_rate": 7.92e-05, "loss": 0.7252, "step": 264 },
    { "epoch": 0.13818276626254725, "grad_norm": 1.6565499305725098, "learning_rate": 7.950000000000001e-05, "loss": 0.6628, "step": 265 },
    { "epoch": 0.138704210663538, "grad_norm": 1.349332571029663, "learning_rate": 7.980000000000002e-05, "loss": 0.7285, "step": 266 },
    { "epoch": 0.13922565506452875, "grad_norm": 1.1405296325683594, "learning_rate": 8.010000000000001e-05, "loss": 0.6503, "step": 267 },
    { "epoch": 0.1397470994655195, "grad_norm": 1.5732964277267456, "learning_rate": 8.04e-05, "loss": 0.7544, "step": 268 },
    { "epoch": 0.14026854386651022, "grad_norm": 1.1660890579223633, "learning_rate": 8.07e-05, "loss": 0.6631, "step": 269 },
    { "epoch": 0.14078998826750097, "grad_norm": 1.1596927642822266, "learning_rate": 8.1e-05, "loss": 0.6831, "step": 270 },
    { "epoch": 0.14131143266849172, "grad_norm": 1.1583274602890015, "learning_rate": 8.13e-05, "loss": 0.6867, "step": 271 },
    { "epoch": 0.14183287706948247, "grad_norm": 1.351112723350525, "learning_rate": 8.16e-05, "loss": 0.8328, "step": 272 },
    { "epoch": 0.14235432147047322, "grad_norm": 1.3185946941375732, "learning_rate": 8.190000000000001e-05, "loss": 0.6922, "step": 273 },
    { "epoch": 0.14287576587146394, "grad_norm": 2.4567134380340576, "learning_rate": 8.22e-05, "loss": 0.8027, "step": 274 },
    { "epoch": 0.1433972102724547, "grad_norm": 1.0922107696533203, "learning_rate": 8.25e-05, "loss": 0.6832, "step": 275 },
    { "epoch": 0.14391865467344545, "grad_norm": 1.436450719833374, "learning_rate": 8.280000000000001e-05, "loss": 0.9163, "step": 276 },
    { "epoch": 0.1444400990744362, "grad_norm": 1.120586633682251, "learning_rate": 8.31e-05, "loss": 0.5632, "step": 277 },
    { "epoch": 0.14496154347542695, "grad_norm": 1.3133465051651, "learning_rate": 8.340000000000001e-05, "loss": 0.7018, "step": 278 },
    { "epoch": 0.14548298787641767, "grad_norm": 1.1073719263076782, "learning_rate": 8.370000000000002e-05, "loss": 0.7195, "step": 279 },
    { "epoch": 0.14600443227740842, "grad_norm": 1.1922098398208618, "learning_rate": 8.400000000000001e-05, "loss": 0.7307, "step": 280 },
    { "epoch": 0.14652587667839917, "grad_norm": 1.2493211030960083, "learning_rate": 8.43e-05, "loss": 0.9021, "step": 281 },
    { "epoch": 0.14704732107938992, "grad_norm": 1.2321152687072754, "learning_rate": 8.46e-05, "loss": 0.7489, "step": 282 },
    { "epoch": 0.14756876548038064, "grad_norm": 1.1605387926101685, "learning_rate": 8.49e-05, "loss": 0.6323, "step": 283 },
    { "epoch": 0.1480902098813714, "grad_norm": 1.3138997554779053, "learning_rate": 8.520000000000001e-05, "loss": 0.6877, "step": 284 },
    { "epoch": 0.14861165428236214, "grad_norm": 1.4798433780670166, "learning_rate": 8.55e-05, "loss": 0.9398, "step": 285 },
    { "epoch": 0.1491330986833529, "grad_norm": 1.557124376296997, "learning_rate": 8.580000000000001e-05, "loss": 0.8886, "step": 286 },
    { "epoch": 0.14965454308434364, "grad_norm": 1.216688871383667, "learning_rate": 8.61e-05, "loss": 0.77, "step": 287 },
    { "epoch": 0.15017598748533437, "grad_norm": 1.1777106523513794, "learning_rate": 8.64e-05, "loss": 0.7147, "step": 288 },
    { "epoch": 0.15069743188632512, "grad_norm": 1.3893619775772095, "learning_rate": 8.67e-05, "loss": 0.8269, "step": 289 },
    { "epoch": 0.15121887628731587, "grad_norm": 1.4378941059112549, "learning_rate": 8.7e-05, "loss": 0.8203, "step": 290 },
    { "epoch": 0.15174032068830662, "grad_norm": 1.389863133430481, "learning_rate": 8.730000000000001e-05, "loss": 0.8993, "step": 291 },
    { "epoch": 0.15226176508929734, "grad_norm": 1.3715370893478394, "learning_rate": 8.760000000000002e-05, "loss": 0.8406, "step": 292 },
    { "epoch": 0.1527832094902881, "grad_norm": 1.3929187059402466, "learning_rate": 8.790000000000001e-05, "loss": 0.785, "step": 293 },
    { "epoch": 0.15330465389127884, "grad_norm": 1.5916658639907837, "learning_rate": 8.82e-05, "loss": 0.9503, "step": 294 },
    { "epoch": 0.1538260982922696, "grad_norm": 1.4390002489089966, "learning_rate": 8.85e-05, "loss": 0.8936, "step": 295 },
    { "epoch": 0.15434754269326034, "grad_norm": 1.450700044631958, "learning_rate": 8.88e-05, "loss": 0.9442, "step": 296 },
    { "epoch": 0.15486898709425106, "grad_norm": 1.267523169517517, "learning_rate": 8.910000000000001e-05, "loss": 0.8459, "step": 297 },
    { "epoch": 0.15539043149524182, "grad_norm": 1.91372811794281, "learning_rate": 8.94e-05, "loss": 0.9787, "step": 298 },
    { "epoch": 0.15591187589623257, "grad_norm": 1.8635175228118896, "learning_rate": 8.970000000000001e-05, "loss": 0.8864, "step": 299 },
    { "epoch": 0.15643332029722332, "grad_norm": 2.4946999549865723, "learning_rate": 9e-05, "loss": 1.0969, "step": 300 },
    { "epoch": 0.15643332029722332, "eval_loss": 0.7402325868606567, "eval_runtime": 326.3494, "eval_samples_per_second": 19.795, "eval_steps_per_second": 4.949, "step": 300 },
    { "epoch": 0.15695476469821407, "grad_norm": 0.9152140617370605, "learning_rate": 8.999753262144806e-05, "loss": 0.4045, "step": 301 },
    { "epoch": 0.1574762090992048, "grad_norm": 0.6910588145256042, "learning_rate": 8.999013075636805e-05, "loss": 0.3735, "step": 302 },
    { "epoch": 0.15799765350019554, "grad_norm": 0.5631867051124573, "learning_rate": 8.997779521645793e-05, "loss": 0.2266, "step": 303 },
    { "epoch": 0.1585190979011863, "grad_norm": 0.48377081751823425, "learning_rate": 8.996052735444863e-05, "loss": 0.2233, "step": 304 },
    { "epoch": 0.15904054230217704, "grad_norm": 0.6671403646469116, "learning_rate": 8.993832906395582e-05, "loss": 0.358, "step": 305 },
    { "epoch": 0.15956198670316776, "grad_norm": 0.8236808180809021, "learning_rate": 8.991120277927223e-05, "loss": 0.4056, "step": 306 },
    { "epoch": 0.1600834311041585, "grad_norm": 1.044965386390686, "learning_rate": 8.987915147510061e-05, "loss": 0.6135, "step": 307 },
    { "epoch": 0.16060487550514926, "grad_norm": 1.1093504428863525, "learning_rate": 8.98421786662277e-05, "loss": 0.675, "step": 308 },
    { "epoch": 0.16112631990614001, "grad_norm": 1.1360151767730713, "learning_rate": 8.98002884071386e-05, "loss": 0.7139, "step": 309 },
    { "epoch": 0.16164776430713076, "grad_norm": 0.9347630739212036, "learning_rate": 8.97534852915723e-05, "loss": 0.5709, "step": 310 },
    { "epoch": 0.1621692087081215, "grad_norm": 1.0603522062301636, "learning_rate": 8.970177445201784e-05, "loss": 0.6105, "step": 311 },
    { "epoch": 0.16269065310911224, "grad_norm": 1.0385960340499878, "learning_rate": 8.964516155915151e-05, "loss": 0.6678, "step": 312 },
    { "epoch": 0.163212097510103, "grad_norm": 1.2471129894256592, "learning_rate": 8.958365282121497e-05, "loss": 0.6955, "step": 313 },
    { "epoch": 0.16373354191109374, "grad_norm": 1.2143276929855347, "learning_rate": 8.951725498333449e-05, "loss": 0.6995, "step": 314 },
    { "epoch": 0.16425498631208446, "grad_norm": 1.2376823425292969, "learning_rate": 8.94459753267812e-05, "loss": 0.8036, "step": 315 },
    { "epoch": 0.1647764307130752, "grad_norm": 1.17597234249115, "learning_rate": 8.936982166817273e-05, "loss": 0.7579, "step": 316 },
    { "epoch": 0.16529787511406596, "grad_norm": 1.0826165676116943, "learning_rate": 8.928880235861588e-05, "loss": 0.6702, "step": 317 },
    { "epoch": 0.1658193195150567, "grad_norm": 1.3289791345596313, "learning_rate": 8.9202926282791e-05, "loss": 0.8136, "step": 318 },
    { "epoch": 0.16634076391604746, "grad_norm": 1.95395827293396, "learning_rate": 8.911220285797748e-05, "loss": 0.8678, "step": 319 },
    { "epoch": 0.16686220831703819, "grad_norm": 1.2291593551635742, "learning_rate": 8.901664203302126e-05, "loss": 0.7329, "step": 320 },
    { "epoch": 0.16738365271802894, "grad_norm": 1.111344337463379, "learning_rate": 8.891625428724365e-05, "loss": 0.7421, "step": 321 },
    { "epoch": 0.1679050971190197, "grad_norm": 1.2865890264511108, "learning_rate": 8.881105062929222e-05, "loss": 0.8722, "step": 322 },
    { "epoch": 0.16842654152001044, "grad_norm": 1.30595064163208, "learning_rate": 8.870104259593363e-05, "loss": 0.8715, "step": 323 },
    { "epoch": 0.1689479859210012, "grad_norm": 1.0691040754318237, "learning_rate": 8.858624225078841e-05, "loss": 0.6839, "step": 324 },
    { "epoch": 0.1694694303219919, "grad_norm": 1.2839759588241577, "learning_rate": 8.846666218300807e-05, "loss": 0.7865, "step": 325 },
    { "epoch": 0.16999087472298266, "grad_norm": 1.0126097202301025, "learning_rate": 8.834231550589462e-05, "loss": 0.6566, "step": 326 },
    { "epoch": 0.1705123191239734, "grad_norm": 1.189987301826477, "learning_rate": 8.821321585546244e-05, "loss": 0.7741, "step": 327 },
    { "epoch": 0.17103376352496416, "grad_norm": 1.129711389541626, "learning_rate": 8.807937738894302e-05, "loss": 0.6625, "step": 328 },
    { "epoch": 0.17155520792595488, "grad_norm": 1.4810699224472046, "learning_rate": 8.794081478323246e-05, "loss": 0.8111, "step": 329 },
    { "epoch": 0.17207665232694563, "grad_norm": 1.334193468093872, "learning_rate": 8.779754323328193e-05, "loss": 0.7845, "step": 330 },
    { "epoch": 0.17259809672793638, "grad_norm": 1.2970231771469116, "learning_rate": 8.764957845043137e-05, "loss": 0.7514, "step": 331 },
    { "epoch": 0.17311954112892713, "grad_norm": 1.3496140241622925, "learning_rate": 8.749693666068665e-05, "loss": 0.7859, "step": 332 },
    { "epoch": 0.17364098552991789, "grad_norm": 1.1188700199127197, "learning_rate": 8.733963460294016e-05, "loss": 0.749, "step": 333 },
    { "epoch": 0.1741624299309086, "grad_norm": 1.1531932353973389, "learning_rate": 8.717768952713513e-05, "loss": 0.7462, "step": 334 },
    { "epoch": 0.17468387433189936, "grad_norm": 1.593352198600769, "learning_rate": 8.701111919237408e-05, "loss": 0.9956, "step": 335 },
    { "epoch": 0.1752053187328901, "grad_norm": 1.2148069143295288, "learning_rate": 8.683994186497131e-05, "loss": 0.7376, "step": 336 },
    { "epoch": 0.17572676313388086, "grad_norm": 1.196800708770752, "learning_rate": 8.666417631644977e-05, "loss": 0.8357, "step": 337 },
    { "epoch": 0.17624820753487158, "grad_norm": 2.3671152591705322, "learning_rate": 8.648384182148252e-05, "loss": 0.802, "step": 338 },
    { "epoch": 0.17676965193586233, "grad_norm": 1.3539149761199951, "learning_rate": 8.629895815577916e-05, "loss": 0.7685, "step": 339 },
    { "epoch": 0.17729109633685308, "grad_norm": 1.4018131494522095, "learning_rate": 8.610954559391704e-05, "loss": 0.8006, "step": 340 },
    { "epoch": 0.17781254073784383, "grad_norm": 1.288794994354248, "learning_rate": 8.59156249071181e-05, "loss": 0.7986, "step": 341 },
    { "epoch": 0.17833398513883458, "grad_norm": 1.3666220903396606, "learning_rate": 8.571721736097089e-05, "loss": 0.8825, "step": 342 },
    { "epoch": 0.1788554295398253, "grad_norm": 1.6995041370391846, "learning_rate": 8.551434471309872e-05, "loss": 0.7511, "step": 343 },
    { "epoch": 0.17937687394081606, "grad_norm": 1.766118049621582, "learning_rate": 8.530702921077359e-05, "loss": 0.8466, "step": 344 },
    { "epoch": 0.1798983183418068, "grad_norm": 1.5338678359985352, "learning_rate": 8.509529358847655e-05, "loss": 0.8606, "step": 345 },
    { "epoch": 0.18041976274279756, "grad_norm": 1.4264774322509766, "learning_rate": 8.487916106540466e-05, "loss": 0.9154, "step": 346 },
    { "epoch": 0.1809412071437883, "grad_norm": 1.4423105716705322, "learning_rate": 8.465865534292465e-05, "loss": 0.8203, "step": 347 },
    { "epoch": 0.18146265154477903, "grad_norm": 1.6251096725463867, "learning_rate": 8.443380060197386e-05, "loss": 0.9229, "step": 348 }
| }, | |
| { | |
| "epoch": 0.18198409594576978, | |
| "grad_norm": 1.8085907697677612, | |
| "learning_rate": 8.420462150040853e-05, | |
| "loss": 0.9639, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.18250554034676053, | |
| "grad_norm": 1.7975590229034424, | |
| "learning_rate": 8.397114317029975e-05, | |
| "loss": 0.9465, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.18302698474775128, | |
| "grad_norm": 0.7071613073348999, | |
| "learning_rate": 8.373339121517747e-05, | |
| "loss": 0.3601, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.183548429148742, | |
| "grad_norm": 0.8708938360214233, | |
| "learning_rate": 8.34913917072228e-05, | |
| "loss": 0.4189, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.18406987354973275, | |
| "grad_norm": 0.4563164710998535, | |
| "learning_rate": 8.324517118440889e-05, | |
| "loss": 0.2105, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.1845913179507235, | |
| "grad_norm": 0.4798504114151001, | |
| "learning_rate": 8.299475664759068e-05, | |
| "loss": 0.1919, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.18511276235171426, | |
| "grad_norm": 0.7082318663597107, | |
| "learning_rate": 8.274017555754409e-05, | |
| "loss": 0.3147, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.185634206752705, | |
| "grad_norm": 0.7861395478248596, | |
| "learning_rate": 8.248145583195448e-05, | |
| "loss": 0.441, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.18615565115369573, | |
| "grad_norm": 0.7495299577713013, | |
| "learning_rate": 8.221862584235528e-05, | |
| "loss": 0.4194, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.18667709555468648, | |
| "grad_norm": 1.1042672395706177, | |
| "learning_rate": 8.195171441101669e-05, | |
| "loss": 0.6837, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.18719853995567723, | |
| "grad_norm": 0.9663426280021667, | |
| "learning_rate": 8.168075080778494e-05, | |
| "loss": 0.5631, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.18771998435666798, | |
| "grad_norm": 1.1327295303344727, | |
| "learning_rate": 8.140576474687264e-05, | |
| "loss": 0.7862, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1882414287576587, | |
| "grad_norm": 1.2756463289260864, | |
| "learning_rate": 8.112678638360016e-05, | |
| "loss": 0.755, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.18876287315864945, | |
| "grad_norm": 1.0837277173995972, | |
| "learning_rate": 8.084384631108883e-05, | |
| "loss": 0.6897, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.1892843175596402, | |
| "grad_norm": 1.006117343902588, | |
| "learning_rate": 8.055697555690607e-05, | |
| "loss": 0.708, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.18980576196063095, | |
| "grad_norm": 1.0935845375061035, | |
| "learning_rate": 8.02662055796628e-05, | |
| "loss": 0.6253, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.1903272063616217, | |
| "grad_norm": 1.0223681926727295, | |
| "learning_rate": 7.99715682655637e-05, | |
| "loss": 0.6429, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.19084865076261243, | |
| "grad_norm": 1.057639718055725, | |
| "learning_rate": 7.967309592491052e-05, | |
| "loss": 0.7681, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.19137009516360318, | |
| "grad_norm": 1.2563397884368896, | |
| "learning_rate": 7.937082128855891e-05, | |
| "loss": 0.7278, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.19189153956459393, | |
| "grad_norm": 1.2409113645553589, | |
| "learning_rate": 7.906477750432904e-05, | |
| "loss": 0.6566, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.19241298396558468, | |
| "grad_norm": 1.2019901275634766, | |
| "learning_rate": 7.875499813337069e-05, | |
| "loss": 0.8036, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.1929344283665754, | |
| "grad_norm": 0.9332624077796936, | |
| "learning_rate": 7.844151714648274e-05, | |
| "loss": 0.5595, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.19345587276756615, | |
| "grad_norm": 1.1587673425674438, | |
| "learning_rate": 7.812436892038805e-05, | |
| "loss": 0.6916, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.1939773171685569, | |
| "grad_norm": 1.2590214014053345, | |
| "learning_rate": 7.780358823396352e-05, | |
| "loss": 0.7037, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.19449876156954765, | |
| "grad_norm": 1.5824557542800903, | |
| "learning_rate": 7.747921026442631e-05, | |
| "loss": 0.8385, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.1950202059705384, | |
| "grad_norm": 1.312893271446228, | |
| "learning_rate": 7.715127058347615e-05, | |
| "loss": 0.8146, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.19554165037152912, | |
| "grad_norm": 1.2608100175857544, | |
| "learning_rate": 7.681980515339464e-05, | |
| "loss": 0.7516, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.19606309477251987, | |
| "grad_norm": 1.1570591926574707, | |
| "learning_rate": 7.648485032310145e-05, | |
| "loss": 0.7294, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.19658453917351063, | |
| "grad_norm": 1.0426164865493774, | |
| "learning_rate": 7.614644282416831e-05, | |
| "loss": 0.6835, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.19710598357450138, | |
| "grad_norm": 1.0416784286499023, | |
| "learning_rate": 7.5804619766791e-05, | |
| "loss": 0.6097, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.19762742797549213, | |
| "grad_norm": 1.2640820741653442, | |
| "learning_rate": 7.545941863571974e-05, | |
| "loss": 0.8251, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.19814887237648285, | |
| "grad_norm": 1.1552680730819702, | |
| "learning_rate": 7.511087728614862e-05, | |
| "loss": 0.7257, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.1986703167774736, | |
| "grad_norm": 1.2071866989135742, | |
| "learning_rate": 7.475903393956434e-05, | |
| "loss": 0.7659, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.19919176117846435, | |
| "grad_norm": 1.2022343873977661, | |
| "learning_rate": 7.440392717955476e-05, | |
| "loss": 0.7332, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.1997132055794551, | |
| "grad_norm": 1.2143396139144897, | |
| "learning_rate": 7.404559594757779e-05, | |
| "loss": 0.7158, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.20023464998044582, | |
| "grad_norm": 1.2017430067062378, | |
| "learning_rate": 7.368407953869104e-05, | |
| "loss": 0.7293, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.20075609438143657, | |
| "grad_norm": 1.0218538045883179, | |
| "learning_rate": 7.33194175972427e-05, | |
| "loss": 0.6618, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.20127753878242732, | |
| "grad_norm": 1.1736619472503662, | |
| "learning_rate": 7.295165011252397e-05, | |
| "loss": 0.7843, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.20179898318341807, | |
| "grad_norm": 1.3254735469818115, | |
| "learning_rate": 7.258081741438396e-05, | |
| "loss": 0.7968, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.20232042758440882, | |
| "grad_norm": 1.1880550384521484, | |
| "learning_rate": 7.220696016880688e-05, | |
| "loss": 0.7715, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.20284187198539955, | |
| "grad_norm": 1.1296610832214355, | |
| "learning_rate": 7.183011937345271e-05, | |
| "loss": 0.7269, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.2033633163863903, | |
| "grad_norm": 1.3146663904190063, | |
| "learning_rate": 7.14503363531613e-05, | |
| "loss": 0.8085, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.20388476078738105, | |
| "grad_norm": 1.3790757656097412, | |
| "learning_rate": 7.106765275542055e-05, | |
| "loss": 0.7678, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.2044062051883718, | |
| "grad_norm": 1.2263914346694946, | |
| "learning_rate": 7.068211054579944e-05, | |
| "loss": 0.8242, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.20492764958936252, | |
| "grad_norm": 1.492631196975708, | |
| "learning_rate": 7.029375200334588e-05, | |
| "loss": 0.9098, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.20544909399035327, | |
| "grad_norm": 1.5877786874771118, | |
| "learning_rate": 6.99026197159505e-05, | |
| "loss": 0.9134, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.20597053839134402, | |
| "grad_norm": 1.2312871217727661, | |
| "learning_rate": 6.950875657567623e-05, | |
| "loss": 0.7654, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.20649198279233477, | |
| "grad_norm": 1.3052772283554077, | |
| "learning_rate": 6.911220577405484e-05, | |
| "loss": 0.7572, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.20701342719332552, | |
| "grad_norm": 1.5820766687393188, | |
| "learning_rate": 6.87130107973505e-05, | |
| "loss": 0.9036, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.20753487159431624, | |
| "grad_norm": 1.7080368995666504, | |
| "learning_rate": 6.831121542179087e-05, | |
| "loss": 0.8461, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.208056315995307, | |
| "grad_norm": 1.7430877685546875, | |
| "learning_rate": 6.790686370876671e-05, | |
| "loss": 0.8611, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.20857776039629775, | |
| "grad_norm": 1.6613725423812866, | |
| "learning_rate": 6.75e-05, | |
| "loss": 0.9955, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2090992047972885, | |
| "grad_norm": 1.153273105621338, | |
| "learning_rate": 6.709066891268135e-05, | |
| "loss": 0.3654, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.20962064919827925, | |
| "grad_norm": 2.4963393211364746, | |
| "learning_rate": 6.667891533457719e-05, | |
| "loss": 0.508, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.21014209359926997, | |
| "grad_norm": 0.5108705759048462, | |
| "learning_rate": 6.626478441910744e-05, | |
| "loss": 0.2177, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.21066353800026072, | |
| "grad_norm": 0.3899039328098297, | |
| "learning_rate": 6.584832158039378e-05, | |
| "loss": 0.1517, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.21118498240125147, | |
| "grad_norm": 0.6260213851928711, | |
| "learning_rate": 6.542957248827961e-05, | |
| "loss": 0.289, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.21170642680224222, | |
| "grad_norm": 0.6745234727859497, | |
| "learning_rate": 6.500858306332174e-05, | |
| "loss": 0.3188, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.21222787120323294, | |
| "grad_norm": 0.6891493797302246, | |
| "learning_rate": 6.458539947175475e-05, | |
| "loss": 0.3576, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.2127493156042237, | |
| "grad_norm": 0.7363607287406921, | |
| "learning_rate": 6.416006812042828e-05, | |
| "loss": 0.45, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.21327076000521444, | |
| "grad_norm": 0.8903110027313232, | |
| "learning_rate": 6.373263565171806e-05, | |
| "loss": 0.5986, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.2137922044062052, | |
| "grad_norm": 0.8679940700531006, | |
| "learning_rate": 6.330314893841102e-05, | |
| "loss": 0.5433, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.21431364880719594, | |
| "grad_norm": 0.959732711315155, | |
| "learning_rate": 6.287165507856512e-05, | |
| "loss": 0.5715, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.21483509320818667, | |
| "grad_norm": 1.0801646709442139, | |
| "learning_rate": 6.243820139034464e-05, | |
| "loss": 0.6556, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.21535653760917742, | |
| "grad_norm": 1.0459177494049072, | |
| "learning_rate": 6.200283540683103e-05, | |
| "loss": 0.6967, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.21587798201016817, | |
| "grad_norm": 1.250126600265503, | |
| "learning_rate": 6.156560487081051e-05, | |
| "loss": 0.6739, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.21639942641115892, | |
| "grad_norm": 1.2173274755477905, | |
| "learning_rate": 6.112655772953851e-05, | |
| "loss": 0.7337, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.21692087081214964, | |
| "grad_norm": 1.1681318283081055, | |
| "learning_rate": 6.068574212948169e-05, | |
| "loss": 0.7496, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.2174423152131404, | |
| "grad_norm": 1.147112488746643, | |
| "learning_rate": 6.024320641103812e-05, | |
| "loss": 0.7061, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.21796375961413114, | |
| "grad_norm": 0.9065099358558655, | |
| "learning_rate": 5.979899910323625e-05, | |
| "loss": 0.5293, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.2184852040151219, | |
| "grad_norm": 1.4341965913772583, | |
| "learning_rate": 5.935316891841316e-05, | |
| "loss": 1.0006, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.21900664841611264, | |
| "grad_norm": 1.0058789253234863, | |
| "learning_rate": 5.890576474687264e-05, | |
| "loss": 0.6819, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.21952809281710337, | |
| "grad_norm": 1.3122864961624146, | |
| "learning_rate": 5.845683565152391e-05, | |
| "loss": 0.8433, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.22004953721809412, | |
| "grad_norm": 1.0274057388305664, | |
| "learning_rate": 5.800643086250122e-05, | |
| "loss": 0.702, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.22057098161908487, | |
| "grad_norm": 1.1476062536239624, | |
| "learning_rate": 5.7554599771765325e-05, | |
| "loss": 0.8239, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.22109242602007562, | |
| "grad_norm": 1.1423624753952026, | |
| "learning_rate": 5.710139192768696e-05, | |
| "loss": 0.5831, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.22161387042106637, | |
| "grad_norm": 1.2858572006225586, | |
| "learning_rate": 5.6646857029613434e-05, | |
| "loss": 0.8175, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2221353148220571, | |
| "grad_norm": 1.2642686367034912, | |
| "learning_rate": 5.6191044922418485e-05, | |
| "loss": 0.6987, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.22265675922304784, | |
| "grad_norm": 1.0850262641906738, | |
| "learning_rate": 5.5734005591036144e-05, | |
| "loss": 0.7008, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.2231782036240386, | |
| "grad_norm": 1.059691309928894, | |
| "learning_rate": 5.527578915497952e-05, | |
| "loss": 0.6485, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.22369964802502934, | |
| "grad_norm": 1.1776199340820312, | |
| "learning_rate": 5.4816445862844426e-05, | |
| "loss": 0.7816, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.22422109242602006, | |
| "grad_norm": 1.065766453742981, | |
| "learning_rate": 5.435602608679916e-05, | |
| "loss": 0.6872, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2247425368270108, | |
| "grad_norm": 1.419598937034607, | |
| "learning_rate": 5.3894580317060684e-05, | |
| "loss": 0.7295, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.22526398122800156, | |
| "grad_norm": 1.0950857400894165, | |
| "learning_rate": 5.343215915635762e-05, | |
| "loss": 0.7141, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.22578542562899231, | |
| "grad_norm": 1.2764692306518555, | |
| "learning_rate": 5.2968813314381255e-05, | |
| "loss": 0.7478, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.22630687002998306, | |
| "grad_norm": 1.3446756601333618, | |
| "learning_rate": 5.250459360222461e-05, | |
| "loss": 0.7216, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.2268283144309738, | |
| "grad_norm": 1.3674976825714111, | |
| "learning_rate": 5.20395509268104e-05, | |
| "loss": 0.7942, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.22734975883196454, | |
| "grad_norm": 1.1008920669555664, | |
| "learning_rate": 5.157373628530853e-05, | |
| "loss": 0.6696, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.2278712032329553, | |
| "grad_norm": 1.119428277015686, | |
| "learning_rate": 5.1107200759543704e-05, | |
| "loss": 0.6747, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.22839264763394604, | |
| "grad_norm": 1.2905117273330688, | |
| "learning_rate": 5.06399955103937e-05, | |
| "loss": 0.7237, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.22891409203493676, | |
| "grad_norm": 1.1231979131698608, | |
| "learning_rate": 5.017217177217901e-05, | |
| "loss": 0.6448, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.2294355364359275, | |
| "grad_norm": 1.430540680885315, | |
| "learning_rate": 4.9703780847044415e-05, | |
| "loss": 0.9266, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.22995698083691826, | |
| "grad_norm": 1.6048601865768433, | |
| "learning_rate": 4.923487409933316e-05, | |
| "loss": 0.9436, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.230478425237909, | |
| "grad_norm": 1.4154115915298462, | |
| "learning_rate": 4.876550294995421e-05, | |
| "loss": 0.8578, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.23099986963889976, | |
| "grad_norm": 1.4124436378479004, | |
| "learning_rate": 4.829571887074343e-05, | |
| "loss": 0.811, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.23152131403989049, | |
| "grad_norm": 1.3669768571853638, | |
| "learning_rate": 4.782557337881911e-05, | |
| "loss": 0.8344, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.23204275844088124, | |
| "grad_norm": 1.5078638792037964, | |
| "learning_rate": 4.7355118030932484e-05, | |
| "loss": 0.7743, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.232564202841872, | |
| "grad_norm": 1.2406189441680908, | |
| "learning_rate": 4.688440441781398e-05, | |
| "loss": 0.7794, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.23308564724286274, | |
| "grad_norm": 1.4059579372406006, | |
| "learning_rate": 4.6413484158515774e-05, | |
| "loss": 0.9038, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.2336070916438535, | |
| "grad_norm": 1.6239300966262817, | |
| "learning_rate": 4.594240889475107e-05, | |
| "loss": 0.8264, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.2341285360448442, | |
| "grad_norm": 1.3349806070327759, | |
| "learning_rate": 4.547123028523106e-05, | |
| "loss": 0.784, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.23464998044583496, | |
| "grad_norm": 1.809417963027954, | |
| "learning_rate": 4.5e-05, | |
| "loss": 0.9593, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.23464998044583496, | |
| "eval_loss": 0.6834670901298523, | |
| "eval_runtime": 326.3076, | |
| "eval_samples_per_second": 19.797, | |
| "eval_steps_per_second": 4.949, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2351714248468257, | |
| "grad_norm": 0.8945098519325256, | |
| "learning_rate": 4.452876971476896e-05, | |
| "loss": 0.327, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.23569286924781646, | |
| "grad_norm": 0.9733836054801941, | |
| "learning_rate": 4.4057591105248945e-05, | |
| "loss": 0.3517, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.23621431364880718, | |
| "grad_norm": 0.44081079959869385, | |
| "learning_rate": 4.358651584148423e-05, | |
| "loss": 0.2293, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.23673575804979793, | |
| "grad_norm": 0.420837938785553, | |
| "learning_rate": 4.311559558218603e-05, | |
| "loss": 0.2186, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.23725720245078868, | |
| "grad_norm": 0.7009119391441345, | |
| "learning_rate": 4.264488196906753e-05, | |
| "loss": 0.3505, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.23777864685177943, | |
| "grad_norm": 0.7264213562011719, | |
| "learning_rate": 4.21744266211809e-05, | |
| "loss": 0.3351, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.23830009125277019, | |
| "grad_norm": 0.7642529606819153, | |
| "learning_rate": 4.1704281129256585e-05, | |
| "loss": 0.4276, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.2388215356537609, | |
| "grad_norm": 0.9208986759185791, | |
| "learning_rate": 4.1234497050045815e-05, | |
| "loss": 0.426, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.23934298005475166, | |
| "grad_norm": 1.1265970468521118, | |
| "learning_rate": 4.076512590066686e-05, | |
| "loss": 0.6691, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.2398644244557424, | |
| "grad_norm": 0.976740300655365, | |
| "learning_rate": 4.0296219152955604e-05, | |
| "loss": 0.6463, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.24038586885673316, | |
| "grad_norm": 0.9354336261749268, | |
| "learning_rate": 3.982782822782101e-05, | |
| "loss": 0.5267, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.24090731325772388, | |
| "grad_norm": 0.9918802380561829, | |
| "learning_rate": 3.936000448960631e-05, | |
| "loss": 0.5501, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.24142875765871463, | |
| "grad_norm": 1.246860384941101, | |
| "learning_rate": 3.889279924045631e-05, | |
| "loss": 0.6748, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.24195020205970538, | |
| "grad_norm": 0.8496841788291931, | |
| "learning_rate": 3.842626371469148e-05, | |
| "loss": 0.538, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.24247164646069613, | |
| "grad_norm": 1.2216079235076904, | |
| "learning_rate": 3.796044907318961e-05, | |
| "loss": 0.7916, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.24299309086168688, | |
| "grad_norm": 1.1502059698104858, | |
| "learning_rate": 3.74954063977754e-05, | |
| "loss": 0.5625, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.2435145352626776, | |
| "grad_norm": 1.0070570707321167, | |
| "learning_rate": 3.703118668561876e-05, | |
| "loss": 0.623, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.24403597966366836, | |
| "grad_norm": 1.0661598443984985, | |
| "learning_rate": 3.6567840843642385e-05, | |
| "loss": 0.7055, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.2445574240646591, | |
| "grad_norm": 1.037097692489624, | |
| "learning_rate": 3.610541968293932e-05, | |
| "loss": 0.6177, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.24507886846564986, | |
| "grad_norm": 1.8507579565048218, | |
| "learning_rate": 3.564397391320084e-05, | |
| "loss": 0.7, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.2456003128666406, | |
| "grad_norm": 1.1853182315826416, | |
| "learning_rate": 3.51835541371556e-05, | |
| "loss": 0.6217, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.24612175726763133, | |
| "grad_norm": 0.8545302152633667, | |
| "learning_rate": 3.472421084502049e-05, | |
| "loss": 0.5726, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.24664320166862208, | |
| "grad_norm": 1.1831412315368652, | |
| "learning_rate": 3.426599440896387e-05, | |
| "loss": 0.6007, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.24716464606961283, | |
| "grad_norm": 1.1138157844543457, | |
| "learning_rate": 3.380895507758154e-05, | |
| "loss": 0.6453, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.24768609047060358, | |
| "grad_norm": 1.097508192062378, | |
| "learning_rate": 3.3353142970386565e-05, | |
| "loss": 0.6088, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.2482075348715943, | |
| "grad_norm": 1.123647928237915, | |
| "learning_rate": 3.2898608072313045e-05, | |
| "loss": 0.7489, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.24872897927258505, | |
| "grad_norm": 1.0547268390655518, | |
| "learning_rate": 3.244540022823469e-05, | |
| "loss": 0.6683, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.2492504236735758, | |
| "grad_norm": 0.9712570905685425, | |
| "learning_rate": 3.199356913749877e-05, | |
| "loss": 0.5591, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.24977186807456656, | |
| "grad_norm": 1.165372371673584, | |
| "learning_rate": 3.1543164348476105e-05, | |
| "loss": 0.6808, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.2502933124755573, | |
| "grad_norm": 1.4066596031188965, | |
| "learning_rate": 3.1094235253127374e-05, | |
| "loss": 0.7196, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.25081475687654803, | |
| "grad_norm": 1.2690683603286743, | |
| "learning_rate": 3.064683108158685e-05, | |
| "loss": 0.7631, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.2513362012775388, | |
| "grad_norm": 1.2126400470733643, | |
| "learning_rate": 3.0201000896763757e-05, | |
| "loss": 0.6118, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.25185764567852953, | |
| "grad_norm": 1.2217490673065186, | |
| "learning_rate": 2.975679358896189e-05, | |
| "loss": 0.8022, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.25237909007952025, | |
| "grad_norm": 1.4065697193145752, | |
| "learning_rate": 2.9314257870518325e-05, | |
| "loss": 0.8177, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.25290053448051103, | |
| "grad_norm": 1.394194483757019, | |
| "learning_rate": 2.887344227046149e-05, | |
| "loss": 0.6846, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.25342197888150175, | |
| "grad_norm": 1.2853827476501465, | |
| "learning_rate": 2.8434395129189495e-05, | |
| "loss": 0.8623, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.25394342328249253, | |
| "grad_norm": 1.3407214879989624, | |
| "learning_rate": 2.7997164593168986e-05, | |
| "loss": 0.8026, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.25446486768348325, | |
| "grad_norm": 0.9608036875724792, | |
| "learning_rate": 2.756179860965537e-05, | |
| "loss": 0.5896, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.254986312084474, | |
| "grad_norm": 1.2732912302017212, | |
| "learning_rate": 2.7128344921434877e-05, | |
| "loss": 0.882, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.25550775648546475, | |
| "grad_norm": 1.3587908744812012, | |
| "learning_rate": 2.6696851061589e-05, | |
| "loss": 0.7432, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2560292008864555, | |
| "grad_norm": 1.1746113300323486, | |
| "learning_rate": 2.6267364348281954e-05, | |
| "loss": 0.7805, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.2565506452874462, | |
| "grad_norm": 1.1895116567611694, | |
| "learning_rate": 2.5839931879571733e-05, | |
| "loss": 0.8167, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.257072089688437, | |
| "grad_norm": 1.246069312095642, | |
| "learning_rate": 2.541460052824527e-05, | |
| "loss": 0.7614, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.2575935340894277, | |
| "grad_norm": 1.507230281829834, | |
| "learning_rate": 2.4991416936678276e-05, | |
| "loss": 0.7661, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.2581149784904185, | |
| "grad_norm": 1.2582144737243652, | |
| "learning_rate": 2.4570427511720398e-05, | |
| "loss": 0.7222, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.2586364228914092, | |
| "grad_norm": 1.2553263902664185, | |
| "learning_rate": 2.4151678419606235e-05, | |
| "loss": 0.8181, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.2591578672923999, | |
| "grad_norm": 1.2473095655441284, | |
| "learning_rate": 2.3735215580892577e-05, | |
| "loss": 0.7124, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.2596793116933907, | |
| "grad_norm": 1.3642276525497437, | |
| "learning_rate": 2.3321084665422807e-05, | |
| "loss": 0.8353, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.2602007560943814, | |
| "grad_norm": 1.269373893737793, | |
| "learning_rate": 2.2909331087318664e-05, | |
| "loss": 0.73, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.2607222004953722, | |
| "grad_norm": 1.5897523164749146, | |
| "learning_rate": 2.250000000000001e-05, | |
| "loss": 0.8818, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2612436448963629, | |
| "grad_norm": 0.7522194981575012, | |
| "learning_rate": 2.209313629123329e-05, | |
| "loss": 0.3334, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.26176508929735365, | |
| "grad_norm": 1.1364073753356934, | |
| "learning_rate": 2.168878457820915e-05, | |
| "loss": 0.4722, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.2622865336983444, | |
| "grad_norm": 0.45651566982269287, | |
| "learning_rate": 2.128698920264951e-05, | |
| "loss": 0.2023, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.26280797809933515, | |
| "grad_norm": 0.407569944858551, | |
| "learning_rate": 2.088779422594514e-05, | |
| "loss": 0.2021, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.2633294225003259, | |
| "grad_norm": 0.6987316608428955, | |
| "learning_rate": 2.0491243424323783e-05, | |
| "loss": 0.2595, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.26385086690131665, | |
| "grad_norm": 0.741063117980957, | |
| "learning_rate": 2.009738028404952e-05, | |
| "loss": 0.3919, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.2643723113023074, | |
| "grad_norm": 0.7104949951171875, | |
| "learning_rate": 1.9706247996654134e-05, | |
| "loss": 0.3903, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.26489375570329815, | |
| "grad_norm": 1.1556988954544067, | |
| "learning_rate": 1.9317889454200578e-05, | |
| "loss": 0.538, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.2654152001042889, | |
| "grad_norm": 0.873782753944397, | |
| "learning_rate": 1.8932347244579463e-05, | |
| "loss": 0.5209, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.26593664450527965, | |
| "grad_norm": 0.6760383248329163, | |
| "learning_rate": 1.8549663646838714e-05, | |
| "loss": 0.4616, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.2664580889062704, | |
| "grad_norm": 0.7786940336227417, | |
| "learning_rate": 1.8169880626547285e-05, | |
| "loss": 0.4068, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.2669795333072611, | |
| "grad_norm": 0.9264464378356934, | |
| "learning_rate": 1.7793039831193134e-05, | |
| "loss": 0.599, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.2675009777082519, | |
| "grad_norm": 0.9444701671600342, | |
| "learning_rate": 1.741918258561607e-05, | |
| "loss": 0.6268, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.2680224221092426, | |
| "grad_norm": 1.0351696014404297, | |
| "learning_rate": 1.7048349887476038e-05, | |
| "loss": 0.7387, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.2685438665102333, | |
| "grad_norm": 1.108831524848938, | |
| "learning_rate": 1.6680582402757324e-05, | |
| "loss": 0.597, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.2690653109112241, | |
| "grad_norm": 0.933988094329834, | |
| "learning_rate": 1.631592046130896e-05, | |
| "loss": 0.6301, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.2695867553122148, | |
| "grad_norm": 0.9654362797737122, | |
| "learning_rate": 1.5954404052422217e-05, | |
| "loss": 0.52, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.2701081997132056, | |
| "grad_norm": 1.1467019319534302, | |
| "learning_rate": 1.5596072820445255e-05, | |
| "loss": 0.7317, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.2706296441141963, | |
| "grad_norm": 1.1809656620025635, | |
| "learning_rate": 1.5240966060435674e-05, | |
| "loss": 0.6836, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.27115108851518704, | |
| "grad_norm": 0.8975329995155334, | |
| "learning_rate": 1.4889122713851395e-05, | |
| "loss": 0.5057, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2716725329161778, | |
| "grad_norm": 0.9241394996643066, | |
| "learning_rate": 1.4540581364280274e-05, | |
| "loss": 0.6661, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.27219397731716855, | |
| "grad_norm": 0.7762001156806946, | |
| "learning_rate": 1.4195380233209009e-05, | |
| "loss": 0.4453, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.2727154217181593, | |
| "grad_norm": 1.1368845701217651, | |
| "learning_rate": 1.38535571758317e-05, | |
| "loss": 0.8007, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.27323686611915005, | |
| "grad_norm": 1.1444828510284424, | |
| "learning_rate": 1.3515149676898551e-05, | |
| "loss": 0.6431, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.27375831052014077, | |
| "grad_norm": 1.3276859521865845, | |
| "learning_rate": 1.3180194846605365e-05, | |
| "loss": 0.7336, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.27427975492113155, | |
| "grad_norm": 0.9438497424125671, | |
| "learning_rate": 1.284872941652386e-05, | |
| "loss": 0.6664, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.27480119932212227, | |
| "grad_norm": 1.2121400833129883, | |
| "learning_rate": 1.2520789735573703e-05, | |
| "loss": 0.7121, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.27532264372311305, | |
| "grad_norm": 1.2498886585235596, | |
| "learning_rate": 1.2196411766036491e-05, | |
| "loss": 0.7712, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.27584408812410377, | |
| "grad_norm": 1.1315795183181763, | |
| "learning_rate": 1.1875631079611956e-05, | |
| "loss": 0.7224, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.2763655325250945, | |
| "grad_norm": 1.4224852323532104, | |
| "learning_rate": 1.1558482853517254e-05, | |
| "loss": 0.6649, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.27688697692608527, | |
| "grad_norm": 1.2838762998580933, | |
| "learning_rate": 1.124500186662932e-05, | |
| "loss": 0.7847, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.277408421327076, | |
| "grad_norm": 1.1303495168685913, | |
| "learning_rate": 1.0935222495670969e-05, | |
| "loss": 0.7652, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.27792986572806677, | |
| "grad_norm": 1.0751543045043945, | |
| "learning_rate": 1.0629178711441115e-05, | |
| "loss": 0.6492, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.2784513101290575, | |
| "grad_norm": 1.2806495428085327, | |
| "learning_rate": 1.032690407508949e-05, | |
| "loss": 0.6801, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.2789727545300482, | |
| "grad_norm": 1.032645583152771, | |
| "learning_rate": 1.002843173443631e-05, | |
| "loss": 0.6324, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.279494198931039, | |
| "grad_norm": 1.1595271825790405, | |
| "learning_rate": 9.733794420337214e-06, | |
| "loss": 0.7248, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.2800156433320297, | |
| "grad_norm": 1.2283949851989746, | |
| "learning_rate": 9.443024443093932e-06, | |
| "loss": 0.6415, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.28053708773302044, | |
| "grad_norm": 1.0101823806762695, | |
| "learning_rate": 9.15615368891117e-06, | |
| "loss": 0.7115, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.2810585321340112, | |
| "grad_norm": 1.1437267065048218, | |
| "learning_rate": 8.873213616399854e-06, | |
| "loss": 0.8146, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.28157997653500194, | |
| "grad_norm": 1.395314335823059, | |
| "learning_rate": 8.59423525312737e-06, | |
| "loss": 0.8216, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.2821014209359927, | |
| "grad_norm": 1.1216462850570679, | |
| "learning_rate": 8.319249192215056e-06, | |
| "loss": 0.7296, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.28262286533698344, | |
| "grad_norm": 1.1411585807800293, | |
| "learning_rate": 8.04828558898332e-06, | |
| "loss": 0.7305, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.28314430973797416, | |
| "grad_norm": 1.0510220527648926, | |
| "learning_rate": 7.781374157644714e-06, | |
| "loss": 0.7393, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.28366575413896494, | |
| "grad_norm": 1.3976047039031982, | |
| "learning_rate": 7.518544168045526e-06, | |
| "loss": 0.8331, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.28418719853995567, | |
| "grad_norm": 1.1391674280166626, | |
| "learning_rate": 7.259824442455923e-06, | |
| "loss": 0.7816, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.28470864294094644, | |
| "grad_norm": 1.2341560125350952, | |
| "learning_rate": 7.005243352409332e-06, | |
| "loss": 0.7965, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.28523008734193717, | |
| "grad_norm": 1.2359131574630737, | |
| "learning_rate": 6.754828815591131e-06, | |
| "loss": 0.8758, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.2857515317429279, | |
| "grad_norm": 1.5217698812484741, | |
| "learning_rate": 6.508608292777203e-06, | |
| "loss": 0.9667, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.28627297614391867, | |
| "grad_norm": 1.633954405784607, | |
| "learning_rate": 6.266608784822542e-06, | |
| "loss": 0.7868, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.2867944205449094, | |
| "grad_norm": 1.9118640422821045, | |
| "learning_rate": 6.028856829700258e-06, | |
| "loss": 0.8767, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.28731586494590017, | |
| "grad_norm": 0.5335854291915894, | |
| "learning_rate": 5.795378499591479e-06, | |
| "loss": 0.2677, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.2878373093468909, | |
| "grad_norm": 0.8315818309783936, | |
| "learning_rate": 5.566199398026149e-06, | |
| "loss": 0.3736, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.2883587537478816, | |
| "grad_norm": 0.4964602589607239, | |
| "learning_rate": 5.341344657075353e-06, | |
| "loss": 0.1941, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.2888801981488724, | |
| "grad_norm": 0.4502871632575989, | |
| "learning_rate": 5.120838934595337e-06, | |
| "loss": 0.2058, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.2894016425498631, | |
| "grad_norm": 0.6107041239738464, | |
| "learning_rate": 4.90470641152345e-06, | |
| "loss": 0.2863, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.2899230869508539, | |
| "grad_norm": 0.5411296486854553, | |
| "learning_rate": 4.69297078922642e-06, | |
| "loss": 0.3224, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.2904445313518446, | |
| "grad_norm": 0.5916683673858643, | |
| "learning_rate": 4.485655286901292e-06, | |
| "loss": 0.3624, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.29096597575283534, | |
| "grad_norm": 0.9298704266548157, | |
| "learning_rate": 4.28278263902913e-06, | |
| "loss": 0.5355, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.2914874201538261, | |
| "grad_norm": 1.085946798324585, | |
| "learning_rate": 4.084375092881916e-06, | |
| "loss": 0.5446, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.29200886455481684, | |
| "grad_norm": 0.9874389171600342, | |
| "learning_rate": 3.890454406082956e-06, | |
| "loss": 0.6942, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.29253030895580756, | |
| "grad_norm": 0.7588855028152466, | |
| "learning_rate": 3.701041844220849e-06, | |
| "loss": 0.5185, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.29305175335679834, | |
| "grad_norm": 0.7749528884887695, | |
| "learning_rate": 3.516158178517482e-06, | |
| "loss": 0.4994, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.29357319775778906, | |
| "grad_norm": 0.9377657175064087, | |
| "learning_rate": 3.335823683550237e-06, | |
| "loss": 0.5773, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.29409464215877984, | |
| "grad_norm": 0.9080403447151184, | |
| "learning_rate": 3.1600581350286897e-06, | |
| "loss": 0.5582, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.29461608655977056, | |
| "grad_norm": 0.9245966672897339, | |
| "learning_rate": 2.9888808076259267e-06, | |
| "loss": 0.6085, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.2951375309607613, | |
| "grad_norm": 1.0248568058013916, | |
| "learning_rate": 2.822310472864885e-06, | |
| "loss": 0.6125, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.29565897536175206, | |
| "grad_norm": 1.0454648733139038, | |
| "learning_rate": 2.660365397059855e-06, | |
| "loss": 0.6444, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.2961804197627428, | |
| "grad_norm": 0.9811504483222961, | |
| "learning_rate": 2.503063339313355e-06, | |
| "loss": 0.647, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.29670186416373356, | |
| "grad_norm": 1.1872072219848633, | |
| "learning_rate": 2.3504215495686498e-06, | |
| "loss": 0.7537, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.2972233085647243, | |
| "grad_norm": 0.9496892094612122, | |
| "learning_rate": 2.2024567667180914e-06, | |
| "loss": 0.6789, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.297744752965715, | |
| "grad_norm": 1.110507607460022, | |
| "learning_rate": 2.059185216767543e-06, | |
| "loss": 0.64, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.2982661973667058, | |
| "grad_norm": 0.9525758028030396, | |
| "learning_rate": 1.9206226110569742e-06, | |
| "loss": 0.5955, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.2987876417676965, | |
| "grad_norm": 1.0465561151504517, | |
| "learning_rate": 1.7867841445375621e-06, | |
| "loss": 0.6887, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.2993090861686873, | |
| "grad_norm": 1.0657908916473389, | |
| "learning_rate": 1.6576844941053854e-06, | |
| "loss": 0.7477, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.299830530569678, | |
| "grad_norm": 1.0674349069595337, | |
| "learning_rate": 1.533337816991931e-06, | |
| "loss": 0.7795, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.30035197497066873, | |
| "grad_norm": 1.0765479803085327, | |
| "learning_rate": 1.4137577492116016e-06, | |
| "loss": 0.7111, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.3008734193716595, | |
| "grad_norm": 1.0293859243392944, | |
| "learning_rate": 1.2989574040663816e-06, | |
| "loss": 0.6233, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.30139486377265023, | |
| "grad_norm": 1.1408146619796753, | |
| "learning_rate": 1.188949370707787e-06, | |
| "loss": 0.6778, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.301916308173641, | |
| "grad_norm": 1.2201671600341797, | |
| "learning_rate": 1.0837457127563656e-06, | |
| "loss": 0.7382, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.30243775257463174, | |
| "grad_norm": 1.1659302711486816, | |
| "learning_rate": 9.83357966978744e-07, | |
| "loss": 0.7453, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.30295919697562246, | |
| "grad_norm": 1.2378828525543213, | |
| "learning_rate": 8.877971420225212e-07, | |
| "loss": 0.8273, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.30348064137661324, | |
| "grad_norm": 1.0689337253570557, | |
| "learning_rate": 7.970737172090126e-07, | |
| "loss": 0.6794, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.30400208577760396, | |
| "grad_norm": 1.2321866750717163, | |
| "learning_rate": 7.111976413841153e-07, | |
| "loss": 0.7465, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.3045235301785947, | |
| "grad_norm": 1.0835295915603638, | |
| "learning_rate": 6.301783318272809e-07, | |
| "loss": 0.6639, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.30504497457958546, | |
| "grad_norm": 1.1415941715240479, | |
| "learning_rate": 5.540246732188054e-07, | |
| "loss": 0.626, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.3055664189805762, | |
| "grad_norm": 1.1108524799346924, | |
| "learning_rate": 4.827450166655251e-07, | |
| "loss": 0.7758, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.30608786338156696, | |
| "grad_norm": 1.0960720777511597, | |
| "learning_rate": 4.1634717878503816e-07, | |
| "loss": 0.7627, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.3066093077825577, | |
| "grad_norm": 1.2436336278915405, | |
| "learning_rate": 3.548384408485006e-07, | |
| "loss": 0.7669, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.3071307521835484, | |
| "grad_norm": 1.194196343421936, | |
| "learning_rate": 2.9822554798215994e-07, | |
| "loss": 0.7492, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.3076521965845392, | |
| "grad_norm": 1.3664387464523315, | |
| "learning_rate": 2.4651470842770196e-07, | |
| "loss": 0.7941, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3081736409855299, | |
| "grad_norm": 1.2188045978546143, | |
| "learning_rate": 1.9971159286140017e-07, | |
| "loss": 0.7608, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.3086950853865207, | |
| "grad_norm": 1.096019983291626, | |
| "learning_rate": 1.5782133377230334e-07, | |
| "loss": 0.655, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.3092165297875114, | |
| "grad_norm": 1.2465800046920776, | |
| "learning_rate": 1.208485248993857e-07, | |
| "loss": 0.7535, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.30973797418850213, | |
| "grad_norm": 1.3181999921798706, | |
| "learning_rate": 8.879722072777986e-08, | |
| "loss": 0.8335, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.3102594185894929, | |
| "grad_norm": 1.1617658138275146, | |
| "learning_rate": 6.167093604417751e-08, | |
| "loss": 0.6741, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.31078086299048363, | |
| "grad_norm": 1.2442768812179565, | |
| "learning_rate": 3.9472645551372757e-08, | |
| "loss": 0.777, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.3113023073914744, | |
| "grad_norm": 2.248586416244507, | |
| "learning_rate": 2.2204783542078e-08, | |
| "loss": 0.6729, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.31182375179246513, | |
| "grad_norm": 1.264545202255249, | |
| "learning_rate": 9.869243631952518e-09, | |
| "loss": 0.7514, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.31234519619345585, | |
| "grad_norm": 1.1710125207901, | |
| "learning_rate": 2.467378551953559e-09, | |
| "loss": 0.7467, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.31286664059444663, | |
| "grad_norm": 1.261979103088379, | |
| "learning_rate": 0.0, | |
| "loss": 0.7243, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.31286664059444663, | |
| "eval_loss": 0.6509745717048645, | |
| "eval_runtime": 326.4655, | |
| "eval_samples_per_second": 19.788, | |
| "eval_steps_per_second": 4.947, | |
| "step": 600 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 4, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.8826249462748283e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |