cilorku's picture
Training in progress, step 600, checkpoint
42071db verified
{
"best_metric": 0.6509745717048645,
"best_model_checkpoint": "miner_id_24/checkpoint-600",
"epoch": 0.31286664059444663,
"eval_steps": 150,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005214444009907443,
"grad_norm": 2.7253143787384033,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.1883,
"step": 1
},
{
"epoch": 0.0005214444009907443,
"eval_loss": 3.5160233974456787,
"eval_runtime": 325.6176,
"eval_samples_per_second": 19.839,
"eval_steps_per_second": 4.96,
"step": 1
},
{
"epoch": 0.0010428888019814887,
"grad_norm": 3.386953115463257,
"learning_rate": 6.000000000000001e-07,
"loss": 1.6895,
"step": 2
},
{
"epoch": 0.001564333202972233,
"grad_norm": 3.0215859413146973,
"learning_rate": 9.000000000000001e-07,
"loss": 1.221,
"step": 3
},
{
"epoch": 0.0020857776039629773,
"grad_norm": 3.2928130626678467,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.1251,
"step": 4
},
{
"epoch": 0.0026072220049537216,
"grad_norm": 3.2181572914123535,
"learning_rate": 1.5e-06,
"loss": 1.3883,
"step": 5
},
{
"epoch": 0.003128666405944466,
"grad_norm": 3.965681552886963,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.9077,
"step": 6
},
{
"epoch": 0.0036501108069352107,
"grad_norm": 3.617943048477173,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.883,
"step": 7
},
{
"epoch": 0.004171555207925955,
"grad_norm": 3.563110113143921,
"learning_rate": 2.4000000000000003e-06,
"loss": 2.3753,
"step": 8
},
{
"epoch": 0.004692999608916699,
"grad_norm": 4.999038219451904,
"learning_rate": 2.7e-06,
"loss": 2.9592,
"step": 9
},
{
"epoch": 0.005214444009907443,
"grad_norm": 4.850180625915527,
"learning_rate": 3e-06,
"loss": 3.0989,
"step": 10
},
{
"epoch": 0.005735888410898188,
"grad_norm": 4.822148323059082,
"learning_rate": 3.3e-06,
"loss": 2.5808,
"step": 11
},
{
"epoch": 0.006257332811888932,
"grad_norm": 4.977088451385498,
"learning_rate": 3.6000000000000003e-06,
"loss": 3.3742,
"step": 12
},
{
"epoch": 0.006778777212879676,
"grad_norm": 4.140382766723633,
"learning_rate": 3.900000000000001e-06,
"loss": 2.5006,
"step": 13
},
{
"epoch": 0.0073002216138704215,
"grad_norm": 5.024703502655029,
"learning_rate": 4.2000000000000004e-06,
"loss": 2.7223,
"step": 14
},
{
"epoch": 0.007821666014861166,
"grad_norm": 4.6218976974487305,
"learning_rate": 4.5e-06,
"loss": 2.9669,
"step": 15
},
{
"epoch": 0.00834311041585191,
"grad_norm": 4.481057643890381,
"learning_rate": 4.800000000000001e-06,
"loss": 2.8808,
"step": 16
},
{
"epoch": 0.008864554816842654,
"grad_norm": 5.046372413635254,
"learning_rate": 5.1e-06,
"loss": 3.3685,
"step": 17
},
{
"epoch": 0.009385999217833398,
"grad_norm": 5.197815895080566,
"learning_rate": 5.4e-06,
"loss": 3.3641,
"step": 18
},
{
"epoch": 0.009907443618824143,
"grad_norm": 5.589998245239258,
"learning_rate": 5.7000000000000005e-06,
"loss": 3.2387,
"step": 19
},
{
"epoch": 0.010428888019814887,
"grad_norm": 4.643375873565674,
"learning_rate": 6e-06,
"loss": 2.847,
"step": 20
},
{
"epoch": 0.010950332420805632,
"grad_norm": 5.727187156677246,
"learning_rate": 6.300000000000001e-06,
"loss": 3.5249,
"step": 21
},
{
"epoch": 0.011471776821796375,
"grad_norm": 5.25192928314209,
"learning_rate": 6.6e-06,
"loss": 3.4112,
"step": 22
},
{
"epoch": 0.01199322122278712,
"grad_norm": 5.9937238693237305,
"learning_rate": 6.9e-06,
"loss": 3.3331,
"step": 23
},
{
"epoch": 0.012514665623777864,
"grad_norm": 5.3217244148254395,
"learning_rate": 7.2000000000000005e-06,
"loss": 3.3351,
"step": 24
},
{
"epoch": 0.013036110024768609,
"grad_norm": 5.732251167297363,
"learning_rate": 7.5e-06,
"loss": 3.4039,
"step": 25
},
{
"epoch": 0.013557554425759353,
"grad_norm": 5.008046627044678,
"learning_rate": 7.800000000000002e-06,
"loss": 2.9577,
"step": 26
},
{
"epoch": 0.014078998826750098,
"grad_norm": 5.3130106925964355,
"learning_rate": 8.1e-06,
"loss": 3.1343,
"step": 27
},
{
"epoch": 0.014600443227740843,
"grad_norm": 4.684622764587402,
"learning_rate": 8.400000000000001e-06,
"loss": 3.0004,
"step": 28
},
{
"epoch": 0.015121887628731586,
"grad_norm": 4.972151279449463,
"learning_rate": 8.7e-06,
"loss": 2.9132,
"step": 29
},
{
"epoch": 0.01564333202972233,
"grad_norm": 5.852766036987305,
"learning_rate": 9e-06,
"loss": 3.3844,
"step": 30
},
{
"epoch": 0.016164776430713075,
"grad_norm": 5.366592884063721,
"learning_rate": 9.3e-06,
"loss": 2.9811,
"step": 31
},
{
"epoch": 0.01668622083170382,
"grad_norm": 4.914961338043213,
"learning_rate": 9.600000000000001e-06,
"loss": 2.819,
"step": 32
},
{
"epoch": 0.017207665232694565,
"grad_norm": 4.8333210945129395,
"learning_rate": 9.9e-06,
"loss": 2.8641,
"step": 33
},
{
"epoch": 0.01772910963368531,
"grad_norm": 5.034109115600586,
"learning_rate": 1.02e-05,
"loss": 2.8783,
"step": 34
},
{
"epoch": 0.018250554034676052,
"grad_norm": 4.528682231903076,
"learning_rate": 1.0500000000000001e-05,
"loss": 2.6515,
"step": 35
},
{
"epoch": 0.018771998435666796,
"grad_norm": 4.572699069976807,
"learning_rate": 1.08e-05,
"loss": 2.6785,
"step": 36
},
{
"epoch": 0.019293442836657543,
"grad_norm": 6.095496654510498,
"learning_rate": 1.11e-05,
"loss": 2.5944,
"step": 37
},
{
"epoch": 0.019814887237648286,
"grad_norm": 4.699917316436768,
"learning_rate": 1.1400000000000001e-05,
"loss": 2.4965,
"step": 38
},
{
"epoch": 0.02033633163863903,
"grad_norm": 4.598426818847656,
"learning_rate": 1.1700000000000001e-05,
"loss": 2.1656,
"step": 39
},
{
"epoch": 0.020857776039629773,
"grad_norm": 4.476649284362793,
"learning_rate": 1.2e-05,
"loss": 2.0975,
"step": 40
},
{
"epoch": 0.02137922044062052,
"grad_norm": 5.0952653884887695,
"learning_rate": 1.23e-05,
"loss": 2.349,
"step": 41
},
{
"epoch": 0.021900664841611264,
"grad_norm": 4.58736515045166,
"learning_rate": 1.2600000000000001e-05,
"loss": 2.0085,
"step": 42
},
{
"epoch": 0.022422109242602007,
"grad_norm": 5.128681659698486,
"learning_rate": 1.2900000000000002e-05,
"loss": 2.0787,
"step": 43
},
{
"epoch": 0.02294355364359275,
"grad_norm": 4.277550220489502,
"learning_rate": 1.32e-05,
"loss": 1.9801,
"step": 44
},
{
"epoch": 0.023464998044583497,
"grad_norm": 4.380857944488525,
"learning_rate": 1.3500000000000001e-05,
"loss": 1.8278,
"step": 45
},
{
"epoch": 0.02398644244557424,
"grad_norm": 4.337310314178467,
"learning_rate": 1.38e-05,
"loss": 1.8447,
"step": 46
},
{
"epoch": 0.024507886846564984,
"grad_norm": 4.444604396820068,
"learning_rate": 1.4100000000000002e-05,
"loss": 1.7982,
"step": 47
},
{
"epoch": 0.025029331247555728,
"grad_norm": 3.821458339691162,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.5049,
"step": 48
},
{
"epoch": 0.025550775648546475,
"grad_norm": 4.789224624633789,
"learning_rate": 1.47e-05,
"loss": 1.7945,
"step": 49
},
{
"epoch": 0.026072220049537218,
"grad_norm": 4.952455520629883,
"learning_rate": 1.5e-05,
"loss": 1.9581,
"step": 50
},
{
"epoch": 0.02659366445052796,
"grad_norm": 1.6376649141311646,
"learning_rate": 1.5300000000000003e-05,
"loss": 0.6677,
"step": 51
},
{
"epoch": 0.027115108851518705,
"grad_norm": 1.824238657951355,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.7885,
"step": 52
},
{
"epoch": 0.027636553252509452,
"grad_norm": 1.5552136898040771,
"learning_rate": 1.59e-05,
"loss": 0.4647,
"step": 53
},
{
"epoch": 0.028157997653500196,
"grad_norm": 1.6044234037399292,
"learning_rate": 1.62e-05,
"loss": 0.4944,
"step": 54
},
{
"epoch": 0.02867944205449094,
"grad_norm": 1.4595534801483154,
"learning_rate": 1.65e-05,
"loss": 0.5307,
"step": 55
},
{
"epoch": 0.029200886455481686,
"grad_norm": 1.7148545980453491,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.7493,
"step": 56
},
{
"epoch": 0.02972233085647243,
"grad_norm": 1.7278859615325928,
"learning_rate": 1.7100000000000002e-05,
"loss": 0.7113,
"step": 57
},
{
"epoch": 0.030243775257463173,
"grad_norm": 2.3812315464019775,
"learning_rate": 1.74e-05,
"loss": 0.9793,
"step": 58
},
{
"epoch": 0.030765219658453916,
"grad_norm": 2.3127753734588623,
"learning_rate": 1.77e-05,
"loss": 1.1053,
"step": 59
},
{
"epoch": 0.03128666405944466,
"grad_norm": 1.7662042379379272,
"learning_rate": 1.8e-05,
"loss": 0.7896,
"step": 60
},
{
"epoch": 0.03180810846043541,
"grad_norm": 2.224780797958374,
"learning_rate": 1.83e-05,
"loss": 0.9967,
"step": 61
},
{
"epoch": 0.03232955286142615,
"grad_norm": 2.4859564304351807,
"learning_rate": 1.86e-05,
"loss": 1.0838,
"step": 62
},
{
"epoch": 0.032850997262416894,
"grad_norm": 1.620069146156311,
"learning_rate": 1.8900000000000002e-05,
"loss": 0.7138,
"step": 63
},
{
"epoch": 0.03337244166340764,
"grad_norm": 2.2818684577941895,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.9581,
"step": 64
},
{
"epoch": 0.03389388606439838,
"grad_norm": 1.9163058996200562,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.916,
"step": 65
},
{
"epoch": 0.03441533046538913,
"grad_norm": 1.9157963991165161,
"learning_rate": 1.98e-05,
"loss": 0.8967,
"step": 66
},
{
"epoch": 0.034936774866379874,
"grad_norm": 2.1430623531341553,
"learning_rate": 2.01e-05,
"loss": 1.025,
"step": 67
},
{
"epoch": 0.03545821926737062,
"grad_norm": 2.3148794174194336,
"learning_rate": 2.04e-05,
"loss": 1.0084,
"step": 68
},
{
"epoch": 0.03597966366836136,
"grad_norm": 1.9496533870697021,
"learning_rate": 2.0700000000000002e-05,
"loss": 0.9664,
"step": 69
},
{
"epoch": 0.036501108069352105,
"grad_norm": 2.164203405380249,
"learning_rate": 2.1000000000000002e-05,
"loss": 1.0243,
"step": 70
},
{
"epoch": 0.03702255247034285,
"grad_norm": 2.3370673656463623,
"learning_rate": 2.1300000000000003e-05,
"loss": 0.9201,
"step": 71
},
{
"epoch": 0.03754399687133359,
"grad_norm": 2.2651474475860596,
"learning_rate": 2.16e-05,
"loss": 1.0019,
"step": 72
},
{
"epoch": 0.038065441272324335,
"grad_norm": 2.0069406032562256,
"learning_rate": 2.1900000000000004e-05,
"loss": 0.8822,
"step": 73
},
{
"epoch": 0.038586885673315086,
"grad_norm": 1.717893123626709,
"learning_rate": 2.22e-05,
"loss": 0.8106,
"step": 74
},
{
"epoch": 0.03910833007430583,
"grad_norm": 1.975976586341858,
"learning_rate": 2.25e-05,
"loss": 0.8286,
"step": 75
},
{
"epoch": 0.03962977447529657,
"grad_norm": 1.7399530410766602,
"learning_rate": 2.2800000000000002e-05,
"loss": 0.7989,
"step": 76
},
{
"epoch": 0.040151218876287316,
"grad_norm": 1.8633719682693481,
"learning_rate": 2.31e-05,
"loss": 0.8008,
"step": 77
},
{
"epoch": 0.04067266327727806,
"grad_norm": 1.90866219997406,
"learning_rate": 2.3400000000000003e-05,
"loss": 0.9197,
"step": 78
},
{
"epoch": 0.0411941076782688,
"grad_norm": 2.1896774768829346,
"learning_rate": 2.37e-05,
"loss": 1.1467,
"step": 79
},
{
"epoch": 0.041715552079259546,
"grad_norm": 1.8817816972732544,
"learning_rate": 2.4e-05,
"loss": 0.8939,
"step": 80
},
{
"epoch": 0.0422369964802503,
"grad_norm": 2.845698118209839,
"learning_rate": 2.4300000000000005e-05,
"loss": 1.0723,
"step": 81
},
{
"epoch": 0.04275844088124104,
"grad_norm": 2.0808112621307373,
"learning_rate": 2.46e-05,
"loss": 0.8704,
"step": 82
},
{
"epoch": 0.043279885282231784,
"grad_norm": 1.8603535890579224,
"learning_rate": 2.4900000000000002e-05,
"loss": 0.9147,
"step": 83
},
{
"epoch": 0.04380132968322253,
"grad_norm": 2.1939947605133057,
"learning_rate": 2.5200000000000003e-05,
"loss": 0.9729,
"step": 84
},
{
"epoch": 0.04432277408421327,
"grad_norm": 2.237332344055176,
"learning_rate": 2.55e-05,
"loss": 0.8685,
"step": 85
},
{
"epoch": 0.044844218485204014,
"grad_norm": 1.8912482261657715,
"learning_rate": 2.5800000000000004e-05,
"loss": 0.9319,
"step": 86
},
{
"epoch": 0.04536566288619476,
"grad_norm": 2.4605658054351807,
"learning_rate": 2.61e-05,
"loss": 0.9852,
"step": 87
},
{
"epoch": 0.0458871072871855,
"grad_norm": 2.3050081729888916,
"learning_rate": 2.64e-05,
"loss": 0.9648,
"step": 88
},
{
"epoch": 0.04640855168817625,
"grad_norm": 2.2562782764434814,
"learning_rate": 2.6700000000000005e-05,
"loss": 0.9906,
"step": 89
},
{
"epoch": 0.046929996089166995,
"grad_norm": 2.136019468307495,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.9215,
"step": 90
},
{
"epoch": 0.04745144049015774,
"grad_norm": 2.065573215484619,
"learning_rate": 2.7300000000000003e-05,
"loss": 1.0027,
"step": 91
},
{
"epoch": 0.04797288489114848,
"grad_norm": 2.3429982662200928,
"learning_rate": 2.76e-05,
"loss": 1.0135,
"step": 92
},
{
"epoch": 0.048494329292139225,
"grad_norm": 2.2213521003723145,
"learning_rate": 2.79e-05,
"loss": 0.9772,
"step": 93
},
{
"epoch": 0.04901577369312997,
"grad_norm": 2.0035593509674072,
"learning_rate": 2.8200000000000004e-05,
"loss": 0.8933,
"step": 94
},
{
"epoch": 0.04953721809412071,
"grad_norm": 2.8717164993286133,
"learning_rate": 2.85e-05,
"loss": 1.164,
"step": 95
},
{
"epoch": 0.050058662495111456,
"grad_norm": 2.441497325897217,
"learning_rate": 2.8800000000000002e-05,
"loss": 1.0731,
"step": 96
},
{
"epoch": 0.050580106896102206,
"grad_norm": 2.3798351287841797,
"learning_rate": 2.91e-05,
"loss": 1.0519,
"step": 97
},
{
"epoch": 0.05110155129709295,
"grad_norm": 2.8075406551361084,
"learning_rate": 2.94e-05,
"loss": 1.1309,
"step": 98
},
{
"epoch": 0.05162299569808369,
"grad_norm": 3.4404821395874023,
"learning_rate": 2.9700000000000004e-05,
"loss": 1.1844,
"step": 99
},
{
"epoch": 0.052144440099074436,
"grad_norm": 3.207118272781372,
"learning_rate": 3e-05,
"loss": 1.0556,
"step": 100
},
{
"epoch": 0.05266588450006518,
"grad_norm": 1.896147608757019,
"learning_rate": 3.03e-05,
"loss": 0.5913,
"step": 101
},
{
"epoch": 0.05318732890105592,
"grad_norm": 1.304902195930481,
"learning_rate": 3.0600000000000005e-05,
"loss": 0.4833,
"step": 102
},
{
"epoch": 0.05370877330204667,
"grad_norm": 0.9159135222434998,
"learning_rate": 3.09e-05,
"loss": 0.2707,
"step": 103
},
{
"epoch": 0.05423021770303741,
"grad_norm": 0.8160455822944641,
"learning_rate": 3.1200000000000006e-05,
"loss": 0.252,
"step": 104
},
{
"epoch": 0.05475166210402816,
"grad_norm": 0.8945296406745911,
"learning_rate": 3.15e-05,
"loss": 0.3627,
"step": 105
},
{
"epoch": 0.055273106505018904,
"grad_norm": 1.5228551626205444,
"learning_rate": 3.18e-05,
"loss": 0.5862,
"step": 106
},
{
"epoch": 0.05579455090600965,
"grad_norm": 1.3707317113876343,
"learning_rate": 3.21e-05,
"loss": 0.5722,
"step": 107
},
{
"epoch": 0.05631599530700039,
"grad_norm": 1.5994707345962524,
"learning_rate": 3.24e-05,
"loss": 0.7169,
"step": 108
},
{
"epoch": 0.056837439707991134,
"grad_norm": 1.4748954772949219,
"learning_rate": 3.27e-05,
"loss": 0.6509,
"step": 109
},
{
"epoch": 0.05735888410898188,
"grad_norm": 1.50614595413208,
"learning_rate": 3.3e-05,
"loss": 0.7559,
"step": 110
},
{
"epoch": 0.05788032850997262,
"grad_norm": 1.55193293094635,
"learning_rate": 3.33e-05,
"loss": 0.824,
"step": 111
},
{
"epoch": 0.05840177291096337,
"grad_norm": 1.3997102975845337,
"learning_rate": 3.3600000000000004e-05,
"loss": 0.6819,
"step": 112
},
{
"epoch": 0.058923217311954115,
"grad_norm": 1.7437316179275513,
"learning_rate": 3.39e-05,
"loss": 0.7038,
"step": 113
},
{
"epoch": 0.05944466171294486,
"grad_norm": 1.4148025512695312,
"learning_rate": 3.4200000000000005e-05,
"loss": 0.7393,
"step": 114
},
{
"epoch": 0.0599661061139356,
"grad_norm": 1.8449006080627441,
"learning_rate": 3.4500000000000005e-05,
"loss": 0.834,
"step": 115
},
{
"epoch": 0.060487550514926346,
"grad_norm": 2.5175585746765137,
"learning_rate": 3.48e-05,
"loss": 0.689,
"step": 116
},
{
"epoch": 0.06100899491591709,
"grad_norm": 1.7341601848602295,
"learning_rate": 3.5100000000000006e-05,
"loss": 0.767,
"step": 117
},
{
"epoch": 0.06153043931690783,
"grad_norm": 1.4200981855392456,
"learning_rate": 3.54e-05,
"loss": 0.6371,
"step": 118
},
{
"epoch": 0.062051883717898576,
"grad_norm": 1.6563234329223633,
"learning_rate": 3.57e-05,
"loss": 0.7777,
"step": 119
},
{
"epoch": 0.06257332811888933,
"grad_norm": 1.6099658012390137,
"learning_rate": 3.6e-05,
"loss": 0.6769,
"step": 120
},
{
"epoch": 0.06309477251988006,
"grad_norm": 1.732650637626648,
"learning_rate": 3.63e-05,
"loss": 0.9144,
"step": 121
},
{
"epoch": 0.06361621692087081,
"grad_norm": 1.4773802757263184,
"learning_rate": 3.66e-05,
"loss": 0.6909,
"step": 122
},
{
"epoch": 0.06413766132186155,
"grad_norm": 1.7683815956115723,
"learning_rate": 3.69e-05,
"loss": 0.8182,
"step": 123
},
{
"epoch": 0.0646591057228523,
"grad_norm": 1.900134563446045,
"learning_rate": 3.72e-05,
"loss": 0.834,
"step": 124
},
{
"epoch": 0.06518055012384305,
"grad_norm": 2.2528598308563232,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.9006,
"step": 125
},
{
"epoch": 0.06570199452483379,
"grad_norm": 1.7022476196289062,
"learning_rate": 3.7800000000000004e-05,
"loss": 0.763,
"step": 126
},
{
"epoch": 0.06622343892582454,
"grad_norm": 2.2806150913238525,
"learning_rate": 3.8100000000000005e-05,
"loss": 0.968,
"step": 127
},
{
"epoch": 0.06674488332681527,
"grad_norm": 1.5035136938095093,
"learning_rate": 3.8400000000000005e-05,
"loss": 0.7268,
"step": 128
},
{
"epoch": 0.06726632772780602,
"grad_norm": 1.94576096534729,
"learning_rate": 3.87e-05,
"loss": 0.7615,
"step": 129
},
{
"epoch": 0.06778777212879676,
"grad_norm": 1.336945652961731,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.6875,
"step": 130
},
{
"epoch": 0.06830921652978751,
"grad_norm": 1.500450849533081,
"learning_rate": 3.93e-05,
"loss": 0.7817,
"step": 131
},
{
"epoch": 0.06883066093077826,
"grad_norm": 1.852668046951294,
"learning_rate": 3.96e-05,
"loss": 0.8267,
"step": 132
},
{
"epoch": 0.069352105331769,
"grad_norm": 1.9518615007400513,
"learning_rate": 3.990000000000001e-05,
"loss": 0.897,
"step": 133
},
{
"epoch": 0.06987354973275975,
"grad_norm": 1.9608622789382935,
"learning_rate": 4.02e-05,
"loss": 0.956,
"step": 134
},
{
"epoch": 0.07039499413375049,
"grad_norm": 1.6532974243164062,
"learning_rate": 4.05e-05,
"loss": 0.8116,
"step": 135
},
{
"epoch": 0.07091643853474124,
"grad_norm": 1.9795348644256592,
"learning_rate": 4.08e-05,
"loss": 0.9266,
"step": 136
},
{
"epoch": 0.07143788293573197,
"grad_norm": 1.6830356121063232,
"learning_rate": 4.11e-05,
"loss": 0.7564,
"step": 137
},
{
"epoch": 0.07195932733672272,
"grad_norm": 2.111184597015381,
"learning_rate": 4.1400000000000003e-05,
"loss": 0.9741,
"step": 138
},
{
"epoch": 0.07248077173771347,
"grad_norm": 2.1398720741271973,
"learning_rate": 4.1700000000000004e-05,
"loss": 0.9081,
"step": 139
},
{
"epoch": 0.07300221613870421,
"grad_norm": 1.9623899459838867,
"learning_rate": 4.2000000000000004e-05,
"loss": 0.8844,
"step": 140
},
{
"epoch": 0.07352366053969496,
"grad_norm": 2.0776987075805664,
"learning_rate": 4.23e-05,
"loss": 0.9306,
"step": 141
},
{
"epoch": 0.0740451049406857,
"grad_norm": 2.0103209018707275,
"learning_rate": 4.2600000000000005e-05,
"loss": 0.8967,
"step": 142
},
{
"epoch": 0.07456654934167645,
"grad_norm": 1.9392935037612915,
"learning_rate": 4.2900000000000006e-05,
"loss": 0.8938,
"step": 143
},
{
"epoch": 0.07508799374266718,
"grad_norm": 2.041646957397461,
"learning_rate": 4.32e-05,
"loss": 0.9007,
"step": 144
},
{
"epoch": 0.07560943814365793,
"grad_norm": 2.416090726852417,
"learning_rate": 4.35e-05,
"loss": 1.096,
"step": 145
},
{
"epoch": 0.07613088254464867,
"grad_norm": 2.0825910568237305,
"learning_rate": 4.380000000000001e-05,
"loss": 1.0315,
"step": 146
},
{
"epoch": 0.07665232694563942,
"grad_norm": 3.348813533782959,
"learning_rate": 4.41e-05,
"loss": 0.9972,
"step": 147
},
{
"epoch": 0.07717377134663017,
"grad_norm": 3.285196304321289,
"learning_rate": 4.44e-05,
"loss": 1.051,
"step": 148
},
{
"epoch": 0.07769521574762091,
"grad_norm": 2.2454581260681152,
"learning_rate": 4.47e-05,
"loss": 0.9835,
"step": 149
},
{
"epoch": 0.07821666014861166,
"grad_norm": 2.6450259685516357,
"learning_rate": 4.5e-05,
"loss": 1.0683,
"step": 150
},
{
"epoch": 0.07821666014861166,
"eval_loss": 0.798732578754425,
"eval_runtime": 326.8195,
"eval_samples_per_second": 19.766,
"eval_steps_per_second": 4.942,
"step": 150
},
{
"epoch": 0.0787381045496024,
"grad_norm": 1.49418306350708,
"learning_rate": 4.5299999999999997e-05,
"loss": 0.4889,
"step": 151
},
{
"epoch": 0.07925954895059314,
"grad_norm": 1.3698493242263794,
"learning_rate": 4.5600000000000004e-05,
"loss": 0.5986,
"step": 152
},
{
"epoch": 0.07978099335158388,
"grad_norm": 0.803355872631073,
"learning_rate": 4.5900000000000004e-05,
"loss": 0.2633,
"step": 153
},
{
"epoch": 0.08030243775257463,
"grad_norm": 0.6260421276092529,
"learning_rate": 4.62e-05,
"loss": 0.242,
"step": 154
},
{
"epoch": 0.08082388215356538,
"grad_norm": 0.6748026013374329,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.3132,
"step": 155
},
{
"epoch": 0.08134532655455612,
"grad_norm": 1.0539573431015015,
"learning_rate": 4.6800000000000006e-05,
"loss": 0.4188,
"step": 156
},
{
"epoch": 0.08186677095554687,
"grad_norm": 1.2083957195281982,
"learning_rate": 4.71e-05,
"loss": 0.4227,
"step": 157
},
{
"epoch": 0.0823882153565376,
"grad_norm": 1.3534812927246094,
"learning_rate": 4.74e-05,
"loss": 0.7787,
"step": 158
},
{
"epoch": 0.08290965975752836,
"grad_norm": 1.4358162879943848,
"learning_rate": 4.770000000000001e-05,
"loss": 0.6399,
"step": 159
},
{
"epoch": 0.08343110415851909,
"grad_norm": 1.3026161193847656,
"learning_rate": 4.8e-05,
"loss": 0.7389,
"step": 160
},
{
"epoch": 0.08395254855950984,
"grad_norm": 1.1461225748062134,
"learning_rate": 4.83e-05,
"loss": 0.6116,
"step": 161
},
{
"epoch": 0.0844739929605006,
"grad_norm": 1.6268322467803955,
"learning_rate": 4.860000000000001e-05,
"loss": 0.873,
"step": 162
},
{
"epoch": 0.08499543736149133,
"grad_norm": 1.3255441188812256,
"learning_rate": 4.89e-05,
"loss": 0.6859,
"step": 163
},
{
"epoch": 0.08551688176248208,
"grad_norm": 1.3837733268737793,
"learning_rate": 4.92e-05,
"loss": 0.7706,
"step": 164
},
{
"epoch": 0.08603832616347282,
"grad_norm": 1.6087336540222168,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.757,
"step": 165
},
{
"epoch": 0.08655977056446357,
"grad_norm": 1.297447681427002,
"learning_rate": 4.9800000000000004e-05,
"loss": 0.6959,
"step": 166
},
{
"epoch": 0.0870812149654543,
"grad_norm": 1.6905202865600586,
"learning_rate": 5.01e-05,
"loss": 0.7988,
"step": 167
},
{
"epoch": 0.08760265936644505,
"grad_norm": 1.275194764137268,
"learning_rate": 5.0400000000000005e-05,
"loss": 0.6706,
"step": 168
},
{
"epoch": 0.08812410376743579,
"grad_norm": 1.92887282371521,
"learning_rate": 5.0700000000000006e-05,
"loss": 0.9816,
"step": 169
},
{
"epoch": 0.08864554816842654,
"grad_norm": 1.59506356716156,
"learning_rate": 5.1e-05,
"loss": 0.7933,
"step": 170
},
{
"epoch": 0.08916699256941729,
"grad_norm": 1.4300225973129272,
"learning_rate": 5.13e-05,
"loss": 0.672,
"step": 171
},
{
"epoch": 0.08968843697040803,
"grad_norm": 1.4769190549850464,
"learning_rate": 5.160000000000001e-05,
"loss": 0.7425,
"step": 172
},
{
"epoch": 0.09020988137139878,
"grad_norm": 1.579256296157837,
"learning_rate": 5.19e-05,
"loss": 0.8161,
"step": 173
},
{
"epoch": 0.09073132577238952,
"grad_norm": 1.493124008178711,
"learning_rate": 5.22e-05,
"loss": 0.7736,
"step": 174
},
{
"epoch": 0.09125277017338027,
"grad_norm": 1.3196746110916138,
"learning_rate": 5.250000000000001e-05,
"loss": 0.7727,
"step": 175
},
{
"epoch": 0.091774214574371,
"grad_norm": 1.5161240100860596,
"learning_rate": 5.28e-05,
"loss": 0.8462,
"step": 176
},
{
"epoch": 0.09229565897536175,
"grad_norm": 1.5585005283355713,
"learning_rate": 5.31e-05,
"loss": 0.7596,
"step": 177
},
{
"epoch": 0.0928171033763525,
"grad_norm": 1.3152117729187012,
"learning_rate": 5.340000000000001e-05,
"loss": 0.6694,
"step": 178
},
{
"epoch": 0.09333854777734324,
"grad_norm": 1.369708776473999,
"learning_rate": 5.3700000000000004e-05,
"loss": 0.744,
"step": 179
},
{
"epoch": 0.09385999217833399,
"grad_norm": 1.481123924255371,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.8273,
"step": 180
},
{
"epoch": 0.09438143657932473,
"grad_norm": 1.6673870086669922,
"learning_rate": 5.4300000000000005e-05,
"loss": 0.9154,
"step": 181
},
{
"epoch": 0.09490288098031548,
"grad_norm": 1.506094217300415,
"learning_rate": 5.4600000000000006e-05,
"loss": 0.7489,
"step": 182
},
{
"epoch": 0.09542432538130621,
"grad_norm": 1.2892239093780518,
"learning_rate": 5.49e-05,
"loss": 0.6577,
"step": 183
},
{
"epoch": 0.09594576978229696,
"grad_norm": 1.4567745923995972,
"learning_rate": 5.52e-05,
"loss": 0.6866,
"step": 184
},
{
"epoch": 0.0964672141832877,
"grad_norm": 1.7683115005493164,
"learning_rate": 5.550000000000001e-05,
"loss": 0.8814,
"step": 185
},
{
"epoch": 0.09698865858427845,
"grad_norm": 1.5696678161621094,
"learning_rate": 5.58e-05,
"loss": 0.7922,
"step": 186
},
{
"epoch": 0.0975101029852692,
"grad_norm": 1.6422466039657593,
"learning_rate": 5.61e-05,
"loss": 0.856,
"step": 187
},
{
"epoch": 0.09803154738625994,
"grad_norm": 1.8678178787231445,
"learning_rate": 5.640000000000001e-05,
"loss": 0.9425,
"step": 188
},
{
"epoch": 0.09855299178725069,
"grad_norm": 1.8227742910385132,
"learning_rate": 5.67e-05,
"loss": 0.9673,
"step": 189
},
{
"epoch": 0.09907443618824142,
"grad_norm": 2.0066490173339844,
"learning_rate": 5.7e-05,
"loss": 0.8352,
"step": 190
},
{
"epoch": 0.09959588058923217,
"grad_norm": 2.3402273654937744,
"learning_rate": 5.730000000000001e-05,
"loss": 1.0273,
"step": 191
},
{
"epoch": 0.10011732499022291,
"grad_norm": 1.6430705785751343,
"learning_rate": 5.7600000000000004e-05,
"loss": 0.9703,
"step": 192
},
{
"epoch": 0.10063876939121366,
"grad_norm": 1.6317139863967896,
"learning_rate": 5.7900000000000005e-05,
"loss": 0.9602,
"step": 193
},
{
"epoch": 0.10116021379220441,
"grad_norm": 2.0906260013580322,
"learning_rate": 5.82e-05,
"loss": 1.0714,
"step": 194
},
{
"epoch": 0.10168165819319515,
"grad_norm": 1.8493003845214844,
"learning_rate": 5.8500000000000006e-05,
"loss": 0.8996,
"step": 195
},
{
"epoch": 0.1022031025941859,
"grad_norm": 1.9314156770706177,
"learning_rate": 5.88e-05,
"loss": 0.8453,
"step": 196
},
{
"epoch": 0.10272454699517664,
"grad_norm": 2.586416244506836,
"learning_rate": 5.91e-05,
"loss": 0.9768,
"step": 197
},
{
"epoch": 0.10324599139616739,
"grad_norm": 2.3081066608428955,
"learning_rate": 5.940000000000001e-05,
"loss": 1.1049,
"step": 198
},
{
"epoch": 0.10376743579715812,
"grad_norm": 1.8214553594589233,
"learning_rate": 5.97e-05,
"loss": 0.9555,
"step": 199
},
{
"epoch": 0.10428888019814887,
"grad_norm": 2.6920111179351807,
"learning_rate": 6e-05,
"loss": 1.1088,
"step": 200
},
{
"epoch": 0.10481032459913962,
"grad_norm": 1.3093756437301636,
"learning_rate": 6.030000000000001e-05,
"loss": 0.4705,
"step": 201
},
{
"epoch": 0.10533176900013036,
"grad_norm": 1.7705165147781372,
"learning_rate": 6.06e-05,
"loss": 0.701,
"step": 202
},
{
"epoch": 0.10585321340112111,
"grad_norm": 0.6416582465171814,
"learning_rate": 6.09e-05,
"loss": 0.3082,
"step": 203
},
{
"epoch": 0.10637465780211185,
"grad_norm": 0.7174233794212341,
"learning_rate": 6.120000000000001e-05,
"loss": 0.2373,
"step": 204
},
{
"epoch": 0.1068961022031026,
"grad_norm": 0.8111220002174377,
"learning_rate": 6.15e-05,
"loss": 0.2997,
"step": 205
},
{
"epoch": 0.10741754660409333,
"grad_norm": 0.9542537331581116,
"learning_rate": 6.18e-05,
"loss": 0.4703,
"step": 206
},
{
"epoch": 0.10793899100508408,
"grad_norm": 1.1576972007751465,
"learning_rate": 6.21e-05,
"loss": 0.5591,
"step": 207
},
{
"epoch": 0.10846043540607482,
"grad_norm": 1.1840004920959473,
"learning_rate": 6.240000000000001e-05,
"loss": 0.6656,
"step": 208
},
{
"epoch": 0.10898187980706557,
"grad_norm": 1.0436705350875854,
"learning_rate": 6.27e-05,
"loss": 0.5518,
"step": 209
},
{
"epoch": 0.10950332420805632,
"grad_norm": 1.2548532485961914,
"learning_rate": 6.3e-05,
"loss": 0.6519,
"step": 210
},
{
"epoch": 0.11002476860904706,
"grad_norm": 1.2066162824630737,
"learning_rate": 6.330000000000001e-05,
"loss": 0.6456,
"step": 211
},
{
"epoch": 0.11054621301003781,
"grad_norm": 1.4366339445114136,
"learning_rate": 6.36e-05,
"loss": 0.6932,
"step": 212
},
{
"epoch": 0.11106765741102854,
"grad_norm": 1.3649702072143555,
"learning_rate": 6.39e-05,
"loss": 0.8679,
"step": 213
},
{
"epoch": 0.1115891018120193,
"grad_norm": 1.5654956102371216,
"learning_rate": 6.42e-05,
"loss": 0.756,
"step": 214
},
{
"epoch": 0.11211054621301003,
"grad_norm": 1.8356409072875977,
"learning_rate": 6.450000000000001e-05,
"loss": 0.812,
"step": 215
},
{
"epoch": 0.11263199061400078,
"grad_norm": 1.1078040599822998,
"learning_rate": 6.48e-05,
"loss": 0.6616,
"step": 216
},
{
"epoch": 0.11315343501499153,
"grad_norm": 1.1788630485534668,
"learning_rate": 6.510000000000001e-05,
"loss": 0.6678,
"step": 217
},
{
"epoch": 0.11367487941598227,
"grad_norm": 1.3031374216079712,
"learning_rate": 6.54e-05,
"loss": 0.8129,
"step": 218
},
{
"epoch": 0.11419632381697302,
"grad_norm": 1.143462896347046,
"learning_rate": 6.57e-05,
"loss": 0.6625,
"step": 219
},
{
"epoch": 0.11471776821796376,
"grad_norm": 1.7338825464248657,
"learning_rate": 6.6e-05,
"loss": 0.6778,
"step": 220
},
{
"epoch": 0.1152392126189545,
"grad_norm": 1.1185352802276611,
"learning_rate": 6.630000000000001e-05,
"loss": 0.6101,
"step": 221
},
{
"epoch": 0.11576065701994524,
"grad_norm": 1.209920883178711,
"learning_rate": 6.66e-05,
"loss": 0.655,
"step": 222
},
{
"epoch": 0.116282101420936,
"grad_norm": 1.3095269203186035,
"learning_rate": 6.69e-05,
"loss": 0.7194,
"step": 223
},
{
"epoch": 0.11680354582192674,
"grad_norm": 1.419487476348877,
"learning_rate": 6.720000000000001e-05,
"loss": 0.7749,
"step": 224
},
{
"epoch": 0.11732499022291748,
"grad_norm": 1.5805034637451172,
"learning_rate": 6.75e-05,
"loss": 0.8271,
"step": 225
},
{
"epoch": 0.11784643462390823,
"grad_norm": 1.448123812675476,
"learning_rate": 6.78e-05,
"loss": 0.821,
"step": 226
},
{
"epoch": 0.11836787902489897,
"grad_norm": 1.3406933546066284,
"learning_rate": 6.81e-05,
"loss": 0.7887,
"step": 227
},
{
"epoch": 0.11888932342588972,
"grad_norm": 1.478799819946289,
"learning_rate": 6.840000000000001e-05,
"loss": 0.8718,
"step": 228
},
{
"epoch": 0.11941076782688045,
"grad_norm": 1.4787107706069946,
"learning_rate": 6.87e-05,
"loss": 0.7741,
"step": 229
},
{
"epoch": 0.1199322122278712,
"grad_norm": 1.3887667655944824,
"learning_rate": 6.900000000000001e-05,
"loss": 0.7922,
"step": 230
},
{
"epoch": 0.12045365662886194,
"grad_norm": 1.1378411054611206,
"learning_rate": 6.93e-05,
"loss": 0.7046,
"step": 231
},
{
"epoch": 0.12097510102985269,
"grad_norm": 1.34485924243927,
"learning_rate": 6.96e-05,
"loss": 0.7417,
"step": 232
},
{
"epoch": 0.12149654543084344,
"grad_norm": 1.3890823125839233,
"learning_rate": 6.99e-05,
"loss": 0.7552,
"step": 233
},
{
"epoch": 0.12201798983183418,
"grad_norm": 1.3711472749710083,
"learning_rate": 7.020000000000001e-05,
"loss": 0.6796,
"step": 234
},
{
"epoch": 0.12253943423282493,
"grad_norm": 1.1514718532562256,
"learning_rate": 7.05e-05,
"loss": 0.6841,
"step": 235
},
{
"epoch": 0.12306087863381567,
"grad_norm": 1.24899423122406,
"learning_rate": 7.08e-05,
"loss": 0.6578,
"step": 236
},
{
"epoch": 0.12358232303480642,
"grad_norm": 1.4886633157730103,
"learning_rate": 7.110000000000001e-05,
"loss": 0.753,
"step": 237
},
{
"epoch": 0.12410376743579715,
"grad_norm": 1.5798087120056152,
"learning_rate": 7.14e-05,
"loss": 0.9315,
"step": 238
},
{
"epoch": 0.1246252118367879,
"grad_norm": 1.6789343357086182,
"learning_rate": 7.170000000000001e-05,
"loss": 0.9131,
"step": 239
},
{
"epoch": 0.12514665623777865,
"grad_norm": 1.7360730171203613,
"learning_rate": 7.2e-05,
"loss": 0.7529,
"step": 240
},
{
"epoch": 0.1256681006387694,
"grad_norm": 1.584789752960205,
"learning_rate": 7.230000000000001e-05,
"loss": 0.8746,
"step": 241
},
{
"epoch": 0.12618954503976013,
"grad_norm": 1.3675271272659302,
"learning_rate": 7.26e-05,
"loss": 0.8015,
"step": 242
},
{
"epoch": 0.12671098944075088,
"grad_norm": 1.3785395622253418,
"learning_rate": 7.290000000000001e-05,
"loss": 0.8389,
"step": 243
},
{
"epoch": 0.12723243384174163,
"grad_norm": 1.5574450492858887,
"learning_rate": 7.32e-05,
"loss": 0.9246,
"step": 244
},
{
"epoch": 0.12775387824273238,
"grad_norm": 1.614886999130249,
"learning_rate": 7.35e-05,
"loss": 0.919,
"step": 245
},
{
"epoch": 0.1282753226437231,
"grad_norm": 2.001286506652832,
"learning_rate": 7.38e-05,
"loss": 1.0433,
"step": 246
},
{
"epoch": 0.12879676704471385,
"grad_norm": 1.5525065660476685,
"learning_rate": 7.410000000000001e-05,
"loss": 0.8605,
"step": 247
},
{
"epoch": 0.1293182114457046,
"grad_norm": 2.074171304702759,
"learning_rate": 7.44e-05,
"loss": 1.0522,
"step": 248
},
{
"epoch": 0.12983965584669535,
"grad_norm": 1.8862979412078857,
"learning_rate": 7.47e-05,
"loss": 1.0324,
"step": 249
},
{
"epoch": 0.1303611002476861,
"grad_norm": 2.677232265472412,
"learning_rate": 7.500000000000001e-05,
"loss": 0.9479,
"step": 250
},
{
"epoch": 0.13088254464867682,
"grad_norm": 0.9016095995903015,
"learning_rate": 7.53e-05,
"loss": 0.4123,
"step": 251
},
{
"epoch": 0.13140398904966757,
"grad_norm": 1.1711385250091553,
"learning_rate": 7.560000000000001e-05,
"loss": 0.611,
"step": 252
},
{
"epoch": 0.13192543345065832,
"grad_norm": 0.638198971748352,
"learning_rate": 7.590000000000002e-05,
"loss": 0.3055,
"step": 253
},
{
"epoch": 0.13244687785164908,
"grad_norm": 0.5108641982078552,
"learning_rate": 7.620000000000001e-05,
"loss": 0.2203,
"step": 254
},
{
"epoch": 0.13296832225263983,
"grad_norm": 0.6586244106292725,
"learning_rate": 7.65e-05,
"loss": 0.2306,
"step": 255
},
{
"epoch": 0.13348976665363055,
"grad_norm": 0.6634657382965088,
"learning_rate": 7.680000000000001e-05,
"loss": 0.2932,
"step": 256
},
{
"epoch": 0.1340112110546213,
"grad_norm": 0.8705341219902039,
"learning_rate": 7.71e-05,
"loss": 0.4632,
"step": 257
},
{
"epoch": 0.13453265545561205,
"grad_norm": 0.8588047027587891,
"learning_rate": 7.74e-05,
"loss": 0.5355,
"step": 258
},
{
"epoch": 0.1350540998566028,
"grad_norm": 0.929277241230011,
"learning_rate": 7.77e-05,
"loss": 0.5483,
"step": 259
},
{
"epoch": 0.13557554425759352,
"grad_norm": 0.911803126335144,
"learning_rate": 7.800000000000001e-05,
"loss": 0.4796,
"step": 260
},
{
"epoch": 0.13609698865858427,
"grad_norm": 1.0001978874206543,
"learning_rate": 7.83e-05,
"loss": 0.5725,
"step": 261
},
{
"epoch": 0.13661843305957502,
"grad_norm": 1.283967137336731,
"learning_rate": 7.86e-05,
"loss": 0.7197,
"step": 262
},
{
"epoch": 0.13713987746056577,
"grad_norm": 1.0027309656143188,
"learning_rate": 7.890000000000001e-05,
"loss": 0.494,
"step": 263
},
{
"epoch": 0.13766132186155652,
"grad_norm": 1.1058956384658813,
"learning_rate": 7.92e-05,
"loss": 0.7252,
"step": 264
},
{
"epoch": 0.13818276626254725,
"grad_norm": 1.6565499305725098,
"learning_rate": 7.950000000000001e-05,
"loss": 0.6628,
"step": 265
},
{
"epoch": 0.138704210663538,
"grad_norm": 1.349332571029663,
"learning_rate": 7.980000000000002e-05,
"loss": 0.7285,
"step": 266
},
{
"epoch": 0.13922565506452875,
"grad_norm": 1.1405296325683594,
"learning_rate": 8.010000000000001e-05,
"loss": 0.6503,
"step": 267
},
{
"epoch": 0.1397470994655195,
"grad_norm": 1.5732964277267456,
"learning_rate": 8.04e-05,
"loss": 0.7544,
"step": 268
},
{
"epoch": 0.14026854386651022,
"grad_norm": 1.1660890579223633,
"learning_rate": 8.07e-05,
"loss": 0.6631,
"step": 269
},
{
"epoch": 0.14078998826750097,
"grad_norm": 1.1596927642822266,
"learning_rate": 8.1e-05,
"loss": 0.6831,
"step": 270
},
{
"epoch": 0.14131143266849172,
"grad_norm": 1.1583274602890015,
"learning_rate": 8.13e-05,
"loss": 0.6867,
"step": 271
},
{
"epoch": 0.14183287706948247,
"grad_norm": 1.351112723350525,
"learning_rate": 8.16e-05,
"loss": 0.8328,
"step": 272
},
{
"epoch": 0.14235432147047322,
"grad_norm": 1.3185946941375732,
"learning_rate": 8.190000000000001e-05,
"loss": 0.6922,
"step": 273
},
{
"epoch": 0.14287576587146394,
"grad_norm": 2.4567134380340576,
"learning_rate": 8.22e-05,
"loss": 0.8027,
"step": 274
},
{
"epoch": 0.1433972102724547,
"grad_norm": 1.0922107696533203,
"learning_rate": 8.25e-05,
"loss": 0.6832,
"step": 275
},
{
"epoch": 0.14391865467344545,
"grad_norm": 1.436450719833374,
"learning_rate": 8.280000000000001e-05,
"loss": 0.9163,
"step": 276
},
{
"epoch": 0.1444400990744362,
"grad_norm": 1.120586633682251,
"learning_rate": 8.31e-05,
"loss": 0.5632,
"step": 277
},
{
"epoch": 0.14496154347542695,
"grad_norm": 1.3133465051651,
"learning_rate": 8.340000000000001e-05,
"loss": 0.7018,
"step": 278
},
{
"epoch": 0.14548298787641767,
"grad_norm": 1.1073719263076782,
"learning_rate": 8.370000000000002e-05,
"loss": 0.7195,
"step": 279
},
{
"epoch": 0.14600443227740842,
"grad_norm": 1.1922098398208618,
"learning_rate": 8.400000000000001e-05,
"loss": 0.7307,
"step": 280
},
{
"epoch": 0.14652587667839917,
"grad_norm": 1.2493211030960083,
"learning_rate": 8.43e-05,
"loss": 0.9021,
"step": 281
},
{
"epoch": 0.14704732107938992,
"grad_norm": 1.2321152687072754,
"learning_rate": 8.46e-05,
"loss": 0.7489,
"step": 282
},
{
"epoch": 0.14756876548038064,
"grad_norm": 1.1605387926101685,
"learning_rate": 8.49e-05,
"loss": 0.6323,
"step": 283
},
{
"epoch": 0.1480902098813714,
"grad_norm": 1.3138997554779053,
"learning_rate": 8.520000000000001e-05,
"loss": 0.6877,
"step": 284
},
{
"epoch": 0.14861165428236214,
"grad_norm": 1.4798433780670166,
"learning_rate": 8.55e-05,
"loss": 0.9398,
"step": 285
},
{
"epoch": 0.1491330986833529,
"grad_norm": 1.557124376296997,
"learning_rate": 8.580000000000001e-05,
"loss": 0.8886,
"step": 286
},
{
"epoch": 0.14965454308434364,
"grad_norm": 1.216688871383667,
"learning_rate": 8.61e-05,
"loss": 0.77,
"step": 287
},
{
"epoch": 0.15017598748533437,
"grad_norm": 1.1777106523513794,
"learning_rate": 8.64e-05,
"loss": 0.7147,
"step": 288
},
{
"epoch": 0.15069743188632512,
"grad_norm": 1.3893619775772095,
"learning_rate": 8.67e-05,
"loss": 0.8269,
"step": 289
},
{
"epoch": 0.15121887628731587,
"grad_norm": 1.4378941059112549,
"learning_rate": 8.7e-05,
"loss": 0.8203,
"step": 290
},
{
"epoch": 0.15174032068830662,
"grad_norm": 1.389863133430481,
"learning_rate": 8.730000000000001e-05,
"loss": 0.8993,
"step": 291
},
{
"epoch": 0.15226176508929734,
"grad_norm": 1.3715370893478394,
"learning_rate": 8.760000000000002e-05,
"loss": 0.8406,
"step": 292
},
{
"epoch": 0.1527832094902881,
"grad_norm": 1.3929187059402466,
"learning_rate": 8.790000000000001e-05,
"loss": 0.785,
"step": 293
},
{
"epoch": 0.15330465389127884,
"grad_norm": 1.5916658639907837,
"learning_rate": 8.82e-05,
"loss": 0.9503,
"step": 294
},
{
"epoch": 0.1538260982922696,
"grad_norm": 1.4390002489089966,
"learning_rate": 8.85e-05,
"loss": 0.8936,
"step": 295
},
{
"epoch": 0.15434754269326034,
"grad_norm": 1.450700044631958,
"learning_rate": 8.88e-05,
"loss": 0.9442,
"step": 296
},
{
"epoch": 0.15486898709425106,
"grad_norm": 1.267523169517517,
"learning_rate": 8.910000000000001e-05,
"loss": 0.8459,
"step": 297
},
{
"epoch": 0.15539043149524182,
"grad_norm": 1.91372811794281,
"learning_rate": 8.94e-05,
"loss": 0.9787,
"step": 298
},
{
"epoch": 0.15591187589623257,
"grad_norm": 1.8635175228118896,
"learning_rate": 8.970000000000001e-05,
"loss": 0.8864,
"step": 299
},
{
"epoch": 0.15643332029722332,
"grad_norm": 2.4946999549865723,
"learning_rate": 9e-05,
"loss": 1.0969,
"step": 300
},
{
"epoch": 0.15643332029722332,
"eval_loss": 0.7402325868606567,
"eval_runtime": 326.3494,
"eval_samples_per_second": 19.795,
"eval_steps_per_second": 4.949,
"step": 300
},
{
"epoch": 0.15695476469821407,
"grad_norm": 0.9152140617370605,
"learning_rate": 8.999753262144806e-05,
"loss": 0.4045,
"step": 301
},
{
"epoch": 0.1574762090992048,
"grad_norm": 0.6910588145256042,
"learning_rate": 8.999013075636805e-05,
"loss": 0.3735,
"step": 302
},
{
"epoch": 0.15799765350019554,
"grad_norm": 0.5631867051124573,
"learning_rate": 8.997779521645793e-05,
"loss": 0.2266,
"step": 303
},
{
"epoch": 0.1585190979011863,
"grad_norm": 0.48377081751823425,
"learning_rate": 8.996052735444863e-05,
"loss": 0.2233,
"step": 304
},
{
"epoch": 0.15904054230217704,
"grad_norm": 0.6671403646469116,
"learning_rate": 8.993832906395582e-05,
"loss": 0.358,
"step": 305
},
{
"epoch": 0.15956198670316776,
"grad_norm": 0.8236808180809021,
"learning_rate": 8.991120277927223e-05,
"loss": 0.4056,
"step": 306
},
{
"epoch": 0.1600834311041585,
"grad_norm": 1.044965386390686,
"learning_rate": 8.987915147510061e-05,
"loss": 0.6135,
"step": 307
},
{
"epoch": 0.16060487550514926,
"grad_norm": 1.1093504428863525,
"learning_rate": 8.98421786662277e-05,
"loss": 0.675,
"step": 308
},
{
"epoch": 0.16112631990614001,
"grad_norm": 1.1360151767730713,
"learning_rate": 8.98002884071386e-05,
"loss": 0.7139,
"step": 309
},
{
"epoch": 0.16164776430713076,
"grad_norm": 0.9347630739212036,
"learning_rate": 8.97534852915723e-05,
"loss": 0.5709,
"step": 310
},
{
"epoch": 0.1621692087081215,
"grad_norm": 1.0603522062301636,
"learning_rate": 8.970177445201784e-05,
"loss": 0.6105,
"step": 311
},
{
"epoch": 0.16269065310911224,
"grad_norm": 1.0385960340499878,
"learning_rate": 8.964516155915151e-05,
"loss": 0.6678,
"step": 312
},
{
"epoch": 0.163212097510103,
"grad_norm": 1.2471129894256592,
"learning_rate": 8.958365282121497e-05,
"loss": 0.6955,
"step": 313
},
{
"epoch": 0.16373354191109374,
"grad_norm": 1.2143276929855347,
"learning_rate": 8.951725498333449e-05,
"loss": 0.6995,
"step": 314
},
{
"epoch": 0.16425498631208446,
"grad_norm": 1.2376823425292969,
"learning_rate": 8.94459753267812e-05,
"loss": 0.8036,
"step": 315
},
{
"epoch": 0.1647764307130752,
"grad_norm": 1.17597234249115,
"learning_rate": 8.936982166817273e-05,
"loss": 0.7579,
"step": 316
},
{
"epoch": 0.16529787511406596,
"grad_norm": 1.0826165676116943,
"learning_rate": 8.928880235861588e-05,
"loss": 0.6702,
"step": 317
},
{
"epoch": 0.1658193195150567,
"grad_norm": 1.3289791345596313,
"learning_rate": 8.9202926282791e-05,
"loss": 0.8136,
"step": 318
},
{
"epoch": 0.16634076391604746,
"grad_norm": 1.95395827293396,
"learning_rate": 8.911220285797748e-05,
"loss": 0.8678,
"step": 319
},
{
"epoch": 0.16686220831703819,
"grad_norm": 1.2291593551635742,
"learning_rate": 8.901664203302126e-05,
"loss": 0.7329,
"step": 320
},
{
"epoch": 0.16738365271802894,
"grad_norm": 1.111344337463379,
"learning_rate": 8.891625428724365e-05,
"loss": 0.7421,
"step": 321
},
{
"epoch": 0.1679050971190197,
"grad_norm": 1.2865890264511108,
"learning_rate": 8.881105062929222e-05,
"loss": 0.8722,
"step": 322
},
{
"epoch": 0.16842654152001044,
"grad_norm": 1.30595064163208,
"learning_rate": 8.870104259593363e-05,
"loss": 0.8715,
"step": 323
},
{
"epoch": 0.1689479859210012,
"grad_norm": 1.0691040754318237,
"learning_rate": 8.858624225078841e-05,
"loss": 0.6839,
"step": 324
},
{
"epoch": 0.1694694303219919,
"grad_norm": 1.2839759588241577,
"learning_rate": 8.846666218300807e-05,
"loss": 0.7865,
"step": 325
},
{
"epoch": 0.16999087472298266,
"grad_norm": 1.0126097202301025,
"learning_rate": 8.834231550589462e-05,
"loss": 0.6566,
"step": 326
},
{
"epoch": 0.1705123191239734,
"grad_norm": 1.189987301826477,
"learning_rate": 8.821321585546244e-05,
"loss": 0.7741,
"step": 327
},
{
"epoch": 0.17103376352496416,
"grad_norm": 1.129711389541626,
"learning_rate": 8.807937738894302e-05,
"loss": 0.6625,
"step": 328
},
{
"epoch": 0.17155520792595488,
"grad_norm": 1.4810699224472046,
"learning_rate": 8.794081478323246e-05,
"loss": 0.8111,
"step": 329
},
{
"epoch": 0.17207665232694563,
"grad_norm": 1.334193468093872,
"learning_rate": 8.779754323328193e-05,
"loss": 0.7845,
"step": 330
},
{
"epoch": 0.17259809672793638,
"grad_norm": 1.2970231771469116,
"learning_rate": 8.764957845043137e-05,
"loss": 0.7514,
"step": 331
},
{
"epoch": 0.17311954112892713,
"grad_norm": 1.3496140241622925,
"learning_rate": 8.749693666068665e-05,
"loss": 0.7859,
"step": 332
},
{
"epoch": 0.17364098552991789,
"grad_norm": 1.1188700199127197,
"learning_rate": 8.733963460294016e-05,
"loss": 0.749,
"step": 333
},
{
"epoch": 0.1741624299309086,
"grad_norm": 1.1531932353973389,
"learning_rate": 8.717768952713513e-05,
"loss": 0.7462,
"step": 334
},
{
"epoch": 0.17468387433189936,
"grad_norm": 1.593352198600769,
"learning_rate": 8.701111919237408e-05,
"loss": 0.9956,
"step": 335
},
{
"epoch": 0.1752053187328901,
"grad_norm": 1.2148069143295288,
"learning_rate": 8.683994186497131e-05,
"loss": 0.7376,
"step": 336
},
{
"epoch": 0.17572676313388086,
"grad_norm": 1.196800708770752,
"learning_rate": 8.666417631644977e-05,
"loss": 0.8357,
"step": 337
},
{
"epoch": 0.17624820753487158,
"grad_norm": 2.3671152591705322,
"learning_rate": 8.648384182148252e-05,
"loss": 0.802,
"step": 338
},
{
"epoch": 0.17676965193586233,
"grad_norm": 1.3539149761199951,
"learning_rate": 8.629895815577916e-05,
"loss": 0.7685,
"step": 339
},
{
"epoch": 0.17729109633685308,
"grad_norm": 1.4018131494522095,
"learning_rate": 8.610954559391704e-05,
"loss": 0.8006,
"step": 340
},
{
"epoch": 0.17781254073784383,
"grad_norm": 1.288794994354248,
"learning_rate": 8.59156249071181e-05,
"loss": 0.7986,
"step": 341
},
{
"epoch": 0.17833398513883458,
"grad_norm": 1.3666220903396606,
"learning_rate": 8.571721736097089e-05,
"loss": 0.8825,
"step": 342
},
{
"epoch": 0.1788554295398253,
"grad_norm": 1.6995041370391846,
"learning_rate": 8.551434471309872e-05,
"loss": 0.7511,
"step": 343
},
{
"epoch": 0.17937687394081606,
"grad_norm": 1.766118049621582,
"learning_rate": 8.530702921077359e-05,
"loss": 0.8466,
"step": 344
},
{
"epoch": 0.1798983183418068,
"grad_norm": 1.5338678359985352,
"learning_rate": 8.509529358847655e-05,
"loss": 0.8606,
"step": 345
},
{
"epoch": 0.18041976274279756,
"grad_norm": 1.4264774322509766,
"learning_rate": 8.487916106540466e-05,
"loss": 0.9154,
"step": 346
},
{
"epoch": 0.1809412071437883,
"grad_norm": 1.4423105716705322,
"learning_rate": 8.465865534292465e-05,
"loss": 0.8203,
"step": 347
},
{
"epoch": 0.18146265154477903,
"grad_norm": 1.6251096725463867,
"learning_rate": 8.443380060197386e-05,
"loss": 0.9229,
"step": 348
},
{
"epoch": 0.18198409594576978,
"grad_norm": 1.8085907697677612,
"learning_rate": 8.420462150040853e-05,
"loss": 0.9639,
"step": 349
},
{
"epoch": 0.18250554034676053,
"grad_norm": 1.7975590229034424,
"learning_rate": 8.397114317029975e-05,
"loss": 0.9465,
"step": 350
},
{
"epoch": 0.18302698474775128,
"grad_norm": 0.7071613073348999,
"learning_rate": 8.373339121517747e-05,
"loss": 0.3601,
"step": 351
},
{
"epoch": 0.183548429148742,
"grad_norm": 0.8708938360214233,
"learning_rate": 8.34913917072228e-05,
"loss": 0.4189,
"step": 352
},
{
"epoch": 0.18406987354973275,
"grad_norm": 0.4563164710998535,
"learning_rate": 8.324517118440889e-05,
"loss": 0.2105,
"step": 353
},
{
"epoch": 0.1845913179507235,
"grad_norm": 0.4798504114151001,
"learning_rate": 8.299475664759068e-05,
"loss": 0.1919,
"step": 354
},
{
"epoch": 0.18511276235171426,
"grad_norm": 0.7082318663597107,
"learning_rate": 8.274017555754409e-05,
"loss": 0.3147,
"step": 355
},
{
"epoch": 0.185634206752705,
"grad_norm": 0.7861395478248596,
"learning_rate": 8.248145583195448e-05,
"loss": 0.441,
"step": 356
},
{
"epoch": 0.18615565115369573,
"grad_norm": 0.7495299577713013,
"learning_rate": 8.221862584235528e-05,
"loss": 0.4194,
"step": 357
},
{
"epoch": 0.18667709555468648,
"grad_norm": 1.1042672395706177,
"learning_rate": 8.195171441101669e-05,
"loss": 0.6837,
"step": 358
},
{
"epoch": 0.18719853995567723,
"grad_norm": 0.9663426280021667,
"learning_rate": 8.168075080778494e-05,
"loss": 0.5631,
"step": 359
},
{
"epoch": 0.18771998435666798,
"grad_norm": 1.1327295303344727,
"learning_rate": 8.140576474687264e-05,
"loss": 0.7862,
"step": 360
},
{
"epoch": 0.1882414287576587,
"grad_norm": 1.2756463289260864,
"learning_rate": 8.112678638360016e-05,
"loss": 0.755,
"step": 361
},
{
"epoch": 0.18876287315864945,
"grad_norm": 1.0837277173995972,
"learning_rate": 8.084384631108883e-05,
"loss": 0.6897,
"step": 362
},
{
"epoch": 0.1892843175596402,
"grad_norm": 1.006117343902588,
"learning_rate": 8.055697555690607e-05,
"loss": 0.708,
"step": 363
},
{
"epoch": 0.18980576196063095,
"grad_norm": 1.0935845375061035,
"learning_rate": 8.02662055796628e-05,
"loss": 0.6253,
"step": 364
},
{
"epoch": 0.1903272063616217,
"grad_norm": 1.0223681926727295,
"learning_rate": 7.99715682655637e-05,
"loss": 0.6429,
"step": 365
},
{
"epoch": 0.19084865076261243,
"grad_norm": 1.057639718055725,
"learning_rate": 7.967309592491052e-05,
"loss": 0.7681,
"step": 366
},
{
"epoch": 0.19137009516360318,
"grad_norm": 1.2563397884368896,
"learning_rate": 7.937082128855891e-05,
"loss": 0.7278,
"step": 367
},
{
"epoch": 0.19189153956459393,
"grad_norm": 1.2409113645553589,
"learning_rate": 7.906477750432904e-05,
"loss": 0.6566,
"step": 368
},
{
"epoch": 0.19241298396558468,
"grad_norm": 1.2019901275634766,
"learning_rate": 7.875499813337069e-05,
"loss": 0.8036,
"step": 369
},
{
"epoch": 0.1929344283665754,
"grad_norm": 0.9332624077796936,
"learning_rate": 7.844151714648274e-05,
"loss": 0.5595,
"step": 370
},
{
"epoch": 0.19345587276756615,
"grad_norm": 1.1587673425674438,
"learning_rate": 7.812436892038805e-05,
"loss": 0.6916,
"step": 371
},
{
"epoch": 0.1939773171685569,
"grad_norm": 1.2590214014053345,
"learning_rate": 7.780358823396352e-05,
"loss": 0.7037,
"step": 372
},
{
"epoch": 0.19449876156954765,
"grad_norm": 1.5824557542800903,
"learning_rate": 7.747921026442631e-05,
"loss": 0.8385,
"step": 373
},
{
"epoch": 0.1950202059705384,
"grad_norm": 1.312893271446228,
"learning_rate": 7.715127058347615e-05,
"loss": 0.8146,
"step": 374
},
{
"epoch": 0.19554165037152912,
"grad_norm": 1.2608100175857544,
"learning_rate": 7.681980515339464e-05,
"loss": 0.7516,
"step": 375
},
{
"epoch": 0.19606309477251987,
"grad_norm": 1.1570591926574707,
"learning_rate": 7.648485032310145e-05,
"loss": 0.7294,
"step": 376
},
{
"epoch": 0.19658453917351063,
"grad_norm": 1.0426164865493774,
"learning_rate": 7.614644282416831e-05,
"loss": 0.6835,
"step": 377
},
{
"epoch": 0.19710598357450138,
"grad_norm": 1.0416784286499023,
"learning_rate": 7.5804619766791e-05,
"loss": 0.6097,
"step": 378
},
{
"epoch": 0.19762742797549213,
"grad_norm": 1.2640820741653442,
"learning_rate": 7.545941863571974e-05,
"loss": 0.8251,
"step": 379
},
{
"epoch": 0.19814887237648285,
"grad_norm": 1.1552680730819702,
"learning_rate": 7.511087728614862e-05,
"loss": 0.7257,
"step": 380
},
{
"epoch": 0.1986703167774736,
"grad_norm": 1.2071866989135742,
"learning_rate": 7.475903393956434e-05,
"loss": 0.7659,
"step": 381
},
{
"epoch": 0.19919176117846435,
"grad_norm": 1.2022343873977661,
"learning_rate": 7.440392717955476e-05,
"loss": 0.7332,
"step": 382
},
{
"epoch": 0.1997132055794551,
"grad_norm": 1.2143396139144897,
"learning_rate": 7.404559594757779e-05,
"loss": 0.7158,
"step": 383
},
{
"epoch": 0.20023464998044582,
"grad_norm": 1.2017430067062378,
"learning_rate": 7.368407953869104e-05,
"loss": 0.7293,
"step": 384
},
{
"epoch": 0.20075609438143657,
"grad_norm": 1.0218538045883179,
"learning_rate": 7.33194175972427e-05,
"loss": 0.6618,
"step": 385
},
{
"epoch": 0.20127753878242732,
"grad_norm": 1.1736619472503662,
"learning_rate": 7.295165011252397e-05,
"loss": 0.7843,
"step": 386
},
{
"epoch": 0.20179898318341807,
"grad_norm": 1.3254735469818115,
"learning_rate": 7.258081741438396e-05,
"loss": 0.7968,
"step": 387
},
{
"epoch": 0.20232042758440882,
"grad_norm": 1.1880550384521484,
"learning_rate": 7.220696016880688e-05,
"loss": 0.7715,
"step": 388
},
{
"epoch": 0.20284187198539955,
"grad_norm": 1.1296610832214355,
"learning_rate": 7.183011937345271e-05,
"loss": 0.7269,
"step": 389
},
{
"epoch": 0.2033633163863903,
"grad_norm": 1.3146663904190063,
"learning_rate": 7.14503363531613e-05,
"loss": 0.8085,
"step": 390
},
{
"epoch": 0.20388476078738105,
"grad_norm": 1.3790757656097412,
"learning_rate": 7.106765275542055e-05,
"loss": 0.7678,
"step": 391
},
{
"epoch": 0.2044062051883718,
"grad_norm": 1.2263914346694946,
"learning_rate": 7.068211054579944e-05,
"loss": 0.8242,
"step": 392
},
{
"epoch": 0.20492764958936252,
"grad_norm": 1.492631196975708,
"learning_rate": 7.029375200334588e-05,
"loss": 0.9098,
"step": 393
},
{
"epoch": 0.20544909399035327,
"grad_norm": 1.5877786874771118,
"learning_rate": 6.99026197159505e-05,
"loss": 0.9134,
"step": 394
},
{
"epoch": 0.20597053839134402,
"grad_norm": 1.2312871217727661,
"learning_rate": 6.950875657567623e-05,
"loss": 0.7654,
"step": 395
},
{
"epoch": 0.20649198279233477,
"grad_norm": 1.3052772283554077,
"learning_rate": 6.911220577405484e-05,
"loss": 0.7572,
"step": 396
},
{
"epoch": 0.20701342719332552,
"grad_norm": 1.5820766687393188,
"learning_rate": 6.87130107973505e-05,
"loss": 0.9036,
"step": 397
},
{
"epoch": 0.20753487159431624,
"grad_norm": 1.7080368995666504,
"learning_rate": 6.831121542179087e-05,
"loss": 0.8461,
"step": 398
},
{
"epoch": 0.208056315995307,
"grad_norm": 1.7430877685546875,
"learning_rate": 6.790686370876671e-05,
"loss": 0.8611,
"step": 399
},
{
"epoch": 0.20857776039629775,
"grad_norm": 1.6613725423812866,
"learning_rate": 6.75e-05,
"loss": 0.9955,
"step": 400
},
{
"epoch": 0.2090992047972885,
"grad_norm": 1.153273105621338,
"learning_rate": 6.709066891268135e-05,
"loss": 0.3654,
"step": 401
},
{
"epoch": 0.20962064919827925,
"grad_norm": 2.4963393211364746,
"learning_rate": 6.667891533457719e-05,
"loss": 0.508,
"step": 402
},
{
"epoch": 0.21014209359926997,
"grad_norm": 0.5108705759048462,
"learning_rate": 6.626478441910744e-05,
"loss": 0.2177,
"step": 403
},
{
"epoch": 0.21066353800026072,
"grad_norm": 0.3899039328098297,
"learning_rate": 6.584832158039378e-05,
"loss": 0.1517,
"step": 404
},
{
"epoch": 0.21118498240125147,
"grad_norm": 0.6260213851928711,
"learning_rate": 6.542957248827961e-05,
"loss": 0.289,
"step": 405
},
{
"epoch": 0.21170642680224222,
"grad_norm": 0.6745234727859497,
"learning_rate": 6.500858306332174e-05,
"loss": 0.3188,
"step": 406
},
{
"epoch": 0.21222787120323294,
"grad_norm": 0.6891493797302246,
"learning_rate": 6.458539947175475e-05,
"loss": 0.3576,
"step": 407
},
{
"epoch": 0.2127493156042237,
"grad_norm": 0.7363607287406921,
"learning_rate": 6.416006812042828e-05,
"loss": 0.45,
"step": 408
},
{
"epoch": 0.21327076000521444,
"grad_norm": 0.8903110027313232,
"learning_rate": 6.373263565171806e-05,
"loss": 0.5986,
"step": 409
},
{
"epoch": 0.2137922044062052,
"grad_norm": 0.8679940700531006,
"learning_rate": 6.330314893841102e-05,
"loss": 0.5433,
"step": 410
},
{
"epoch": 0.21431364880719594,
"grad_norm": 0.959732711315155,
"learning_rate": 6.287165507856512e-05,
"loss": 0.5715,
"step": 411
},
{
"epoch": 0.21483509320818667,
"grad_norm": 1.0801646709442139,
"learning_rate": 6.243820139034464e-05,
"loss": 0.6556,
"step": 412
},
{
"epoch": 0.21535653760917742,
"grad_norm": 1.0459177494049072,
"learning_rate": 6.200283540683103e-05,
"loss": 0.6967,
"step": 413
},
{
"epoch": 0.21587798201016817,
"grad_norm": 1.250126600265503,
"learning_rate": 6.156560487081051e-05,
"loss": 0.6739,
"step": 414
},
{
"epoch": 0.21639942641115892,
"grad_norm": 1.2173274755477905,
"learning_rate": 6.112655772953851e-05,
"loss": 0.7337,
"step": 415
},
{
"epoch": 0.21692087081214964,
"grad_norm": 1.1681318283081055,
"learning_rate": 6.068574212948169e-05,
"loss": 0.7496,
"step": 416
},
{
"epoch": 0.2174423152131404,
"grad_norm": 1.147112488746643,
"learning_rate": 6.024320641103812e-05,
"loss": 0.7061,
"step": 417
},
{
"epoch": 0.21796375961413114,
"grad_norm": 0.9065099358558655,
"learning_rate": 5.979899910323625e-05,
"loss": 0.5293,
"step": 418
},
{
"epoch": 0.2184852040151219,
"grad_norm": 1.4341965913772583,
"learning_rate": 5.935316891841316e-05,
"loss": 1.0006,
"step": 419
},
{
"epoch": 0.21900664841611264,
"grad_norm": 1.0058789253234863,
"learning_rate": 5.890576474687264e-05,
"loss": 0.6819,
"step": 420
},
{
"epoch": 0.21952809281710337,
"grad_norm": 1.3122864961624146,
"learning_rate": 5.845683565152391e-05,
"loss": 0.8433,
"step": 421
},
{
"epoch": 0.22004953721809412,
"grad_norm": 1.0274057388305664,
"learning_rate": 5.800643086250122e-05,
"loss": 0.702,
"step": 422
},
{
"epoch": 0.22057098161908487,
"grad_norm": 1.1476062536239624,
"learning_rate": 5.7554599771765325e-05,
"loss": 0.8239,
"step": 423
},
{
"epoch": 0.22109242602007562,
"grad_norm": 1.1423624753952026,
"learning_rate": 5.710139192768696e-05,
"loss": 0.5831,
"step": 424
},
{
"epoch": 0.22161387042106637,
"grad_norm": 1.2858572006225586,
"learning_rate": 5.6646857029613434e-05,
"loss": 0.8175,
"step": 425
},
{
"epoch": 0.2221353148220571,
"grad_norm": 1.2642686367034912,
"learning_rate": 5.6191044922418485e-05,
"loss": 0.6987,
"step": 426
},
{
"epoch": 0.22265675922304784,
"grad_norm": 1.0850262641906738,
"learning_rate": 5.5734005591036144e-05,
"loss": 0.7008,
"step": 427
},
{
"epoch": 0.2231782036240386,
"grad_norm": 1.059691309928894,
"learning_rate": 5.527578915497952e-05,
"loss": 0.6485,
"step": 428
},
{
"epoch": 0.22369964802502934,
"grad_norm": 1.1776199340820312,
"learning_rate": 5.4816445862844426e-05,
"loss": 0.7816,
"step": 429
},
{
"epoch": 0.22422109242602006,
"grad_norm": 1.065766453742981,
"learning_rate": 5.435602608679916e-05,
"loss": 0.6872,
"step": 430
},
{
"epoch": 0.2247425368270108,
"grad_norm": 1.419598937034607,
"learning_rate": 5.3894580317060684e-05,
"loss": 0.7295,
"step": 431
},
{
"epoch": 0.22526398122800156,
"grad_norm": 1.0950857400894165,
"learning_rate": 5.343215915635762e-05,
"loss": 0.7141,
"step": 432
},
{
"epoch": 0.22578542562899231,
"grad_norm": 1.2764692306518555,
"learning_rate": 5.2968813314381255e-05,
"loss": 0.7478,
"step": 433
},
{
"epoch": 0.22630687002998306,
"grad_norm": 1.3446756601333618,
"learning_rate": 5.250459360222461e-05,
"loss": 0.7216,
"step": 434
},
{
"epoch": 0.2268283144309738,
"grad_norm": 1.3674976825714111,
"learning_rate": 5.20395509268104e-05,
"loss": 0.7942,
"step": 435
},
{
"epoch": 0.22734975883196454,
"grad_norm": 1.1008920669555664,
"learning_rate": 5.157373628530853e-05,
"loss": 0.6696,
"step": 436
},
{
"epoch": 0.2278712032329553,
"grad_norm": 1.119428277015686,
"learning_rate": 5.1107200759543704e-05,
"loss": 0.6747,
"step": 437
},
{
"epoch": 0.22839264763394604,
"grad_norm": 1.2905117273330688,
"learning_rate": 5.06399955103937e-05,
"loss": 0.7237,
"step": 438
},
{
"epoch": 0.22891409203493676,
"grad_norm": 1.1231979131698608,
"learning_rate": 5.017217177217901e-05,
"loss": 0.6448,
"step": 439
},
{
"epoch": 0.2294355364359275,
"grad_norm": 1.430540680885315,
"learning_rate": 4.9703780847044415e-05,
"loss": 0.9266,
"step": 440
},
{
"epoch": 0.22995698083691826,
"grad_norm": 1.6048601865768433,
"learning_rate": 4.923487409933316e-05,
"loss": 0.9436,
"step": 441
},
{
"epoch": 0.230478425237909,
"grad_norm": 1.4154115915298462,
"learning_rate": 4.876550294995421e-05,
"loss": 0.8578,
"step": 442
},
{
"epoch": 0.23099986963889976,
"grad_norm": 1.4124436378479004,
"learning_rate": 4.829571887074343e-05,
"loss": 0.811,
"step": 443
},
{
"epoch": 0.23152131403989049,
"grad_norm": 1.3669768571853638,
"learning_rate": 4.782557337881911e-05,
"loss": 0.8344,
"step": 444
},
{
"epoch": 0.23204275844088124,
"grad_norm": 1.5078638792037964,
"learning_rate": 4.7355118030932484e-05,
"loss": 0.7743,
"step": 445
},
{
"epoch": 0.232564202841872,
"grad_norm": 1.2406189441680908,
"learning_rate": 4.688440441781398e-05,
"loss": 0.7794,
"step": 446
},
{
"epoch": 0.23308564724286274,
"grad_norm": 1.4059579372406006,
"learning_rate": 4.6413484158515774e-05,
"loss": 0.9038,
"step": 447
},
{
"epoch": 0.2336070916438535,
"grad_norm": 1.6239300966262817,
"learning_rate": 4.594240889475107e-05,
"loss": 0.8264,
"step": 448
},
{
"epoch": 0.2341285360448442,
"grad_norm": 1.3349806070327759,
"learning_rate": 4.547123028523106e-05,
"loss": 0.784,
"step": 449
},
{
"epoch": 0.23464998044583496,
"grad_norm": 1.809417963027954,
"learning_rate": 4.5e-05,
"loss": 0.9593,
"step": 450
},
{
"epoch": 0.23464998044583496,
"eval_loss": 0.6834670901298523,
"eval_runtime": 326.3076,
"eval_samples_per_second": 19.797,
"eval_steps_per_second": 4.949,
"step": 450
},
{
"epoch": 0.2351714248468257,
"grad_norm": 0.8945098519325256,
"learning_rate": 4.452876971476896e-05,
"loss": 0.327,
"step": 451
},
{
"epoch": 0.23569286924781646,
"grad_norm": 0.9733836054801941,
"learning_rate": 4.4057591105248945e-05,
"loss": 0.3517,
"step": 452
},
{
"epoch": 0.23621431364880718,
"grad_norm": 0.44081079959869385,
"learning_rate": 4.358651584148423e-05,
"loss": 0.2293,
"step": 453
},
{
"epoch": 0.23673575804979793,
"grad_norm": 0.420837938785553,
"learning_rate": 4.311559558218603e-05,
"loss": 0.2186,
"step": 454
},
{
"epoch": 0.23725720245078868,
"grad_norm": 0.7009119391441345,
"learning_rate": 4.264488196906753e-05,
"loss": 0.3505,
"step": 455
},
{
"epoch": 0.23777864685177943,
"grad_norm": 0.7264213562011719,
"learning_rate": 4.21744266211809e-05,
"loss": 0.3351,
"step": 456
},
{
"epoch": 0.23830009125277019,
"grad_norm": 0.7642529606819153,
"learning_rate": 4.1704281129256585e-05,
"loss": 0.4276,
"step": 457
},
{
"epoch": 0.2388215356537609,
"grad_norm": 0.9208986759185791,
"learning_rate": 4.1234497050045815e-05,
"loss": 0.426,
"step": 458
},
{
"epoch": 0.23934298005475166,
"grad_norm": 1.1265970468521118,
"learning_rate": 4.076512590066686e-05,
"loss": 0.6691,
"step": 459
},
{
"epoch": 0.2398644244557424,
"grad_norm": 0.976740300655365,
"learning_rate": 4.0296219152955604e-05,
"loss": 0.6463,
"step": 460
},
{
"epoch": 0.24038586885673316,
"grad_norm": 0.9354336261749268,
"learning_rate": 3.982782822782101e-05,
"loss": 0.5267,
"step": 461
},
{
"epoch": 0.24090731325772388,
"grad_norm": 0.9918802380561829,
"learning_rate": 3.936000448960631e-05,
"loss": 0.5501,
"step": 462
},
{
"epoch": 0.24142875765871463,
"grad_norm": 1.246860384941101,
"learning_rate": 3.889279924045631e-05,
"loss": 0.6748,
"step": 463
},
{
"epoch": 0.24195020205970538,
"grad_norm": 0.8496841788291931,
"learning_rate": 3.842626371469148e-05,
"loss": 0.538,
"step": 464
},
{
"epoch": 0.24247164646069613,
"grad_norm": 1.2216079235076904,
"learning_rate": 3.796044907318961e-05,
"loss": 0.7916,
"step": 465
},
{
"epoch": 0.24299309086168688,
"grad_norm": 1.1502059698104858,
"learning_rate": 3.74954063977754e-05,
"loss": 0.5625,
"step": 466
},
{
"epoch": 0.2435145352626776,
"grad_norm": 1.0070570707321167,
"learning_rate": 3.703118668561876e-05,
"loss": 0.623,
"step": 467
},
{
"epoch": 0.24403597966366836,
"grad_norm": 1.0661598443984985,
"learning_rate": 3.6567840843642385e-05,
"loss": 0.7055,
"step": 468
},
{
"epoch": 0.2445574240646591,
"grad_norm": 1.037097692489624,
"learning_rate": 3.610541968293932e-05,
"loss": 0.6177,
"step": 469
},
{
"epoch": 0.24507886846564986,
"grad_norm": 1.8507579565048218,
"learning_rate": 3.564397391320084e-05,
"loss": 0.7,
"step": 470
},
{
"epoch": 0.2456003128666406,
"grad_norm": 1.1853182315826416,
"learning_rate": 3.51835541371556e-05,
"loss": 0.6217,
"step": 471
},
{
"epoch": 0.24612175726763133,
"grad_norm": 0.8545302152633667,
"learning_rate": 3.472421084502049e-05,
"loss": 0.5726,
"step": 472
},
{
"epoch": 0.24664320166862208,
"grad_norm": 1.1831412315368652,
"learning_rate": 3.426599440896387e-05,
"loss": 0.6007,
"step": 473
},
{
"epoch": 0.24716464606961283,
"grad_norm": 1.1138157844543457,
"learning_rate": 3.380895507758154e-05,
"loss": 0.6453,
"step": 474
},
{
"epoch": 0.24768609047060358,
"grad_norm": 1.097508192062378,
"learning_rate": 3.3353142970386565e-05,
"loss": 0.6088,
"step": 475
},
{
"epoch": 0.2482075348715943,
"grad_norm": 1.123647928237915,
"learning_rate": 3.2898608072313045e-05,
"loss": 0.7489,
"step": 476
},
{
"epoch": 0.24872897927258505,
"grad_norm": 1.0547268390655518,
"learning_rate": 3.244540022823469e-05,
"loss": 0.6683,
"step": 477
},
{
"epoch": 0.2492504236735758,
"grad_norm": 0.9712570905685425,
"learning_rate": 3.199356913749877e-05,
"loss": 0.5591,
"step": 478
},
{
"epoch": 0.24977186807456656,
"grad_norm": 1.165372371673584,
"learning_rate": 3.1543164348476105e-05,
"loss": 0.6808,
"step": 479
},
{
"epoch": 0.2502933124755573,
"grad_norm": 1.4066596031188965,
"learning_rate": 3.1094235253127374e-05,
"loss": 0.7196,
"step": 480
},
{
"epoch": 0.25081475687654803,
"grad_norm": 1.2690683603286743,
"learning_rate": 3.064683108158685e-05,
"loss": 0.7631,
"step": 481
},
{
"epoch": 0.2513362012775388,
"grad_norm": 1.2126400470733643,
"learning_rate": 3.0201000896763757e-05,
"loss": 0.6118,
"step": 482
},
{
"epoch": 0.25185764567852953,
"grad_norm": 1.2217490673065186,
"learning_rate": 2.975679358896189e-05,
"loss": 0.8022,
"step": 483
},
{
"epoch": 0.25237909007952025,
"grad_norm": 1.4065697193145752,
"learning_rate": 2.9314257870518325e-05,
"loss": 0.8177,
"step": 484
},
{
"epoch": 0.25290053448051103,
"grad_norm": 1.394194483757019,
"learning_rate": 2.887344227046149e-05,
"loss": 0.6846,
"step": 485
},
{
"epoch": 0.25342197888150175,
"grad_norm": 1.2853827476501465,
"learning_rate": 2.8434395129189495e-05,
"loss": 0.8623,
"step": 486
},
{
"epoch": 0.25394342328249253,
"grad_norm": 1.3407214879989624,
"learning_rate": 2.7997164593168986e-05,
"loss": 0.8026,
"step": 487
},
{
"epoch": 0.25446486768348325,
"grad_norm": 0.9608036875724792,
"learning_rate": 2.756179860965537e-05,
"loss": 0.5896,
"step": 488
},
{
"epoch": 0.254986312084474,
"grad_norm": 1.2732912302017212,
"learning_rate": 2.7128344921434877e-05,
"loss": 0.882,
"step": 489
},
{
"epoch": 0.25550775648546475,
"grad_norm": 1.3587908744812012,
"learning_rate": 2.6696851061589e-05,
"loss": 0.7432,
"step": 490
},
{
"epoch": 0.2560292008864555,
"grad_norm": 1.1746113300323486,
"learning_rate": 2.6267364348281954e-05,
"loss": 0.7805,
"step": 491
},
{
"epoch": 0.2565506452874462,
"grad_norm": 1.1895116567611694,
"learning_rate": 2.5839931879571733e-05,
"loss": 0.8167,
"step": 492
},
{
"epoch": 0.257072089688437,
"grad_norm": 1.246069312095642,
"learning_rate": 2.541460052824527e-05,
"loss": 0.7614,
"step": 493
},
{
"epoch": 0.2575935340894277,
"grad_norm": 1.507230281829834,
"learning_rate": 2.4991416936678276e-05,
"loss": 0.7661,
"step": 494
},
{
"epoch": 0.2581149784904185,
"grad_norm": 1.2582144737243652,
"learning_rate": 2.4570427511720398e-05,
"loss": 0.7222,
"step": 495
},
{
"epoch": 0.2586364228914092,
"grad_norm": 1.2553263902664185,
"learning_rate": 2.4151678419606235e-05,
"loss": 0.8181,
"step": 496
},
{
"epoch": 0.2591578672923999,
"grad_norm": 1.2473095655441284,
"learning_rate": 2.3735215580892577e-05,
"loss": 0.7124,
"step": 497
},
{
"epoch": 0.2596793116933907,
"grad_norm": 1.3642276525497437,
"learning_rate": 2.3321084665422807e-05,
"loss": 0.8353,
"step": 498
},
{
"epoch": 0.2602007560943814,
"grad_norm": 1.269373893737793,
"learning_rate": 2.2909331087318664e-05,
"loss": 0.73,
"step": 499
},
{
"epoch": 0.2607222004953722,
"grad_norm": 1.5897523164749146,
"learning_rate": 2.250000000000001e-05,
"loss": 0.8818,
"step": 500
},
{
"epoch": 0.2612436448963629,
"grad_norm": 0.7522194981575012,
"learning_rate": 2.209313629123329e-05,
"loss": 0.3334,
"step": 501
},
{
"epoch": 0.26176508929735365,
"grad_norm": 1.1364073753356934,
"learning_rate": 2.168878457820915e-05,
"loss": 0.4722,
"step": 502
},
{
"epoch": 0.2622865336983444,
"grad_norm": 0.45651566982269287,
"learning_rate": 2.128698920264951e-05,
"loss": 0.2023,
"step": 503
},
{
"epoch": 0.26280797809933515,
"grad_norm": 0.407569944858551,
"learning_rate": 2.088779422594514e-05,
"loss": 0.2021,
"step": 504
},
{
"epoch": 0.2633294225003259,
"grad_norm": 0.6987316608428955,
"learning_rate": 2.0491243424323783e-05,
"loss": 0.2595,
"step": 505
},
{
"epoch": 0.26385086690131665,
"grad_norm": 0.741063117980957,
"learning_rate": 2.009738028404952e-05,
"loss": 0.3919,
"step": 506
},
{
"epoch": 0.2643723113023074,
"grad_norm": 0.7104949951171875,
"learning_rate": 1.9706247996654134e-05,
"loss": 0.3903,
"step": 507
},
{
"epoch": 0.26489375570329815,
"grad_norm": 1.1556988954544067,
"learning_rate": 1.9317889454200578e-05,
"loss": 0.538,
"step": 508
},
{
"epoch": 0.2654152001042889,
"grad_norm": 0.873782753944397,
"learning_rate": 1.8932347244579463e-05,
"loss": 0.5209,
"step": 509
},
{
"epoch": 0.26593664450527965,
"grad_norm": 0.6760383248329163,
"learning_rate": 1.8549663646838714e-05,
"loss": 0.4616,
"step": 510
},
{
"epoch": 0.2664580889062704,
"grad_norm": 0.7786940336227417,
"learning_rate": 1.8169880626547285e-05,
"loss": 0.4068,
"step": 511
},
{
"epoch": 0.2669795333072611,
"grad_norm": 0.9264464378356934,
"learning_rate": 1.7793039831193134e-05,
"loss": 0.599,
"step": 512
},
{
"epoch": 0.2675009777082519,
"grad_norm": 0.9444701671600342,
"learning_rate": 1.741918258561607e-05,
"loss": 0.6268,
"step": 513
},
{
"epoch": 0.2680224221092426,
"grad_norm": 1.0351696014404297,
"learning_rate": 1.7048349887476038e-05,
"loss": 0.7387,
"step": 514
},
{
"epoch": 0.2685438665102333,
"grad_norm": 1.108831524848938,
"learning_rate": 1.6680582402757324e-05,
"loss": 0.597,
"step": 515
},
{
"epoch": 0.2690653109112241,
"grad_norm": 0.933988094329834,
"learning_rate": 1.631592046130896e-05,
"loss": 0.6301,
"step": 516
},
{
"epoch": 0.2695867553122148,
"grad_norm": 0.9654362797737122,
"learning_rate": 1.5954404052422217e-05,
"loss": 0.52,
"step": 517
},
{
"epoch": 0.2701081997132056,
"grad_norm": 1.1467019319534302,
"learning_rate": 1.5596072820445255e-05,
"loss": 0.7317,
"step": 518
},
{
"epoch": 0.2706296441141963,
"grad_norm": 1.1809656620025635,
"learning_rate": 1.5240966060435674e-05,
"loss": 0.6836,
"step": 519
},
{
"epoch": 0.27115108851518704,
"grad_norm": 0.8975329995155334,
"learning_rate": 1.4889122713851395e-05,
"loss": 0.5057,
"step": 520
},
{
"epoch": 0.2716725329161778,
"grad_norm": 0.9241394996643066,
"learning_rate": 1.4540581364280274e-05,
"loss": 0.6661,
"step": 521
},
{
"epoch": 0.27219397731716855,
"grad_norm": 0.7762001156806946,
"learning_rate": 1.4195380233209009e-05,
"loss": 0.4453,
"step": 522
},
{
"epoch": 0.2727154217181593,
"grad_norm": 1.1368845701217651,
"learning_rate": 1.38535571758317e-05,
"loss": 0.8007,
"step": 523
},
{
"epoch": 0.27323686611915005,
"grad_norm": 1.1444828510284424,
"learning_rate": 1.3515149676898551e-05,
"loss": 0.6431,
"step": 524
},
{
"epoch": 0.27375831052014077,
"grad_norm": 1.3276859521865845,
"learning_rate": 1.3180194846605365e-05,
"loss": 0.7336,
"step": 525
},
{
"epoch": 0.27427975492113155,
"grad_norm": 0.9438497424125671,
"learning_rate": 1.284872941652386e-05,
"loss": 0.6664,
"step": 526
},
{
"epoch": 0.27480119932212227,
"grad_norm": 1.2121400833129883,
"learning_rate": 1.2520789735573703e-05,
"loss": 0.7121,
"step": 527
},
{
"epoch": 0.27532264372311305,
"grad_norm": 1.2498886585235596,
"learning_rate": 1.2196411766036491e-05,
"loss": 0.7712,
"step": 528
},
{
"epoch": 0.27584408812410377,
"grad_norm": 1.1315795183181763,
"learning_rate": 1.1875631079611956e-05,
"loss": 0.7224,
"step": 529
},
{
"epoch": 0.2763655325250945,
"grad_norm": 1.4224852323532104,
"learning_rate": 1.1558482853517254e-05,
"loss": 0.6649,
"step": 530
},
{
"epoch": 0.27688697692608527,
"grad_norm": 1.2838762998580933,
"learning_rate": 1.124500186662932e-05,
"loss": 0.7847,
"step": 531
},
{
"epoch": 0.277408421327076,
"grad_norm": 1.1303495168685913,
"learning_rate": 1.0935222495670969e-05,
"loss": 0.7652,
"step": 532
},
{
"epoch": 0.27792986572806677,
"grad_norm": 1.0751543045043945,
"learning_rate": 1.0629178711441115e-05,
"loss": 0.6492,
"step": 533
},
{
"epoch": 0.2784513101290575,
"grad_norm": 1.2806495428085327,
"learning_rate": 1.032690407508949e-05,
"loss": 0.6801,
"step": 534
},
{
"epoch": 0.2789727545300482,
"grad_norm": 1.032645583152771,
"learning_rate": 1.002843173443631e-05,
"loss": 0.6324,
"step": 535
},
{
"epoch": 0.279494198931039,
"grad_norm": 1.1595271825790405,
"learning_rate": 9.733794420337214e-06,
"loss": 0.7248,
"step": 536
},
{
"epoch": 0.2800156433320297,
"grad_norm": 1.2283949851989746,
"learning_rate": 9.443024443093932e-06,
"loss": 0.6415,
"step": 537
},
{
"epoch": 0.28053708773302044,
"grad_norm": 1.0101823806762695,
"learning_rate": 9.15615368891117e-06,
"loss": 0.7115,
"step": 538
},
{
"epoch": 0.2810585321340112,
"grad_norm": 1.1437267065048218,
"learning_rate": 8.873213616399854e-06,
"loss": 0.8146,
"step": 539
},
{
"epoch": 0.28157997653500194,
"grad_norm": 1.395314335823059,
"learning_rate": 8.59423525312737e-06,
"loss": 0.8216,
"step": 540
},
{
"epoch": 0.2821014209359927,
"grad_norm": 1.1216462850570679,
"learning_rate": 8.319249192215056e-06,
"loss": 0.7296,
"step": 541
},
{
"epoch": 0.28262286533698344,
"grad_norm": 1.1411585807800293,
"learning_rate": 8.04828558898332e-06,
"loss": 0.7305,
"step": 542
},
{
"epoch": 0.28314430973797416,
"grad_norm": 1.0510220527648926,
"learning_rate": 7.781374157644714e-06,
"loss": 0.7393,
"step": 543
},
{
"epoch": 0.28366575413896494,
"grad_norm": 1.3976047039031982,
"learning_rate": 7.518544168045526e-06,
"loss": 0.8331,
"step": 544
},
{
"epoch": 0.28418719853995567,
"grad_norm": 1.1391674280166626,
"learning_rate": 7.259824442455923e-06,
"loss": 0.7816,
"step": 545
},
{
"epoch": 0.28470864294094644,
"grad_norm": 1.2341560125350952,
"learning_rate": 7.005243352409332e-06,
"loss": 0.7965,
"step": 546
},
{
"epoch": 0.28523008734193717,
"grad_norm": 1.2359131574630737,
"learning_rate": 6.754828815591131e-06,
"loss": 0.8758,
"step": 547
},
{
"epoch": 0.2857515317429279,
"grad_norm": 1.5217698812484741,
"learning_rate": 6.508608292777203e-06,
"loss": 0.9667,
"step": 548
},
{
"epoch": 0.28627297614391867,
"grad_norm": 1.633954405784607,
"learning_rate": 6.266608784822542e-06,
"loss": 0.7868,
"step": 549
},
{
"epoch": 0.2867944205449094,
"grad_norm": 1.9118640422821045,
"learning_rate": 6.028856829700258e-06,
"loss": 0.8767,
"step": 550
},
{
"epoch": 0.28731586494590017,
"grad_norm": 0.5335854291915894,
"learning_rate": 5.795378499591479e-06,
"loss": 0.2677,
"step": 551
},
{
"epoch": 0.2878373093468909,
"grad_norm": 0.8315818309783936,
"learning_rate": 5.566199398026149e-06,
"loss": 0.3736,
"step": 552
},
{
"epoch": 0.2883587537478816,
"grad_norm": 0.4964602589607239,
"learning_rate": 5.341344657075353e-06,
"loss": 0.1941,
"step": 553
},
{
"epoch": 0.2888801981488724,
"grad_norm": 0.4502871632575989,
"learning_rate": 5.120838934595337e-06,
"loss": 0.2058,
"step": 554
},
{
"epoch": 0.2894016425498631,
"grad_norm": 0.6107041239738464,
"learning_rate": 4.90470641152345e-06,
"loss": 0.2863,
"step": 555
},
{
"epoch": 0.2899230869508539,
"grad_norm": 0.5411296486854553,
"learning_rate": 4.69297078922642e-06,
"loss": 0.3224,
"step": 556
},
{
"epoch": 0.2904445313518446,
"grad_norm": 0.5916683673858643,
"learning_rate": 4.485655286901292e-06,
"loss": 0.3624,
"step": 557
},
{
"epoch": 0.29096597575283534,
"grad_norm": 0.9298704266548157,
"learning_rate": 4.28278263902913e-06,
"loss": 0.5355,
"step": 558
},
{
"epoch": 0.2914874201538261,
"grad_norm": 1.085946798324585,
"learning_rate": 4.084375092881916e-06,
"loss": 0.5446,
"step": 559
},
{
"epoch": 0.29200886455481684,
"grad_norm": 0.9874389171600342,
"learning_rate": 3.890454406082956e-06,
"loss": 0.6942,
"step": 560
},
{
"epoch": 0.29253030895580756,
"grad_norm": 0.7588855028152466,
"learning_rate": 3.701041844220849e-06,
"loss": 0.5185,
"step": 561
},
{
"epoch": 0.29305175335679834,
"grad_norm": 0.7749528884887695,
"learning_rate": 3.516158178517482e-06,
"loss": 0.4994,
"step": 562
},
{
"epoch": 0.29357319775778906,
"grad_norm": 0.9377657175064087,
"learning_rate": 3.335823683550237e-06,
"loss": 0.5773,
"step": 563
},
{
"epoch": 0.29409464215877984,
"grad_norm": 0.9080403447151184,
"learning_rate": 3.1600581350286897e-06,
"loss": 0.5582,
"step": 564
},
{
"epoch": 0.29461608655977056,
"grad_norm": 0.9245966672897339,
"learning_rate": 2.9888808076259267e-06,
"loss": 0.6085,
"step": 565
},
{
"epoch": 0.2951375309607613,
"grad_norm": 1.0248568058013916,
"learning_rate": 2.822310472864885e-06,
"loss": 0.6125,
"step": 566
},
{
"epoch": 0.29565897536175206,
"grad_norm": 1.0454648733139038,
"learning_rate": 2.660365397059855e-06,
"loss": 0.6444,
"step": 567
},
{
"epoch": 0.2961804197627428,
"grad_norm": 0.9811504483222961,
"learning_rate": 2.503063339313355e-06,
"loss": 0.647,
"step": 568
},
{
"epoch": 0.29670186416373356,
"grad_norm": 1.1872072219848633,
"learning_rate": 2.3504215495686498e-06,
"loss": 0.7537,
"step": 569
},
{
"epoch": 0.2972233085647243,
"grad_norm": 0.9496892094612122,
"learning_rate": 2.2024567667180914e-06,
"loss": 0.6789,
"step": 570
},
{
"epoch": 0.297744752965715,
"grad_norm": 1.110507607460022,
"learning_rate": 2.059185216767543e-06,
"loss": 0.64,
"step": 571
},
{
"epoch": 0.2982661973667058,
"grad_norm": 0.9525758028030396,
"learning_rate": 1.9206226110569742e-06,
"loss": 0.5955,
"step": 572
},
{
"epoch": 0.2987876417676965,
"grad_norm": 1.0465561151504517,
"learning_rate": 1.7867841445375621e-06,
"loss": 0.6887,
"step": 573
},
{
"epoch": 0.2993090861686873,
"grad_norm": 1.0657908916473389,
"learning_rate": 1.6576844941053854e-06,
"loss": 0.7477,
"step": 574
},
{
"epoch": 0.299830530569678,
"grad_norm": 1.0674349069595337,
"learning_rate": 1.533337816991931e-06,
"loss": 0.7795,
"step": 575
},
{
"epoch": 0.30035197497066873,
"grad_norm": 1.0765479803085327,
"learning_rate": 1.4137577492116016e-06,
"loss": 0.7111,
"step": 576
},
{
"epoch": 0.3008734193716595,
"grad_norm": 1.0293859243392944,
"learning_rate": 1.2989574040663816e-06,
"loss": 0.6233,
"step": 577
},
{
"epoch": 0.30139486377265023,
"grad_norm": 1.1408146619796753,
"learning_rate": 1.188949370707787e-06,
"loss": 0.6778,
"step": 578
},
{
"epoch": 0.301916308173641,
"grad_norm": 1.2201671600341797,
"learning_rate": 1.0837457127563656e-06,
"loss": 0.7382,
"step": 579
},
{
"epoch": 0.30243775257463174,
"grad_norm": 1.1659302711486816,
"learning_rate": 9.83357966978744e-07,
"loss": 0.7453,
"step": 580
},
{
"epoch": 0.30295919697562246,
"grad_norm": 1.2378828525543213,
"learning_rate": 8.877971420225212e-07,
"loss": 0.8273,
"step": 581
},
{
"epoch": 0.30348064137661324,
"grad_norm": 1.0689337253570557,
"learning_rate": 7.970737172090126e-07,
"loss": 0.6794,
"step": 582
},
{
"epoch": 0.30400208577760396,
"grad_norm": 1.2321866750717163,
"learning_rate": 7.111976413841153e-07,
"loss": 0.7465,
"step": 583
},
{
"epoch": 0.3045235301785947,
"grad_norm": 1.0835295915603638,
"learning_rate": 6.301783318272809e-07,
"loss": 0.6639,
"step": 584
},
{
"epoch": 0.30504497457958546,
"grad_norm": 1.1415941715240479,
"learning_rate": 5.540246732188054e-07,
"loss": 0.626,
"step": 585
},
{
"epoch": 0.3055664189805762,
"grad_norm": 1.1108524799346924,
"learning_rate": 4.827450166655251e-07,
"loss": 0.7758,
"step": 586
},
{
"epoch": 0.30608786338156696,
"grad_norm": 1.0960720777511597,
"learning_rate": 4.1634717878503816e-07,
"loss": 0.7627,
"step": 587
},
{
"epoch": 0.3066093077825577,
"grad_norm": 1.2436336278915405,
"learning_rate": 3.548384408485006e-07,
"loss": 0.7669,
"step": 588
},
{
"epoch": 0.3071307521835484,
"grad_norm": 1.194196343421936,
"learning_rate": 2.9822554798215994e-07,
"loss": 0.7492,
"step": 589
},
{
"epoch": 0.3076521965845392,
"grad_norm": 1.3664387464523315,
"learning_rate": 2.4651470842770196e-07,
"loss": 0.7941,
"step": 590
},
{
"epoch": 0.3081736409855299,
"grad_norm": 1.2188045978546143,
"learning_rate": 1.9971159286140017e-07,
"loss": 0.7608,
"step": 591
},
{
"epoch": 0.3086950853865207,
"grad_norm": 1.096019983291626,
"learning_rate": 1.5782133377230334e-07,
"loss": 0.655,
"step": 592
},
{
"epoch": 0.3092165297875114,
"grad_norm": 1.2465800046920776,
"learning_rate": 1.208485248993857e-07,
"loss": 0.7535,
"step": 593
},
{
"epoch": 0.30973797418850213,
"grad_norm": 1.3181999921798706,
"learning_rate": 8.879722072777986e-08,
"loss": 0.8335,
"step": 594
},
{
"epoch": 0.3102594185894929,
"grad_norm": 1.1617658138275146,
"learning_rate": 6.167093604417751e-08,
"loss": 0.6741,
"step": 595
},
{
"epoch": 0.31078086299048363,
"grad_norm": 1.2442768812179565,
"learning_rate": 3.9472645551372757e-08,
"loss": 0.777,
"step": 596
},
{
"epoch": 0.3113023073914744,
"grad_norm": 2.248586416244507,
"learning_rate": 2.2204783542078e-08,
"loss": 0.6729,
"step": 597
},
{
"epoch": 0.31182375179246513,
"grad_norm": 1.264545202255249,
"learning_rate": 9.869243631952518e-09,
"loss": 0.7514,
"step": 598
},
{
"epoch": 0.31234519619345585,
"grad_norm": 1.1710125207901,
"learning_rate": 2.467378551953559e-09,
"loss": 0.7467,
"step": 599
},
{
"epoch": 0.31286664059444663,
"grad_norm": 1.261979103088379,
"learning_rate": 0.0,
"loss": 0.7243,
"step": 600
},
{
"epoch": 0.31286664059444663,
"eval_loss": 0.6509745717048645,
"eval_runtime": 326.4655,
"eval_samples_per_second": 19.788,
"eval_steps_per_second": 4.947,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 600,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 4,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8826249462748283e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}