{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 45,
"global_step": 222,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009009009009009009,
"grad_norm": 3.5627665519714355,
"learning_rate": 2e-05,
"loss": 1.6727,
"step": 1
},
{
"epoch": 0.018018018018018018,
"grad_norm": 4.157745361328125,
"learning_rate": 4e-05,
"loss": 1.9092,
"step": 2
},
{
"epoch": 0.02702702702702703,
"grad_norm": 4.089487552642822,
"learning_rate": 6e-05,
"loss": 1.9415,
"step": 3
},
{
"epoch": 0.036036036036036036,
"grad_norm": 3.6906230449676514,
"learning_rate": 8e-05,
"loss": 2.0175,
"step": 4
},
{
"epoch": 0.04504504504504504,
"grad_norm": 4.181814670562744,
"learning_rate": 0.0001,
"loss": 2.4265,
"step": 5
},
{
"epoch": 0.05405405405405406,
"grad_norm": 5.658498287200928,
"learning_rate": 0.00012,
"loss": 2.1897,
"step": 6
},
{
"epoch": 0.06306306306306306,
"grad_norm": 6.143243312835693,
"learning_rate": 0.00014,
"loss": 1.9796,
"step": 7
},
{
"epoch": 0.07207207207207207,
"grad_norm": 4.409506797790527,
"learning_rate": 0.00016,
"loss": 1.3284,
"step": 8
},
{
"epoch": 0.08108108108108109,
"grad_norm": 4.532634258270264,
"learning_rate": 0.00018,
"loss": 1.9217,
"step": 9
},
{
"epoch": 0.09009009009009009,
"grad_norm": 4.361605167388916,
"learning_rate": 0.0002,
"loss": 1.4035,
"step": 10
},
{
"epoch": 0.0990990990990991,
"grad_norm": 4.356982707977295,
"learning_rate": 0.0001990566037735849,
"loss": 1.7363,
"step": 11
},
{
"epoch": 0.10810810810810811,
"grad_norm": 5.593740940093994,
"learning_rate": 0.00019811320754716983,
"loss": 1.1967,
"step": 12
},
{
"epoch": 0.11711711711711711,
"grad_norm": 4.971708297729492,
"learning_rate": 0.00019716981132075472,
"loss": 1.2498,
"step": 13
},
{
"epoch": 0.12612612612612611,
"grad_norm": 4.279292106628418,
"learning_rate": 0.00019622641509433963,
"loss": 1.2859,
"step": 14
},
{
"epoch": 0.13513513513513514,
"grad_norm": 6.121211051940918,
"learning_rate": 0.00019528301886792454,
"loss": 1.2168,
"step": 15
},
{
"epoch": 0.14414414414414414,
"grad_norm": 5.0355377197265625,
"learning_rate": 0.00019433962264150945,
"loss": 1.1547,
"step": 16
},
{
"epoch": 0.15315315315315314,
"grad_norm": 3.8945412635803223,
"learning_rate": 0.00019339622641509433,
"loss": 0.9262,
"step": 17
},
{
"epoch": 0.16216216216216217,
"grad_norm": 3.1203081607818604,
"learning_rate": 0.00019245283018867927,
"loss": 0.7669,
"step": 18
},
{
"epoch": 0.17117117117117117,
"grad_norm": 3.8455443382263184,
"learning_rate": 0.00019150943396226415,
"loss": 1.0904,
"step": 19
},
{
"epoch": 0.18018018018018017,
"grad_norm": 3.4534966945648193,
"learning_rate": 0.00019056603773584906,
"loss": 0.7505,
"step": 20
},
{
"epoch": 0.1891891891891892,
"grad_norm": 4.243616104125977,
"learning_rate": 0.00018962264150943397,
"loss": 1.3768,
"step": 21
},
{
"epoch": 0.1981981981981982,
"grad_norm": 3.4573705196380615,
"learning_rate": 0.00018867924528301889,
"loss": 0.9779,
"step": 22
},
{
"epoch": 0.2072072072072072,
"grad_norm": 5.369935512542725,
"learning_rate": 0.00018773584905660377,
"loss": 1.2219,
"step": 23
},
{
"epoch": 0.21621621621621623,
"grad_norm": 3.7894225120544434,
"learning_rate": 0.00018679245283018868,
"loss": 1.0627,
"step": 24
},
{
"epoch": 0.22522522522522523,
"grad_norm": 2.4575483798980713,
"learning_rate": 0.0001858490566037736,
"loss": 0.7111,
"step": 25
},
{
"epoch": 0.23423423423423423,
"grad_norm": 3.8878819942474365,
"learning_rate": 0.0001849056603773585,
"loss": 1.0111,
"step": 26
},
{
"epoch": 0.24324324324324326,
"grad_norm": 2.962925434112549,
"learning_rate": 0.00018396226415094339,
"loss": 0.9287,
"step": 27
},
{
"epoch": 0.25225225225225223,
"grad_norm": 4.2727460861206055,
"learning_rate": 0.00018301886792452832,
"loss": 1.8179,
"step": 28
},
{
"epoch": 0.26126126126126126,
"grad_norm": 2.872020721435547,
"learning_rate": 0.0001820754716981132,
"loss": 1.1652,
"step": 29
},
{
"epoch": 0.2702702702702703,
"grad_norm": 2.749438524246216,
"learning_rate": 0.00018113207547169812,
"loss": 0.9771,
"step": 30
},
{
"epoch": 0.27927927927927926,
"grad_norm": 2.3074300289154053,
"learning_rate": 0.00018018867924528303,
"loss": 0.9235,
"step": 31
},
{
"epoch": 0.2882882882882883,
"grad_norm": 2.2534520626068115,
"learning_rate": 0.00017924528301886794,
"loss": 0.7866,
"step": 32
},
{
"epoch": 0.2972972972972973,
"grad_norm": 2.3018388748168945,
"learning_rate": 0.00017830188679245282,
"loss": 1.0289,
"step": 33
},
{
"epoch": 0.3063063063063063,
"grad_norm": 2.2766075134277344,
"learning_rate": 0.00017735849056603776,
"loss": 0.794,
"step": 34
},
{
"epoch": 0.3153153153153153,
"grad_norm": 2.94279146194458,
"learning_rate": 0.00017641509433962265,
"loss": 1.0258,
"step": 35
},
{
"epoch": 0.32432432432432434,
"grad_norm": 2.6279854774475098,
"learning_rate": 0.00017547169811320756,
"loss": 0.8012,
"step": 36
},
{
"epoch": 0.3333333333333333,
"grad_norm": 2.9557905197143555,
"learning_rate": 0.00017452830188679247,
"loss": 0.9417,
"step": 37
},
{
"epoch": 0.34234234234234234,
"grad_norm": 3.0805795192718506,
"learning_rate": 0.00017358490566037738,
"loss": 0.9646,
"step": 38
},
{
"epoch": 0.35135135135135137,
"grad_norm": 2.9339115619659424,
"learning_rate": 0.00017264150943396226,
"loss": 0.7578,
"step": 39
},
{
"epoch": 0.36036036036036034,
"grad_norm": 2.9891958236694336,
"learning_rate": 0.00017169811320754717,
"loss": 1.0922,
"step": 40
},
{
"epoch": 0.36936936936936937,
"grad_norm": 3.38738751411438,
"learning_rate": 0.00017075471698113208,
"loss": 0.8558,
"step": 41
},
{
"epoch": 0.3783783783783784,
"grad_norm": 4.405691623687744,
"learning_rate": 0.000169811320754717,
"loss": 1.2379,
"step": 42
},
{
"epoch": 0.38738738738738737,
"grad_norm": 2.889787197113037,
"learning_rate": 0.00016886792452830188,
"loss": 0.7855,
"step": 43
},
{
"epoch": 0.3963963963963964,
"grad_norm": 3.345532178878784,
"learning_rate": 0.00016792452830188682,
"loss": 1.1649,
"step": 44
},
{
"epoch": 0.40540540540540543,
"grad_norm": 3.4532392024993896,
"learning_rate": 0.0001669811320754717,
"loss": 0.762,
"step": 45
},
{
"epoch": 0.40540540540540543,
"eval_loss": 1.0260719060897827,
"eval_runtime": 6.7753,
"eval_samples_per_second": 3.69,
"eval_steps_per_second": 3.69,
"step": 45
},
{
"epoch": 0.4144144144144144,
"grad_norm": 3.782933473587036,
"learning_rate": 0.0001660377358490566,
"loss": 0.9724,
"step": 46
},
{
"epoch": 0.42342342342342343,
"grad_norm": 3.206749677658081,
"learning_rate": 0.00016509433962264152,
"loss": 0.8519,
"step": 47
},
{
"epoch": 0.43243243243243246,
"grad_norm": 3.8330488204956055,
"learning_rate": 0.00016415094339622643,
"loss": 1.1495,
"step": 48
},
{
"epoch": 0.44144144144144143,
"grad_norm": 3.997997283935547,
"learning_rate": 0.00016320754716981132,
"loss": 1.0858,
"step": 49
},
{
"epoch": 0.45045045045045046,
"grad_norm": 3.165234327316284,
"learning_rate": 0.00016226415094339625,
"loss": 0.5681,
"step": 50
},
{
"epoch": 0.4594594594594595,
"grad_norm": 3.4459192752838135,
"learning_rate": 0.00016132075471698114,
"loss": 0.8332,
"step": 51
},
{
"epoch": 0.46846846846846846,
"grad_norm": 2.606905698776245,
"learning_rate": 0.00016037735849056605,
"loss": 0.7737,
"step": 52
},
{
"epoch": 0.4774774774774775,
"grad_norm": 4.07294225692749,
"learning_rate": 0.00015943396226415096,
"loss": 1.0516,
"step": 53
},
{
"epoch": 0.4864864864864865,
"grad_norm": 6.001366138458252,
"learning_rate": 0.00015849056603773587,
"loss": 1.1525,
"step": 54
},
{
"epoch": 0.4954954954954955,
"grad_norm": 2.798070192337036,
"learning_rate": 0.00015754716981132075,
"loss": 0.9808,
"step": 55
},
{
"epoch": 0.5045045045045045,
"grad_norm": 3.14383864402771,
"learning_rate": 0.00015660377358490567,
"loss": 1.1579,
"step": 56
},
{
"epoch": 0.5135135135135135,
"grad_norm": 4.082361221313477,
"learning_rate": 0.00015566037735849058,
"loss": 1.2818,
"step": 57
},
{
"epoch": 0.5225225225225225,
"grad_norm": 2.872138261795044,
"learning_rate": 0.0001547169811320755,
"loss": 0.916,
"step": 58
},
{
"epoch": 0.5315315315315315,
"grad_norm": 2.5635435581207275,
"learning_rate": 0.00015377358490566037,
"loss": 0.9273,
"step": 59
},
{
"epoch": 0.5405405405405406,
"grad_norm": 3.1948022842407227,
"learning_rate": 0.0001528301886792453,
"loss": 0.8251,
"step": 60
},
{
"epoch": 0.5495495495495496,
"grad_norm": 2.7782211303710938,
"learning_rate": 0.0001518867924528302,
"loss": 0.9499,
"step": 61
},
{
"epoch": 0.5585585585585585,
"grad_norm": 3.423865556716919,
"learning_rate": 0.0001509433962264151,
"loss": 0.7943,
"step": 62
},
{
"epoch": 0.5675675675675675,
"grad_norm": 3.791781187057495,
"learning_rate": 0.00015000000000000001,
"loss": 1.2937,
"step": 63
},
{
"epoch": 0.5765765765765766,
"grad_norm": 2.6040596961975098,
"learning_rate": 0.0001490566037735849,
"loss": 0.9989,
"step": 64
},
{
"epoch": 0.5855855855855856,
"grad_norm": 2.505021095275879,
"learning_rate": 0.0001481132075471698,
"loss": 0.7453,
"step": 65
},
{
"epoch": 0.5945945945945946,
"grad_norm": 2.384697198867798,
"learning_rate": 0.00014716981132075472,
"loss": 0.8239,
"step": 66
},
{
"epoch": 0.6036036036036037,
"grad_norm": 2.4495139122009277,
"learning_rate": 0.00014622641509433963,
"loss": 0.7156,
"step": 67
},
{
"epoch": 0.6126126126126126,
"grad_norm": 2.30027437210083,
"learning_rate": 0.00014528301886792451,
"loss": 0.7609,
"step": 68
},
{
"epoch": 0.6216216216216216,
"grad_norm": 2.8271803855895996,
"learning_rate": 0.00014433962264150945,
"loss": 0.8091,
"step": 69
},
{
"epoch": 0.6306306306306306,
"grad_norm": 3.0241498947143555,
"learning_rate": 0.00014339622641509434,
"loss": 1.0036,
"step": 70
},
{
"epoch": 0.6396396396396397,
"grad_norm": 3.4984843730926514,
"learning_rate": 0.00014245283018867925,
"loss": 1.0187,
"step": 71
},
{
"epoch": 0.6486486486486487,
"grad_norm": 2.9161272048950195,
"learning_rate": 0.00014150943396226416,
"loss": 0.8462,
"step": 72
},
{
"epoch": 0.6576576576576577,
"grad_norm": 4.072527885437012,
"learning_rate": 0.00014056603773584907,
"loss": 0.8637,
"step": 73
},
{
"epoch": 0.6666666666666666,
"grad_norm": 3.195216655731201,
"learning_rate": 0.00013962264150943395,
"loss": 1.0614,
"step": 74
},
{
"epoch": 0.6756756756756757,
"grad_norm": 3.911717653274536,
"learning_rate": 0.0001386792452830189,
"loss": 1.0033,
"step": 75
},
{
"epoch": 0.6846846846846847,
"grad_norm": 3.8403871059417725,
"learning_rate": 0.00013773584905660377,
"loss": 1.0585,
"step": 76
},
{
"epoch": 0.6936936936936937,
"grad_norm": 3.337313413619995,
"learning_rate": 0.00013679245283018868,
"loss": 0.8158,
"step": 77
},
{
"epoch": 0.7027027027027027,
"grad_norm": 2.4403326511383057,
"learning_rate": 0.0001358490566037736,
"loss": 0.7053,
"step": 78
},
{
"epoch": 0.7117117117117117,
"grad_norm": 1.832205057144165,
"learning_rate": 0.0001349056603773585,
"loss": 0.3911,
"step": 79
},
{
"epoch": 0.7207207207207207,
"grad_norm": 4.23843240737915,
"learning_rate": 0.0001339622641509434,
"loss": 1.0908,
"step": 80
},
{
"epoch": 0.7297297297297297,
"grad_norm": 3.123248815536499,
"learning_rate": 0.0001330188679245283,
"loss": 1.0456,
"step": 81
},
{
"epoch": 0.7387387387387387,
"grad_norm": 2.5214996337890625,
"learning_rate": 0.0001320754716981132,
"loss": 1.6093,
"step": 82
},
{
"epoch": 0.7477477477477478,
"grad_norm": 3.6226158142089844,
"learning_rate": 0.00013113207547169812,
"loss": 1.0943,
"step": 83
},
{
"epoch": 0.7567567567567568,
"grad_norm": 2.489712715148926,
"learning_rate": 0.000130188679245283,
"loss": 0.9914,
"step": 84
},
{
"epoch": 0.7657657657657657,
"grad_norm": 2.5745816230773926,
"learning_rate": 0.00012924528301886794,
"loss": 1.0307,
"step": 85
},
{
"epoch": 0.7747747747747747,
"grad_norm": 2.9188766479492188,
"learning_rate": 0.00012830188679245283,
"loss": 1.1999,
"step": 86
},
{
"epoch": 0.7837837837837838,
"grad_norm": 2.2265069484710693,
"learning_rate": 0.00012735849056603774,
"loss": 0.9198,
"step": 87
},
{
"epoch": 0.7927927927927928,
"grad_norm": 2.4985668659210205,
"learning_rate": 0.00012641509433962265,
"loss": 0.9371,
"step": 88
},
{
"epoch": 0.8018018018018018,
"grad_norm": 2.92549467086792,
"learning_rate": 0.00012547169811320756,
"loss": 0.8774,
"step": 89
},
{
"epoch": 0.8108108108108109,
"grad_norm": 2.601806640625,
"learning_rate": 0.00012452830188679244,
"loss": 0.8954,
"step": 90
},
{
"epoch": 0.8108108108108109,
"eval_loss": 0.9327961206436157,
"eval_runtime": 6.5961,
"eval_samples_per_second": 3.79,
"eval_steps_per_second": 3.79,
"step": 90
},
{
"epoch": 0.8198198198198198,
"grad_norm": 2.4627206325531006,
"learning_rate": 0.00012358490566037738,
"loss": 0.9299,
"step": 91
},
{
"epoch": 0.8288288288288288,
"grad_norm": 2.2025997638702393,
"learning_rate": 0.00012264150943396227,
"loss": 0.6651,
"step": 92
},
{
"epoch": 0.8378378378378378,
"grad_norm": 1.866162896156311,
"learning_rate": 0.00012169811320754718,
"loss": 0.5669,
"step": 93
},
{
"epoch": 0.8468468468468469,
"grad_norm": 3.0936880111694336,
"learning_rate": 0.00012075471698113207,
"loss": 0.8466,
"step": 94
},
{
"epoch": 0.8558558558558559,
"grad_norm": 2.012234687805176,
"learning_rate": 0.000119811320754717,
"loss": 0.7128,
"step": 95
},
{
"epoch": 0.8648648648648649,
"grad_norm": 3.379054069519043,
"learning_rate": 0.00011886792452830188,
"loss": 0.907,
"step": 96
},
{
"epoch": 0.8738738738738738,
"grad_norm": 2.7869811058044434,
"learning_rate": 0.00011792452830188681,
"loss": 1.0776,
"step": 97
},
{
"epoch": 0.8828828828828829,
"grad_norm": 2.5204427242279053,
"learning_rate": 0.0001169811320754717,
"loss": 0.8508,
"step": 98
},
{
"epoch": 0.8918918918918919,
"grad_norm": 2.2128093242645264,
"learning_rate": 0.00011603773584905662,
"loss": 0.8263,
"step": 99
},
{
"epoch": 0.9009009009009009,
"grad_norm": 2.7429111003875732,
"learning_rate": 0.00011509433962264151,
"loss": 0.8288,
"step": 100
},
{
"epoch": 0.9099099099099099,
"grad_norm": 2.871586799621582,
"learning_rate": 0.00011415094339622642,
"loss": 0.9234,
"step": 101
},
{
"epoch": 0.918918918918919,
"grad_norm": 2.1617884635925293,
"learning_rate": 0.00011320754716981132,
"loss": 0.5383,
"step": 102
},
{
"epoch": 0.9279279279279279,
"grad_norm": 2.360563278198242,
"learning_rate": 0.00011226415094339624,
"loss": 0.7094,
"step": 103
},
{
"epoch": 0.9369369369369369,
"grad_norm": 1.9730867147445679,
"learning_rate": 0.00011132075471698113,
"loss": 0.453,
"step": 104
},
{
"epoch": 0.9459459459459459,
"grad_norm": 4.094314098358154,
"learning_rate": 0.00011037735849056605,
"loss": 1.3654,
"step": 105
},
{
"epoch": 0.954954954954955,
"grad_norm": 4.355881690979004,
"learning_rate": 0.00010943396226415095,
"loss": 0.9246,
"step": 106
},
{
"epoch": 0.963963963963964,
"grad_norm": 2.3971104621887207,
"learning_rate": 0.00010849056603773586,
"loss": 0.7235,
"step": 107
},
{
"epoch": 0.972972972972973,
"grad_norm": 3.30466365814209,
"learning_rate": 0.00010754716981132076,
"loss": 0.8336,
"step": 108
},
{
"epoch": 0.9819819819819819,
"grad_norm": 2.125714063644409,
"learning_rate": 0.00010660377358490567,
"loss": 0.465,
"step": 109
},
{
"epoch": 0.990990990990991,
"grad_norm": 3.2173519134521484,
"learning_rate": 0.00010566037735849057,
"loss": 0.9495,
"step": 110
},
{
"epoch": 1.0,
"grad_norm": 3.191514730453491,
"learning_rate": 0.00010471698113207549,
"loss": 0.627,
"step": 111
},
{
"epoch": 1.009009009009009,
"grad_norm": 2.0435428619384766,
"learning_rate": 0.00010377358490566037,
"loss": 0.7347,
"step": 112
},
{
"epoch": 1.018018018018018,
"grad_norm": 1.9830867052078247,
"learning_rate": 0.0001028301886792453,
"loss": 0.4888,
"step": 113
},
{
"epoch": 1.027027027027027,
"grad_norm": 1.857865810394287,
"learning_rate": 0.0001018867924528302,
"loss": 0.4856,
"step": 114
},
{
"epoch": 1.0360360360360361,
"grad_norm": 1.8800874948501587,
"learning_rate": 0.00010094339622641511,
"loss": 0.5167,
"step": 115
},
{
"epoch": 1.045045045045045,
"grad_norm": 1.8338440656661987,
"learning_rate": 0.0001,
"loss": 0.5583,
"step": 116
},
{
"epoch": 1.054054054054054,
"grad_norm": 1.6658433675765991,
"learning_rate": 9.905660377358492e-05,
"loss": 0.4015,
"step": 117
},
{
"epoch": 1.063063063063063,
"grad_norm": 1.9992265701293945,
"learning_rate": 9.811320754716981e-05,
"loss": 0.6464,
"step": 118
},
{
"epoch": 1.072072072072072,
"grad_norm": 2.135561943054199,
"learning_rate": 9.716981132075472e-05,
"loss": 0.5836,
"step": 119
},
{
"epoch": 1.0810810810810811,
"grad_norm": 2.490431785583496,
"learning_rate": 9.622641509433963e-05,
"loss": 0.5638,
"step": 120
},
{
"epoch": 1.09009009009009,
"grad_norm": 1.7593251466751099,
"learning_rate": 9.528301886792453e-05,
"loss": 0.4942,
"step": 121
},
{
"epoch": 1.0990990990990992,
"grad_norm": 1.9234812259674072,
"learning_rate": 9.433962264150944e-05,
"loss": 0.4357,
"step": 122
},
{
"epoch": 1.1081081081081081,
"grad_norm": 1.5386407375335693,
"learning_rate": 9.339622641509434e-05,
"loss": 0.3261,
"step": 123
},
{
"epoch": 1.117117117117117,
"grad_norm": 1.7715940475463867,
"learning_rate": 9.245283018867925e-05,
"loss": 0.3987,
"step": 124
},
{
"epoch": 1.1261261261261262,
"grad_norm": 3.185229539871216,
"learning_rate": 9.150943396226416e-05,
"loss": 0.6788,
"step": 125
},
{
"epoch": 1.135135135135135,
"grad_norm": 2.2249863147735596,
"learning_rate": 9.056603773584906e-05,
"loss": 0.6261,
"step": 126
},
{
"epoch": 1.1441441441441442,
"grad_norm": 1.9929211139678955,
"learning_rate": 8.962264150943397e-05,
"loss": 0.5346,
"step": 127
},
{
"epoch": 1.1531531531531531,
"grad_norm": 2.3811631202697754,
"learning_rate": 8.867924528301888e-05,
"loss": 0.5619,
"step": 128
},
{
"epoch": 1.1621621621621623,
"grad_norm": 2.218947649002075,
"learning_rate": 8.773584905660378e-05,
"loss": 0.4524,
"step": 129
},
{
"epoch": 1.1711711711711712,
"grad_norm": 1.86408531665802,
"learning_rate": 8.679245283018869e-05,
"loss": 0.4306,
"step": 130
},
{
"epoch": 1.1801801801801801,
"grad_norm": 2.1098172664642334,
"learning_rate": 8.584905660377359e-05,
"loss": 0.3715,
"step": 131
},
{
"epoch": 1.1891891891891893,
"grad_norm": 1.9560171365737915,
"learning_rate": 8.49056603773585e-05,
"loss": 0.4217,
"step": 132
},
{
"epoch": 1.1981981981981982,
"grad_norm": 1.9112765789031982,
"learning_rate": 8.396226415094341e-05,
"loss": 0.4395,
"step": 133
},
{
"epoch": 1.2072072072072073,
"grad_norm": 2.1735923290252686,
"learning_rate": 8.30188679245283e-05,
"loss": 0.4511,
"step": 134
},
{
"epoch": 1.2162162162162162,
"grad_norm": 2.0818467140197754,
"learning_rate": 8.207547169811322e-05,
"loss": 0.6096,
"step": 135
},
{
"epoch": 1.2162162162162162,
"eval_loss": 0.938506543636322,
"eval_runtime": 6.5372,
"eval_samples_per_second": 3.824,
"eval_steps_per_second": 3.824,
"step": 135
},
{
"epoch": 1.2252252252252251,
"grad_norm": 2.418597936630249,
"learning_rate": 8.113207547169813e-05,
"loss": 0.3678,
"step": 136
},
{
"epoch": 1.2342342342342343,
"grad_norm": 1.673642635345459,
"learning_rate": 8.018867924528302e-05,
"loss": 0.3768,
"step": 137
},
{
"epoch": 1.2432432432432432,
"grad_norm": 2.2989706993103027,
"learning_rate": 7.924528301886794e-05,
"loss": 0.6057,
"step": 138
},
{
"epoch": 1.2522522522522523,
"grad_norm": 8.500811576843262,
"learning_rate": 7.830188679245283e-05,
"loss": 1.245,
"step": 139
},
{
"epoch": 1.2612612612612613,
"grad_norm": 2.4313371181488037,
"learning_rate": 7.735849056603774e-05,
"loss": 0.6303,
"step": 140
},
{
"epoch": 1.2702702702702702,
"grad_norm": 3.1638753414154053,
"learning_rate": 7.641509433962265e-05,
"loss": 0.6185,
"step": 141
},
{
"epoch": 1.2792792792792793,
"grad_norm": 1.9501088857650757,
"learning_rate": 7.547169811320755e-05,
"loss": 0.4823,
"step": 142
},
{
"epoch": 1.2882882882882882,
"grad_norm": 2.220900774002075,
"learning_rate": 7.452830188679245e-05,
"loss": 0.5895,
"step": 143
},
{
"epoch": 1.2972972972972974,
"grad_norm": 2.5282726287841797,
"learning_rate": 7.358490566037736e-05,
"loss": 0.4811,
"step": 144
},
{
"epoch": 1.3063063063063063,
"grad_norm": 2.469804286956787,
"learning_rate": 7.264150943396226e-05,
"loss": 0.457,
"step": 145
},
{
"epoch": 1.3153153153153152,
"grad_norm": 2.204465627670288,
"learning_rate": 7.169811320754717e-05,
"loss": 0.4434,
"step": 146
},
{
"epoch": 1.3243243243243243,
"grad_norm": 1.7984890937805176,
"learning_rate": 7.075471698113208e-05,
"loss": 0.521,
"step": 147
},
{
"epoch": 1.3333333333333333,
"grad_norm": 2.258803367614746,
"learning_rate": 6.981132075471698e-05,
"loss": 0.5292,
"step": 148
},
{
"epoch": 1.3423423423423424,
"grad_norm": 2.650085210800171,
"learning_rate": 6.886792452830189e-05,
"loss": 0.5947,
"step": 149
},
{
"epoch": 1.3513513513513513,
"grad_norm": 2.604031801223755,
"learning_rate": 6.79245283018868e-05,
"loss": 0.4763,
"step": 150
},
{
"epoch": 1.3603603603603602,
"grad_norm": 1.9394235610961914,
"learning_rate": 6.69811320754717e-05,
"loss": 0.3715,
"step": 151
},
{
"epoch": 1.3693693693693694,
"grad_norm": 2.0028188228607178,
"learning_rate": 6.60377358490566e-05,
"loss": 0.4297,
"step": 152
},
{
"epoch": 1.3783783783783785,
"grad_norm": 2.303760528564453,
"learning_rate": 6.50943396226415e-05,
"loss": 0.2321,
"step": 153
},
{
"epoch": 1.3873873873873874,
"grad_norm": 2.2970705032348633,
"learning_rate": 6.415094339622641e-05,
"loss": 0.634,
"step": 154
},
{
"epoch": 1.3963963963963963,
"grad_norm": 2.137401580810547,
"learning_rate": 6.320754716981132e-05,
"loss": 0.6179,
"step": 155
},
{
"epoch": 1.4054054054054055,
"grad_norm": 3.294367790222168,
"learning_rate": 6.226415094339622e-05,
"loss": 0.5386,
"step": 156
},
{
"epoch": 1.4144144144144144,
"grad_norm": 1.749168038368225,
"learning_rate": 6.132075471698113e-05,
"loss": 0.4541,
"step": 157
},
{
"epoch": 1.4234234234234235,
"grad_norm": 2.023388147354126,
"learning_rate": 6.037735849056604e-05,
"loss": 0.613,
"step": 158
},
{
"epoch": 1.4324324324324325,
"grad_norm": 2.0432538986206055,
"learning_rate": 5.943396226415094e-05,
"loss": 0.4197,
"step": 159
},
{
"epoch": 1.4414414414414414,
"grad_norm": 1.9854912757873535,
"learning_rate": 5.849056603773585e-05,
"loss": 0.405,
"step": 160
},
{
"epoch": 1.4504504504504505,
"grad_norm": 1.736327886581421,
"learning_rate": 5.7547169811320756e-05,
"loss": 0.4641,
"step": 161
},
{
"epoch": 1.4594594594594594,
"grad_norm": 2.002995729446411,
"learning_rate": 5.660377358490566e-05,
"loss": 0.4525,
"step": 162
},
{
"epoch": 1.4684684684684686,
"grad_norm": 1.5092021226882935,
"learning_rate": 5.5660377358490564e-05,
"loss": 0.4011,
"step": 163
},
{
"epoch": 1.4774774774774775,
"grad_norm": 2.3325278759002686,
"learning_rate": 5.4716981132075475e-05,
"loss": 0.5904,
"step": 164
},
{
"epoch": 1.4864864864864864,
"grad_norm": 1.8346065282821655,
"learning_rate": 5.377358490566038e-05,
"loss": 0.4321,
"step": 165
},
{
"epoch": 1.4954954954954955,
"grad_norm": 2.4442012310028076,
"learning_rate": 5.283018867924528e-05,
"loss": 0.6115,
"step": 166
},
{
"epoch": 1.5045045045045045,
"grad_norm": 1.691246509552002,
"learning_rate": 5.188679245283019e-05,
"loss": 0.5062,
"step": 167
},
{
"epoch": 1.5135135135135136,
"grad_norm": 1.7400901317596436,
"learning_rate": 5.09433962264151e-05,
"loss": 0.6291,
"step": 168
},
{
"epoch": 1.5225225225225225,
"grad_norm": 1.753957748413086,
"learning_rate": 5e-05,
"loss": 0.5185,
"step": 169
},
{
"epoch": 1.5315315315315314,
"grad_norm": 2.275676965713501,
"learning_rate": 4.9056603773584906e-05,
"loss": 0.6709,
"step": 170
},
{
"epoch": 1.5405405405405406,
"grad_norm": 1.8429116010665894,
"learning_rate": 4.811320754716982e-05,
"loss": 0.471,
"step": 171
},
{
"epoch": 1.5495495495495497,
"grad_norm": 2.2381837368011475,
"learning_rate": 4.716981132075472e-05,
"loss": 0.5187,
"step": 172
},
{
"epoch": 1.5585585585585586,
"grad_norm": 1.5086891651153564,
"learning_rate": 4.6226415094339625e-05,
"loss": 0.4119,
"step": 173
},
{
"epoch": 1.5675675675675675,
"grad_norm": 1.8084434270858765,
"learning_rate": 4.528301886792453e-05,
"loss": 0.4377,
"step": 174
},
{
"epoch": 1.5765765765765765,
"grad_norm": 1.5693408250808716,
"learning_rate": 4.433962264150944e-05,
"loss": 0.367,
"step": 175
},
{
"epoch": 1.5855855855855856,
"grad_norm": 1.929656744003296,
"learning_rate": 4.3396226415094345e-05,
"loss": 0.5566,
"step": 176
},
{
"epoch": 1.5945945945945947,
"grad_norm": 1.9675954580307007,
"learning_rate": 4.245283018867925e-05,
"loss": 0.3592,
"step": 177
},
{
"epoch": 1.6036036036036037,
"grad_norm": 2.125093698501587,
"learning_rate": 4.150943396226415e-05,
"loss": 0.6002,
"step": 178
},
{
"epoch": 1.6126126126126126,
"grad_norm": 1.3417986631393433,
"learning_rate": 4.0566037735849064e-05,
"loss": 0.3906,
"step": 179
},
{
"epoch": 1.6216216216216215,
"grad_norm": 1.925667405128479,
"learning_rate": 3.962264150943397e-05,
"loss": 0.5283,
"step": 180
},
{
"epoch": 1.6216216216216215,
"eval_loss": 0.8673837184906006,
"eval_runtime": 6.5567,
"eval_samples_per_second": 3.813,
"eval_steps_per_second": 3.813,
"step": 180
},
{
"epoch": 1.6306306306306306,
"grad_norm": 1.935304045677185,
"learning_rate": 3.867924528301887e-05,
"loss": 0.5695,
"step": 181
},
{
"epoch": 1.6396396396396398,
"grad_norm": 2.1937944889068604,
"learning_rate": 3.7735849056603776e-05,
"loss": 0.5156,
"step": 182
},
{
"epoch": 1.6486486486486487,
"grad_norm": 2.2117178440093994,
"learning_rate": 3.679245283018868e-05,
"loss": 0.6712,
"step": 183
},
{
"epoch": 1.6576576576576576,
"grad_norm": 1.7522823810577393,
"learning_rate": 3.5849056603773584e-05,
"loss": 0.4351,
"step": 184
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.811461091041565,
"learning_rate": 3.490566037735849e-05,
"loss": 0.45,
"step": 185
},
{
"epoch": 1.6756756756756757,
"grad_norm": 1.9774659872055054,
"learning_rate": 3.39622641509434e-05,
"loss": 0.4842,
"step": 186
},
{
"epoch": 1.6846846846846848,
"grad_norm": 2.0509095191955566,
"learning_rate": 3.30188679245283e-05,
"loss": 0.3627,
"step": 187
},
{
"epoch": 1.6936936936936937,
"grad_norm": 1.9894822835922241,
"learning_rate": 3.207547169811321e-05,
"loss": 0.3972,
"step": 188
},
{
"epoch": 1.7027027027027026,
"grad_norm": 2.2051963806152344,
"learning_rate": 3.113207547169811e-05,
"loss": 0.4134,
"step": 189
},
{
"epoch": 1.7117117117117115,
"grad_norm": 1.8894035816192627,
"learning_rate": 3.018867924528302e-05,
"loss": 0.4331,
"step": 190
},
{
"epoch": 1.7207207207207207,
"grad_norm": 1.7276670932769775,
"learning_rate": 2.9245283018867926e-05,
"loss": 0.4131,
"step": 191
},
{
"epoch": 1.7297297297297298,
"grad_norm": 2.4766604900360107,
"learning_rate": 2.830188679245283e-05,
"loss": 0.4826,
"step": 192
},
{
"epoch": 1.7387387387387387,
"grad_norm": 1.2023714780807495,
"learning_rate": 2.7358490566037738e-05,
"loss": 0.3832,
"step": 193
},
{
"epoch": 1.7477477477477477,
"grad_norm": 1.8278062343597412,
"learning_rate": 2.641509433962264e-05,
"loss": 0.5278,
"step": 194
},
{
"epoch": 1.7567567567567568,
"grad_norm": 1.6827036142349243,
"learning_rate": 2.547169811320755e-05,
"loss": 0.5348,
"step": 195
},
{
"epoch": 1.7657657657657657,
"grad_norm": 1.076462745666504,
"learning_rate": 2.4528301886792453e-05,
"loss": 0.1987,
"step": 196
},
{
"epoch": 1.7747747747747749,
"grad_norm": 1.7320923805236816,
"learning_rate": 2.358490566037736e-05,
"loss": 0.4077,
"step": 197
},
{
"epoch": 1.7837837837837838,
"grad_norm": 1.6438912153244019,
"learning_rate": 2.2641509433962265e-05,
"loss": 0.5042,
"step": 198
},
{
"epoch": 1.7927927927927927,
"grad_norm": 2.4654927253723145,
"learning_rate": 2.1698113207547172e-05,
"loss": 0.5927,
"step": 199
},
{
"epoch": 1.8018018018018018,
"grad_norm": 2.1257989406585693,
"learning_rate": 2.0754716981132076e-05,
"loss": 0.4004,
"step": 200
},
{
"epoch": 1.810810810810811,
"grad_norm": 2.446303367614746,
"learning_rate": 1.9811320754716984e-05,
"loss": 0.5578,
"step": 201
},
{
"epoch": 1.8198198198198199,
"grad_norm": 1.926680326461792,
"learning_rate": 1.8867924528301888e-05,
"loss": 0.4442,
"step": 202
},
{
"epoch": 1.8288288288288288,
"grad_norm": 1.7860006093978882,
"learning_rate": 1.7924528301886792e-05,
"loss": 0.3535,
"step": 203
},
{
"epoch": 1.8378378378378377,
"grad_norm": 1.7057844400405884,
"learning_rate": 1.69811320754717e-05,
"loss": 0.4485,
"step": 204
},
{
"epoch": 1.8468468468468469,
"grad_norm": 1.8940867185592651,
"learning_rate": 1.6037735849056604e-05,
"loss": 0.3801,
"step": 205
},
{
"epoch": 1.855855855855856,
"grad_norm": 2.496486186981201,
"learning_rate": 1.509433962264151e-05,
"loss": 0.4679,
"step": 206
},
{
"epoch": 1.864864864864865,
"grad_norm": 1.964465618133545,
"learning_rate": 1.4150943396226415e-05,
"loss": 0.4486,
"step": 207
},
{
"epoch": 1.8738738738738738,
"grad_norm": 1.72641921043396,
"learning_rate": 1.320754716981132e-05,
"loss": 0.2787,
"step": 208
},
{
"epoch": 1.8828828828828827,
"grad_norm": 2.9086968898773193,
"learning_rate": 1.2264150943396227e-05,
"loss": 0.6431,
"step": 209
},
{
"epoch": 1.8918918918918919,
"grad_norm": 2.210435628890991,
"learning_rate": 1.1320754716981132e-05,
"loss": 0.3728,
"step": 210
},
{
"epoch": 1.900900900900901,
"grad_norm": 3.138334274291992,
"learning_rate": 1.0377358490566038e-05,
"loss": 0.4852,
"step": 211
},
{
"epoch": 1.90990990990991,
"grad_norm": 2.495166063308716,
"learning_rate": 9.433962264150944e-06,
"loss": 0.6174,
"step": 212
},
{
"epoch": 1.9189189189189189,
"grad_norm": 1.8811306953430176,
"learning_rate": 8.49056603773585e-06,
"loss": 0.3751,
"step": 213
},
{
"epoch": 1.9279279279279278,
"grad_norm": 2.646571159362793,
"learning_rate": 7.547169811320755e-06,
"loss": 0.5507,
"step": 214
},
{
"epoch": 1.936936936936937,
"grad_norm": 2.1849875450134277,
"learning_rate": 6.60377358490566e-06,
"loss": 0.3505,
"step": 215
},
{
"epoch": 1.945945945945946,
"grad_norm": 2.030783176422119,
"learning_rate": 5.660377358490566e-06,
"loss": 0.3739,
"step": 216
},
{
"epoch": 1.954954954954955,
"grad_norm": 2.0192530155181885,
"learning_rate": 4.716981132075472e-06,
"loss": 0.3691,
"step": 217
},
{
"epoch": 1.9639639639639639,
"grad_norm": 2.651379346847534,
"learning_rate": 3.7735849056603773e-06,
"loss": 0.652,
"step": 218
},
{
"epoch": 1.972972972972973,
"grad_norm": 2.3628175258636475,
"learning_rate": 2.830188679245283e-06,
"loss": 0.4631,
"step": 219
},
{
"epoch": 1.981981981981982,
"grad_norm": 2.846590518951416,
"learning_rate": 1.8867924528301887e-06,
"loss": 0.5564,
"step": 220
},
{
"epoch": 1.990990990990991,
"grad_norm": 1.986903429031372,
"learning_rate": 9.433962264150943e-07,
"loss": 0.3621,
"step": 221
},
{
"epoch": 2.0,
"grad_norm": 2.368159770965576,
"learning_rate": 0.0,
"loss": 0.4097,
"step": 222
}
],
"logging_steps": 1,
"max_steps": 222,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1024624247734272.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}