Phương committed on
Commit
069572c
·
1 Parent(s): f3bb109

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+
18
+ The following `bitsandbytes` quantization config was used during training:
19
+ - load_in_8bit: True
20
+ - load_in_4bit: False
21
+ - llm_int8_threshold: 6.0
22
+ - llm_int8_skip_modules: None
23
+ - llm_int8_enable_fp32_cpu_offload: False
24
+ - llm_int8_has_fp16_weight: False
25
+ - bnb_4bit_quant_type: fp4
26
+ - bnb_4bit_use_double_quant: False
27
+ - bnb_4bit_compute_dtype: float32
28
+ ### Framework versions
29
+
30
+ - PEFT 0.4.0
31
+
32
+ - PEFT 0.4.0
adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "EleutherAI/gpt-j-6b",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c830e48b17a3ddc768effe4ad6621e1d95161c3a18b1d4163dae933618603c36
3
+ size 11069613
checkpoint-200/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-200/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "EleutherAI/gpt-j-6b",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-200/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdbec6410587fffddeb55ecd4844e7963615c055b4ec220cf5b6e78df4471400
3
+ size 11069613
checkpoint-200/adapter_model/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-200/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "EleutherAI/gpt-j-6b",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-200/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdbec6410587fffddeb55ecd4844e7963615c055b4ec220cf5b6e78df4471400
3
+ size 11069613
checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6406621bb551c75cb5986f5257154101e5535fcc3e3b7ffdb4d67eb2025bb25a
3
+ size 2852293
checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8839cdafbeb5ee30636918c46809ccb433603424aa23b658cf5ea835ffc00a1
3
+ size 14575
checkpoint-200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50161b59cf45f2db4b9567ed721d07428ab6f3537708b36e3dc0a7eeb3e6bad1
3
+ size 627
checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,1536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.380952380952381,
5
+ "global_step": 200,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 2e-05,
13
+ "loss": 1.6335,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 4e-05,
19
+ "loss": 1.5176,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.04,
24
+ "learning_rate": 6e-05,
25
+ "loss": 1.4883,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.05,
30
+ "learning_rate": 8e-05,
31
+ "loss": 1.6,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.06,
36
+ "learning_rate": 0.0001,
37
+ "loss": 1.5088,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.06,
42
+ "eval_loss": 1.7048434019088745,
43
+ "eval_runtime": 2.1875,
44
+ "eval_samples_per_second": 0.914,
45
+ "eval_steps_per_second": 0.457,
46
+ "step": 5
47
+ },
48
+ {
49
+ "epoch": 0.07,
50
+ "learning_rate": 0.00012,
51
+ "loss": 1.4985,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.08,
56
+ "learning_rate": 0.00014,
57
+ "loss": 1.4626,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.1,
62
+ "learning_rate": 0.00016,
63
+ "loss": 1.3285,
64
+ "step": 8
65
+ },
66
+ {
67
+ "epoch": 0.11,
68
+ "learning_rate": 0.00018,
69
+ "loss": 1.6476,
70
+ "step": 9
71
+ },
72
+ {
73
+ "epoch": 0.12,
74
+ "learning_rate": 0.0002,
75
+ "loss": 1.5266,
76
+ "step": 10
77
+ },
78
+ {
79
+ "epoch": 0.12,
80
+ "eval_loss": 1.692796230316162,
81
+ "eval_runtime": 2.1867,
82
+ "eval_samples_per_second": 0.915,
83
+ "eval_steps_per_second": 0.457,
84
+ "step": 10
85
+ },
86
+ {
87
+ "epoch": 0.13,
88
+ "learning_rate": 0.0001999915737775817,
89
+ "loss": 1.6152,
90
+ "step": 11
91
+ },
92
+ {
93
+ "epoch": 0.14,
94
+ "learning_rate": 0.00019996629653035126,
95
+ "loss": 1.505,
96
+ "step": 12
97
+ },
98
+ {
99
+ "epoch": 0.15,
100
+ "learning_rate": 0.00019992417251814282,
101
+ "loss": 1.3107,
102
+ "step": 13
103
+ },
104
+ {
105
+ "epoch": 0.17,
106
+ "learning_rate": 0.00019986520883988232,
107
+ "loss": 1.3979,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.18,
112
+ "learning_rate": 0.0001997894154323911,
113
+ "loss": 1.2276,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.18,
118
+ "eval_loss": 1.6662951707839966,
119
+ "eval_runtime": 2.186,
120
+ "eval_samples_per_second": 0.915,
121
+ "eval_steps_per_second": 0.457,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.19,
126
+ "learning_rate": 0.00019969680506871137,
127
+ "loss": 1.7369,
128
+ "step": 16
129
+ },
130
+ {
131
+ "epoch": 0.2,
132
+ "learning_rate": 0.0001995873933559535,
133
+ "loss": 1.6659,
134
+ "step": 17
135
+ },
136
+ {
137
+ "epoch": 0.21,
138
+ "learning_rate": 0.00019946119873266613,
139
+ "loss": 1.1324,
140
+ "step": 18
141
+ },
142
+ {
143
+ "epoch": 0.23,
144
+ "learning_rate": 0.0001993182424657285,
145
+ "loss": 1.9695,
146
+ "step": 19
147
+ },
148
+ {
149
+ "epoch": 0.24,
150
+ "learning_rate": 0.00019915854864676664,
151
+ "loss": 2.5525,
152
+ "step": 20
153
+ },
154
+ {
155
+ "epoch": 0.24,
156
+ "eval_loss": 1.6528061628341675,
157
+ "eval_runtime": 2.1857,
158
+ "eval_samples_per_second": 0.915,
159
+ "eval_steps_per_second": 0.458,
160
+ "step": 20
161
+ },
162
+ {
163
+ "epoch": 0.25,
164
+ "learning_rate": 0.0001989821441880933,
165
+ "loss": 1.3183,
166
+ "step": 21
167
+ },
168
+ {
169
+ "epoch": 0.26,
170
+ "learning_rate": 0.00019878905881817252,
171
+ "loss": 1.5486,
172
+ "step": 22
173
+ },
174
+ {
175
+ "epoch": 0.27,
176
+ "learning_rate": 0.0001985793250766098,
177
+ "loss": 1.5504,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.29,
182
+ "learning_rate": 0.00019835297830866826,
183
+ "loss": 1.4022,
184
+ "step": 24
185
+ },
186
+ {
187
+ "epoch": 0.3,
188
+ "learning_rate": 0.00019811005665931205,
189
+ "loss": 1.4385,
190
+ "step": 25
191
+ },
192
+ {
193
+ "epoch": 0.3,
194
+ "eval_loss": 1.6313893795013428,
195
+ "eval_runtime": 2.1922,
196
+ "eval_samples_per_second": 0.912,
197
+ "eval_steps_per_second": 0.456,
198
+ "step": 25
199
+ },
200
+ {
201
+ "epoch": 0.31,
202
+ "learning_rate": 0.00019785060106677818,
203
+ "loss": 1.4413,
204
+ "step": 26
205
+ },
206
+ {
207
+ "epoch": 0.32,
208
+ "learning_rate": 0.0001975746552556772,
209
+ "loss": 1.2569,
210
+ "step": 27
211
+ },
212
+ {
213
+ "epoch": 0.33,
214
+ "learning_rate": 0.00019728226572962473,
215
+ "loss": 1.4904,
216
+ "step": 28
217
+ },
218
+ {
219
+ "epoch": 0.35,
220
+ "learning_rate": 0.0001969734817634044,
221
+ "loss": 1.5558,
222
+ "step": 29
223
+ },
224
+ {
225
+ "epoch": 0.36,
226
+ "learning_rate": 0.0001966483553946637,
227
+ "loss": 1.2282,
228
+ "step": 30
229
+ },
230
+ {
231
+ "epoch": 0.36,
232
+ "eval_loss": 1.6210927963256836,
233
+ "eval_runtime": 2.1889,
234
+ "eval_samples_per_second": 0.914,
235
+ "eval_steps_per_second": 0.457,
236
+ "step": 30
237
+ },
238
+ {
239
+ "epoch": 0.37,
240
+ "learning_rate": 0.00019630694141514464,
241
+ "loss": 1.4598,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 0.38,
246
+ "learning_rate": 0.00019594929736144976,
247
+ "loss": 1.48,
248
+ "step": 32
249
+ },
250
+ {
251
+ "epoch": 0.39,
252
+ "learning_rate": 0.0001955754835053459,
253
+ "loss": 1.3934,
254
+ "step": 33
255
+ },
256
+ {
257
+ "epoch": 0.4,
258
+ "learning_rate": 0.00019518556284360696,
259
+ "loss": 1.1312,
260
+ "step": 34
261
+ },
262
+ {
263
+ "epoch": 0.42,
264
+ "learning_rate": 0.0001947796010873974,
265
+ "loss": 1.6493,
266
+ "step": 35
267
+ },
268
+ {
269
+ "epoch": 0.42,
270
+ "eval_loss": 1.6185638904571533,
271
+ "eval_runtime": 2.1911,
272
+ "eval_samples_per_second": 0.913,
273
+ "eval_steps_per_second": 0.456,
274
+ "step": 35
275
+ },
276
+ {
277
+ "epoch": 0.43,
278
+ "learning_rate": 0.0001943576666511982,
279
+ "loss": 1.587,
280
+ "step": 36
281
+ },
282
+ {
283
+ "epoch": 0.44,
284
+ "learning_rate": 0.0001939198306412775,
285
+ "loss": 1.5798,
286
+ "step": 37
287
+ },
288
+ {
289
+ "epoch": 0.45,
290
+ "learning_rate": 0.0001934661668437073,
291
+ "loss": 1.4308,
292
+ "step": 38
293
+ },
294
+ {
295
+ "epoch": 0.46,
296
+ "learning_rate": 0.0001929967517119289,
297
+ "loss": 1.0766,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 0.48,
302
+ "learning_rate": 0.0001925116643538684,
303
+ "loss": 2.082,
304
+ "step": 40
305
+ },
306
+ {
307
+ "epoch": 0.48,
308
+ "eval_loss": 1.633681297302246,
309
+ "eval_runtime": 2.1862,
310
+ "eval_samples_per_second": 0.915,
311
+ "eval_steps_per_second": 0.457,
312
+ "step": 40
313
+ },
314
+ {
315
+ "epoch": 0.49,
316
+ "learning_rate": 0.0001920109865186052,
317
+ "loss": 1.8061,
318
+ "step": 41
319
+ },
320
+ {
321
+ "epoch": 0.5,
322
+ "learning_rate": 0.00019149480258259533,
323
+ "loss": 1.4312,
324
+ "step": 42
325
+ },
326
+ {
327
+ "epoch": 0.51,
328
+ "learning_rate": 0.00019096319953545185,
329
+ "loss": 1.737,
330
+ "step": 43
331
+ },
332
+ {
333
+ "epoch": 0.52,
334
+ "learning_rate": 0.00019041626696528503,
335
+ "loss": 1.5035,
336
+ "step": 44
337
+ },
338
+ {
339
+ "epoch": 0.54,
340
+ "learning_rate": 0.00018985409704360456,
341
+ "loss": 1.4689,
342
+ "step": 45
343
+ },
344
+ {
345
+ "epoch": 0.54,
346
+ "eval_loss": 1.6150808334350586,
347
+ "eval_runtime": 2.1952,
348
+ "eval_samples_per_second": 0.911,
349
+ "eval_steps_per_second": 0.456,
350
+ "step": 45
351
+ },
352
+ {
353
+ "epoch": 0.55,
354
+ "learning_rate": 0.0001892767845097864,
355
+ "loss": 1.2483,
356
+ "step": 46
357
+ },
358
+ {
359
+ "epoch": 0.56,
360
+ "learning_rate": 0.00018868442665510678,
361
+ "loss": 1.1436,
362
+ "step": 47
363
+ },
364
+ {
365
+ "epoch": 0.57,
366
+ "learning_rate": 0.00018807712330634642,
367
+ "loss": 1.0488,
368
+ "step": 48
369
+ },
370
+ {
371
+ "epoch": 0.58,
372
+ "learning_rate": 0.00018745497680896722,
373
+ "loss": 1.3745,
374
+ "step": 49
375
+ },
376
+ {
377
+ "epoch": 0.6,
378
+ "learning_rate": 0.0001868180920098644,
379
+ "loss": 0.9061,
380
+ "step": 50
381
+ },
382
+ {
383
+ "epoch": 0.6,
384
+ "eval_loss": 1.6097811460494995,
385
+ "eval_runtime": 2.1875,
386
+ "eval_samples_per_second": 0.914,
387
+ "eval_steps_per_second": 0.457,
388
+ "step": 50
389
+ },
390
+ {
391
+ "epoch": 0.61,
392
+ "learning_rate": 0.0001861665762396974,
393
+ "loss": 1.1305,
394
+ "step": 51
395
+ },
396
+ {
397
+ "epoch": 0.62,
398
+ "learning_rate": 0.00018550053929480202,
399
+ "loss": 1.2315,
400
+ "step": 52
401
+ },
402
+ {
403
+ "epoch": 0.63,
404
+ "learning_rate": 0.00018482009341868697,
405
+ "loss": 1.4964,
406
+ "step": 53
407
+ },
408
+ {
409
+ "epoch": 0.64,
410
+ "learning_rate": 0.00018412535328311814,
411
+ "loss": 1.0928,
412
+ "step": 54
413
+ },
414
+ {
415
+ "epoch": 0.65,
416
+ "learning_rate": 0.00018341643596879367,
417
+ "loss": 0.9473,
418
+ "step": 55
419
+ },
420
+ {
421
+ "epoch": 0.65,
422
+ "eval_loss": 1.6084190607070923,
423
+ "eval_runtime": 2.1902,
424
+ "eval_samples_per_second": 0.913,
425
+ "eval_steps_per_second": 0.457,
426
+ "step": 55
427
+ },
428
+ {
429
+ "epoch": 0.67,
430
+ "learning_rate": 0.0001826934609456129,
431
+ "loss": 1.362,
432
+ "step": 56
433
+ },
434
+ {
435
+ "epoch": 0.68,
436
+ "learning_rate": 0.00018195655005254273,
437
+ "loss": 1.5478,
438
+ "step": 57
439
+ },
440
+ {
441
+ "epoch": 0.69,
442
+ "learning_rate": 0.00018120582747708502,
443
+ "loss": 1.4831,
444
+ "step": 58
445
+ },
446
+ {
447
+ "epoch": 0.7,
448
+ "learning_rate": 0.00018044141973434758,
449
+ "loss": 1.7483,
450
+ "step": 59
451
+ },
452
+ {
453
+ "epoch": 0.71,
454
+ "learning_rate": 0.0001796634556457236,
455
+ "loss": 1.4993,
456
+ "step": 60
457
+ },
458
+ {
459
+ "epoch": 0.71,
460
+ "eval_loss": 1.6235202550888062,
461
+ "eval_runtime": 2.1859,
462
+ "eval_samples_per_second": 0.915,
463
+ "eval_steps_per_second": 0.457,
464
+ "step": 60
465
+ },
466
+ {
467
+ "epoch": 0.73,
468
+ "learning_rate": 0.00017887206631718203,
469
+ "loss": 1.5076,
470
+ "step": 61
471
+ },
472
+ {
473
+ "epoch": 0.74,
474
+ "learning_rate": 0.0001780673851171728,
475
+ "loss": 1.6395,
476
+ "step": 62
477
+ },
478
+ {
479
+ "epoch": 0.75,
480
+ "learning_rate": 0.00017724954765415137,
481
+ "loss": 1.6389,
482
+ "step": 63
483
+ },
484
+ {
485
+ "epoch": 0.76,
486
+ "learning_rate": 0.00017641869175372493,
487
+ "loss": 1.7769,
488
+ "step": 64
489
+ },
490
+ {
491
+ "epoch": 0.77,
492
+ "learning_rate": 0.00017557495743542585,
493
+ "loss": 1.2022,
494
+ "step": 65
495
+ },
496
+ {
497
+ "epoch": 0.77,
498
+ "eval_loss": 1.6078369617462158,
499
+ "eval_runtime": 2.1883,
500
+ "eval_samples_per_second": 0.914,
501
+ "eval_steps_per_second": 0.457,
502
+ "step": 65
503
+ },
504
+ {
505
+ "epoch": 0.79,
506
+ "learning_rate": 0.00017471848688911464,
507
+ "loss": 1.5265,
508
+ "step": 66
509
+ },
510
+ {
511
+ "epoch": 0.8,
512
+ "learning_rate": 0.00017384942445101772,
513
+ "loss": 1.4065,
514
+ "step": 67
515
+ },
516
+ {
517
+ "epoch": 0.81,
518
+ "learning_rate": 0.000172967916579403,
519
+ "loss": 1.4326,
520
+ "step": 68
521
+ },
522
+ {
523
+ "epoch": 0.82,
524
+ "learning_rate": 0.00017207411182989832,
525
+ "loss": 1.571,
526
+ "step": 69
527
+ },
528
+ {
529
+ "epoch": 0.83,
530
+ "learning_rate": 0.00017116816083045602,
531
+ "loss": 1.5233,
532
+ "step": 70
533
+ },
534
+ {
535
+ "epoch": 0.83,
536
+ "eval_loss": 1.6035966873168945,
537
+ "eval_runtime": 2.1894,
538
+ "eval_samples_per_second": 0.914,
539
+ "eval_steps_per_second": 0.457,
540
+ "step": 70
541
+ },
542
+ {
543
+ "epoch": 0.85,
544
+ "learning_rate": 0.00017025021625596853,
545
+ "loss": 1.5745,
546
+ "step": 71
547
+ },
548
+ {
549
+ "epoch": 0.86,
550
+ "learning_rate": 0.0001693204328025389,
551
+ "loss": 1.608,
552
+ "step": 72
553
+ },
554
+ {
555
+ "epoch": 0.87,
556
+ "learning_rate": 0.0001683789671614107,
557
+ "loss": 1.4234,
558
+ "step": 73
559
+ },
560
+ {
561
+ "epoch": 0.88,
562
+ "learning_rate": 0.00016742597799256182,
563
+ "loss": 1.2839,
564
+ "step": 74
565
+ },
566
+ {
567
+ "epoch": 0.89,
568
+ "learning_rate": 0.00016646162589796615,
569
+ "loss": 1.3248,
570
+ "step": 75
571
+ },
572
+ {
573
+ "epoch": 0.89,
574
+ "eval_loss": 1.6052225828170776,
575
+ "eval_runtime": 2.1914,
576
+ "eval_samples_per_second": 0.913,
577
+ "eval_steps_per_second": 0.456,
578
+ "step": 75
579
+ },
580
+ {
581
+ "epoch": 0.9,
582
+ "learning_rate": 0.00016548607339452853,
583
+ "loss": 1.0683,
584
+ "step": 76
585
+ },
586
+ {
587
+ "epoch": 0.92,
588
+ "learning_rate": 0.00016449948488669639,
589
+ "loss": 1.5298,
590
+ "step": 77
591
+ },
592
+ {
593
+ "epoch": 0.93,
594
+ "learning_rate": 0.00016350202663875386,
595
+ "loss": 1.5696,
596
+ "step": 78
597
+ },
598
+ {
599
+ "epoch": 0.94,
600
+ "learning_rate": 0.00016249386674680184,
601
+ "loss": 1.1743,
602
+ "step": 79
603
+ },
604
+ {
605
+ "epoch": 0.95,
606
+ "learning_rate": 0.0001614751751104301,
607
+ "loss": 1.8626,
608
+ "step": 80
609
+ },
610
+ {
611
+ "epoch": 0.95,
612
+ "eval_loss": 1.6131467819213867,
613
+ "eval_runtime": 2.1907,
614
+ "eval_samples_per_second": 0.913,
615
+ "eval_steps_per_second": 0.456,
616
+ "step": 80
617
+ },
618
+ {
619
+ "epoch": 0.96,
620
+ "learning_rate": 0.00016044612340408466,
621
+ "loss": 1.4832,
622
+ "step": 81
623
+ },
624
+ {
625
+ "epoch": 0.98,
626
+ "learning_rate": 0.00015940688504813662,
627
+ "loss": 1.4476,
628
+ "step": 82
629
+ },
630
+ {
631
+ "epoch": 0.99,
632
+ "learning_rate": 0.00015835763517965673,
633
+ "loss": 1.3783,
634
+ "step": 83
635
+ },
636
+ {
637
+ "epoch": 1.0,
638
+ "learning_rate": 0.00015729855062290022,
639
+ "loss": 1.6671,
640
+ "step": 84
641
+ },
642
+ {
643
+ "epoch": 1.01,
644
+ "learning_rate": 0.0001562298098595078,
645
+ "loss": 1.4658,
646
+ "step": 85
647
+ },
648
+ {
649
+ "epoch": 1.01,
650
+ "eval_loss": 1.6062182188034058,
651
+ "eval_runtime": 2.1944,
652
+ "eval_samples_per_second": 0.911,
653
+ "eval_steps_per_second": 0.456,
654
+ "step": 85
655
+ },
656
+ {
657
+ "epoch": 1.02,
658
+ "learning_rate": 0.00015515159299842707,
659
+ "loss": 1.64,
660
+ "step": 86
661
+ },
662
+ {
663
+ "epoch": 1.04,
664
+ "learning_rate": 0.00015406408174555976,
665
+ "loss": 1.2125,
666
+ "step": 87
667
+ },
668
+ {
669
+ "epoch": 1.05,
670
+ "learning_rate": 0.00015296745937313987,
671
+ "loss": 1.5001,
672
+ "step": 88
673
+ },
674
+ {
675
+ "epoch": 1.06,
676
+ "learning_rate": 0.00015186191068884775,
677
+ "loss": 1.4294,
678
+ "step": 89
679
+ },
680
+ {
681
+ "epoch": 1.07,
682
+ "learning_rate": 0.00015074762200466556,
683
+ "loss": 1.3162,
684
+ "step": 90
685
+ },
686
+ {
687
+ "epoch": 1.07,
688
+ "eval_loss": 1.5980761051177979,
689
+ "eval_runtime": 2.191,
690
+ "eval_samples_per_second": 0.913,
691
+ "eval_steps_per_second": 0.456,
692
+ "step": 90
693
+ },
694
+ {
695
+ "epoch": 1.08,
696
+ "learning_rate": 0.00014962478110547918,
697
+ "loss": 1.3707,
698
+ "step": 91
699
+ },
700
+ {
701
+ "epoch": 1.1,
702
+ "learning_rate": 0.00014849357721743168,
703
+ "loss": 1.4644,
704
+ "step": 92
705
+ },
706
+ {
707
+ "epoch": 1.11,
708
+ "learning_rate": 0.0001473542009760343,
709
+ "loss": 1.427,
710
+ "step": 93
711
+ },
712
+ {
713
+ "epoch": 1.12,
714
+ "learning_rate": 0.00014620684439403962,
715
+ "loss": 1.3337,
716
+ "step": 94
717
+ },
718
+ {
719
+ "epoch": 1.13,
720
+ "learning_rate": 0.0001450517008290827,
721
+ "loss": 1.4111,
722
+ "step": 95
723
+ },
724
+ {
725
+ "epoch": 1.13,
726
+ "eval_loss": 1.5972555875778198,
727
+ "eval_runtime": 2.1901,
728
+ "eval_samples_per_second": 0.913,
729
+ "eval_steps_per_second": 0.457,
730
+ "step": 95
731
+ },
732
+ {
733
+ "epoch": 1.14,
734
+ "learning_rate": 0.0001438889649510956,
735
+ "loss": 1.441,
736
+ "step": 96
737
+ },
738
+ {
739
+ "epoch": 1.15,
740
+ "learning_rate": 0.00014271883270950073,
741
+ "loss": 0.7424,
742
+ "step": 97
743
+ },
744
+ {
745
+ "epoch": 1.17,
746
+ "learning_rate": 0.00014154150130018866,
747
+ "loss": 1.3582,
748
+ "step": 98
749
+ },
750
+ {
751
+ "epoch": 1.18,
752
+ "learning_rate": 0.00014035716913228568,
753
+ "loss": 1.3479,
754
+ "step": 99
755
+ },
756
+ {
757
+ "epoch": 1.19,
758
+ "learning_rate": 0.00013916603579471705,
759
+ "loss": 1.2211,
760
+ "step": 100
761
+ },
762
+ {
763
+ "epoch": 1.19,
764
+ "eval_loss": 1.6037462949752808,
765
+ "eval_runtime": 2.1932,
766
+ "eval_samples_per_second": 0.912,
767
+ "eval_steps_per_second": 0.456,
768
+ "step": 100
769
+ },
770
+ {
771
+ "epoch": 1.2,
772
+ "learning_rate": 0.0001379683020225714,
773
+ "loss": 2.0387,
774
+ "step": 101
775
+ },
776
+ {
777
+ "epoch": 1.21,
778
+ "learning_rate": 0.000136764169663272,
779
+ "loss": 1.3237,
780
+ "step": 102
781
+ },
782
+ {
783
+ "epoch": 1.23,
784
+ "learning_rate": 0.00013555384164256048,
785
+ "loss": 1.4286,
786
+ "step": 103
787
+ },
788
+ {
789
+ "epoch": 1.24,
790
+ "learning_rate": 0.00013433752193029886,
791
+ "loss": 1.7905,
792
+ "step": 104
793
+ },
794
+ {
795
+ "epoch": 1.25,
796
+ "learning_rate": 0.00013311541550609565,
797
+ "loss": 1.7277,
798
+ "step": 105
799
+ },
800
+ {
801
+ "epoch": 1.25,
802
+ "eval_loss": 1.5989317893981934,
803
+ "eval_runtime": 2.1896,
804
+ "eval_samples_per_second": 0.913,
805
+ "eval_steps_per_second": 0.457,
806
+ "step": 105
807
+ },
808
+ {
809
+ "epoch": 1.26,
810
+ "learning_rate": 0.00013188772832476188,
811
+ "loss": 1.5016,
812
+ "step": 106
813
+ },
814
+ {
815
+ "epoch": 1.27,
816
+ "learning_rate": 0.00013065466728160252,
817
+ "loss": 1.7159,
818
+ "step": 107
819
+ },
820
+ {
821
+ "epoch": 1.29,
822
+ "learning_rate": 0.00012941644017754964,
823
+ "loss": 1.2701,
824
+ "step": 108
825
+ },
826
+ {
827
+ "epoch": 1.3,
828
+ "learning_rate": 0.00012817325568414297,
829
+ "loss": 1.4085,
830
+ "step": 109
831
+ },
832
+ {
833
+ "epoch": 1.31,
834
+ "learning_rate": 0.00012692532330836346,
835
+ "loss": 1.246,
836
+ "step": 110
837
+ },
838
+ {
839
+ "epoch": 1.31,
840
+ "eval_loss": 1.597010850906372,
841
+ "eval_runtime": 2.1881,
842
+ "eval_samples_per_second": 0.914,
843
+ "eval_steps_per_second": 0.457,
844
+ "step": 110
845
+ },
846
+ {
847
+ "epoch": 1.32,
848
+ "learning_rate": 0.00012567285335732633,
849
+ "loss": 1.3382,
850
+ "step": 111
851
+ },
852
+ {
853
+ "epoch": 1.33,
854
+ "learning_rate": 0.00012441605690283915,
855
+ "loss": 0.9305,
856
+ "step": 112
857
+ },
858
+ {
859
+ "epoch": 1.35,
860
+ "learning_rate": 0.00012315514574583113,
861
+ "loss": 1.388,
862
+ "step": 113
863
+ },
864
+ {
865
+ "epoch": 1.36,
866
+ "learning_rate": 0.0001218903323806595,
867
+ "loss": 1.2634,
868
+ "step": 114
869
+ },
870
+ {
871
+ "epoch": 1.37,
872
+ "learning_rate": 0.00012062182995929882,
873
+ "loss": 1.1971,
874
+ "step": 115
875
+ },
876
+ {
877
+ "epoch": 1.37,
878
+ "eval_loss": 1.5930073261260986,
879
+ "eval_runtime": 2.1881,
880
+ "eval_samples_per_second": 0.914,
881
+ "eval_steps_per_second": 0.457,
882
+ "step": 115
883
+ },
884
+ {
885
+ "epoch": 1.38,
886
+ "learning_rate": 0.00011934985225541998,
887
+ "loss": 1.2645,
888
+ "step": 116
889
+ },
890
+ {
891
+ "epoch": 1.39,
892
+ "learning_rate": 0.0001180746136283638,
893
+ "loss": 1.6775,
894
+ "step": 117
895
+ },
896
+ {
897
+ "epoch": 1.4,
898
+ "learning_rate": 0.00011679632898701649,
899
+ "loss": 1.018,
900
+ "step": 118
901
+ },
902
+ {
903
+ "epoch": 1.42,
904
+ "learning_rate": 0.00011551521375359206,
905
+ "loss": 1.225,
906
+ "step": 119
907
+ },
908
+ {
909
+ "epoch": 1.43,
910
+ "learning_rate": 0.00011423148382732853,
911
+ "loss": 1.166,
912
+ "step": 120
913
+ },
914
+ {
915
+ "epoch": 1.43,
916
+ "eval_loss": 1.593321681022644,
917
+ "eval_runtime": 2.1858,
918
+ "eval_samples_per_second": 0.915,
919
+ "eval_steps_per_second": 0.457,
920
+ "step": 120
921
+ },
922
+ {
923
+ "epoch": 1.44,
924
+ "learning_rate": 0.00011294535554810354,
925
+ "loss": 1.7995,
926
+ "step": 121
927
+ },
928
+ {
929
+ "epoch": 1.45,
930
+ "learning_rate": 0.00011165704565997593,
931
+ "loss": 0.7254,
932
+ "step": 122
933
+ },
934
+ {
935
+ "epoch": 1.46,
936
+ "learning_rate": 0.00011036677127465889,
937
+ "loss": 1.4558,
938
+ "step": 123
939
+ },
940
+ {
941
+ "epoch": 1.48,
942
+ "learning_rate": 0.00010907474983493144,
943
+ "loss": 1.5358,
944
+ "step": 124
945
+ },
946
+ {
947
+ "epoch": 1.49,
948
+ "learning_rate": 0.00010778119907799398,
949
+ "loss": 1.5007,
950
+ "step": 125
951
+ },
952
+ {
953
+ "epoch": 1.49,
954
+ "eval_loss": 1.5938643217086792,
955
+ "eval_runtime": 2.189,
956
+ "eval_samples_per_second": 0.914,
957
+ "eval_steps_per_second": 0.457,
958
+ "step": 125
959
+ },
960
+ {
961
+ "epoch": 1.5,
962
+ "learning_rate": 0.0001064863369987743,
963
+ "loss": 1.6357,
964
+ "step": 126
965
+ },
966
+ {
967
+ "epoch": 1.51,
968
+ "learning_rate": 0.00010519038181318999,
969
+ "loss": 1.7524,
970
+ "step": 127
971
+ },
972
+ {
973
+ "epoch": 1.52,
974
+ "learning_rate": 0.00010389355192137377,
975
+ "loss": 1.6955,
976
+ "step": 128
977
+ },
978
+ {
979
+ "epoch": 1.54,
980
+ "learning_rate": 0.00010259606587086783,
981
+ "loss": 1.4174,
982
+ "step": 129
983
+ },
984
+ {
985
+ "epoch": 1.55,
986
+ "learning_rate": 0.0001012981423197931,
987
+ "loss": 1.2135,
988
+ "step": 130
989
+ },
990
+ {
991
+ "epoch": 1.55,
992
+ "eval_loss": 1.5910111665725708,
993
+ "eval_runtime": 2.1873,
994
+ "eval_samples_per_second": 0.914,
995
+ "eval_steps_per_second": 0.457,
996
+ "step": 130
997
+ },
998
+ {
999
+ "epoch": 1.56,
1000
+ "learning_rate": 0.0001,
1001
+ "loss": 1.0919,
1002
+ "step": 131
1003
+ },
1004
+ {
1005
+ "epoch": 1.57,
1006
+ "learning_rate": 9.870185768020693e-05,
1007
+ "loss": 1.4658,
1008
+ "step": 132
1009
+ },
1010
+ {
1011
+ "epoch": 1.58,
1012
+ "learning_rate": 9.740393412913219e-05,
1013
+ "loss": 1.1472,
1014
+ "step": 133
1015
+ },
1016
+ {
1017
+ "epoch": 1.6,
1018
+ "learning_rate": 9.610644807862625e-05,
1019
+ "loss": 1.2626,
1020
+ "step": 134
1021
+ },
1022
+ {
1023
+ "epoch": 1.61,
1024
+ "learning_rate": 9.480961818681004e-05,
1025
+ "loss": 1.3915,
1026
+ "step": 135
1027
+ },
1028
+ {
1029
+ "epoch": 1.61,
1030
+ "eval_loss": 1.5905121564865112,
1031
+ "eval_runtime": 2.1919,
1032
+ "eval_samples_per_second": 0.912,
1033
+ "eval_steps_per_second": 0.456,
1034
+ "step": 135
1035
+ },
1036
+ {
1037
+ "epoch": 1.62,
1038
+ "learning_rate": 9.35136630012257e-05,
1039
+ "loss": 1.8036,
1040
+ "step": 136
1041
+ },
1042
+ {
1043
+ "epoch": 1.63,
1044
+ "learning_rate": 9.221880092200601e-05,
1045
+ "loss": 1.1988,
1046
+ "step": 137
1047
+ },
1048
+ {
1049
+ "epoch": 1.64,
1050
+ "learning_rate": 9.092525016506858e-05,
1051
+ "loss": 1.1454,
1052
+ "step": 138
1053
+ },
1054
+ {
1055
+ "epoch": 1.65,
1056
+ "learning_rate": 8.963322872534114e-05,
1057
+ "loss": 1.3185,
1058
+ "step": 139
1059
+ },
1060
+ {
1061
+ "epoch": 1.67,
1062
+ "learning_rate": 8.83429543400241e-05,
1063
+ "loss": 1.6912,
1064
+ "step": 140
1065
+ },
1066
+ {
1067
+ "epoch": 1.67,
1068
+ "eval_loss": 1.5902668237686157,
1069
+ "eval_runtime": 2.1897,
1070
+ "eval_samples_per_second": 0.913,
1071
+ "eval_steps_per_second": 0.457,
1072
+ "step": 140
1073
+ },
1074
+ {
1075
+ "epoch": 1.68,
1076
+ "learning_rate": 8.705464445189647e-05,
1077
+ "loss": 1.6251,
1078
+ "step": 141
1079
+ },
1080
+ {
1081
+ "epoch": 1.69,
1082
+ "learning_rate": 8.57685161726715e-05,
1083
+ "loss": 1.4459,
1084
+ "step": 142
1085
+ },
1086
+ {
1087
+ "epoch": 1.7,
1088
+ "learning_rate": 8.448478624640797e-05,
1089
+ "loss": 1.3483,
1090
+ "step": 143
1091
+ },
1092
+ {
1093
+ "epoch": 1.71,
1094
+ "learning_rate": 8.320367101298351e-05,
1095
+ "loss": 1.7937,
1096
+ "step": 144
1097
+ },
1098
+ {
1099
+ "epoch": 1.73,
1100
+ "learning_rate": 8.192538637163621e-05,
1101
+ "loss": 1.6808,
1102
+ "step": 145
1103
+ },
1104
+ {
1105
+ "epoch": 1.73,
1106
+ "eval_loss": 1.587677240371704,
1107
+ "eval_runtime": 2.1912,
1108
+ "eval_samples_per_second": 0.913,
1109
+ "eval_steps_per_second": 0.456,
1110
+ "step": 145
1111
+ },
1112
+ {
1113
+ "epoch": 1.74,
1114
+ "learning_rate": 8.065014774458003e-05,
1115
+ "loss": 1.453,
1116
+ "step": 146
1117
+ },
1118
+ {
1119
+ "epoch": 1.75,
1120
+ "learning_rate": 7.93781700407012e-05,
1121
+ "loss": 1.3279,
1122
+ "step": 147
1123
+ },
1124
+ {
1125
+ "epoch": 1.76,
1126
+ "learning_rate": 7.810966761934053e-05,
1127
+ "loss": 1.6721,
1128
+ "step": 148
1129
+ },
1130
+ {
1131
+ "epoch": 1.77,
1132
+ "learning_rate": 7.684485425416888e-05,
1133
+ "loss": 1.1307,
1134
+ "step": 149
1135
+ },
1136
+ {
1137
+ "epoch": 1.79,
1138
+ "learning_rate": 7.558394309716088e-05,
1139
+ "loss": 1.249,
1140
+ "step": 150
1141
+ },
1142
+ {
1143
+ "epoch": 1.79,
1144
+ "eval_loss": 1.5859589576721191,
1145
+ "eval_runtime": 2.1868,
1146
+ "eval_samples_per_second": 0.915,
1147
+ "eval_steps_per_second": 0.457,
1148
+ "step": 150
1149
+ },
1150
+ {
1151
+ "epoch": 1.8,
1152
+ "learning_rate": 7.432714664267373e-05,
1153
+ "loss": 1.1872,
1154
+ "step": 151
1155
+ },
1156
+ {
1157
+ "epoch": 1.81,
1158
+ "learning_rate": 7.307467669163655e-05,
1159
+ "loss": 1.4116,
1160
+ "step": 152
1161
+ },
1162
+ {
1163
+ "epoch": 1.82,
1164
+ "learning_rate": 7.182674431585704e-05,
1165
+ "loss": 1.2309,
1166
+ "step": 153
1167
+ },
1168
+ {
1169
+ "epoch": 1.83,
1170
+ "learning_rate": 7.058355982245037e-05,
1171
+ "loss": 1.3953,
1172
+ "step": 154
1173
+ },
1174
+ {
1175
+ "epoch": 1.85,
1176
+ "learning_rate": 6.934533271839752e-05,
1177
+ "loss": 1.43,
1178
+ "step": 155
1179
+ },
1180
+ {
1181
+ "epoch": 1.85,
1182
+ "eval_loss": 1.5868343114852905,
1183
+ "eval_runtime": 2.1915,
1184
+ "eval_samples_per_second": 0.913,
1185
+ "eval_steps_per_second": 0.456,
1186
+ "step": 155
1187
+ },
1188
+ {
1189
+ "epoch": 1.86,
1190
+ "learning_rate": 6.811227167523815e-05,
1191
+ "loss": 1.9049,
1192
+ "step": 156
1193
+ },
1194
+ {
1195
+ "epoch": 1.87,
1196
+ "learning_rate": 6.688458449390437e-05,
1197
+ "loss": 0.8853,
1198
+ "step": 157
1199
+ },
1200
+ {
1201
+ "epoch": 1.88,
1202
+ "learning_rate": 6.566247806970119e-05,
1203
+ "loss": 1.6253,
1204
+ "step": 158
1205
+ },
1206
+ {
1207
+ "epoch": 1.89,
1208
+ "learning_rate": 6.444615835743955e-05,
1209
+ "loss": 1.3031,
1210
+ "step": 159
1211
+ },
1212
+ {
1213
+ "epoch": 1.9,
1214
+ "learning_rate": 6.323583033672799e-05,
1215
+ "loss": 0.8793,
1216
+ "step": 160
1217
+ },
1218
+ {
1219
+ "epoch": 1.9,
1220
+ "eval_loss": 1.5895787477493286,
1221
+ "eval_runtime": 2.1923,
1222
+ "eval_samples_per_second": 0.912,
1223
+ "eval_steps_per_second": 0.456,
1224
+ "step": 160
1225
+ },
1226
+ {
1227
+ "epoch": 1.92,
1228
+ "learning_rate": 6.203169797742861e-05,
1229
+ "loss": 1.3793,
1230
+ "step": 161
1231
+ },
1232
+ {
1233
+ "epoch": 1.93,
1234
+ "learning_rate": 6.083396420528298e-05,
1235
+ "loss": 1.5299,
1236
+ "step": 162
1237
+ },
1238
+ {
1239
+ "epoch": 1.94,
1240
+ "learning_rate": 5.964283086771435e-05,
1241
+ "loss": 1.3525,
1242
+ "step": 163
1243
+ },
1244
+ {
1245
+ "epoch": 1.95,
1246
+ "learning_rate": 5.845849869981137e-05,
1247
+ "loss": 1.4941,
1248
+ "step": 164
1249
+ },
1250
+ {
1251
+ "epoch": 1.96,
1252
+ "learning_rate": 5.728116729049928e-05,
1253
+ "loss": 1.1564,
1254
+ "step": 165
1255
+ },
1256
+ {
1257
+ "epoch": 1.96,
1258
+ "eval_loss": 1.5867228507995605,
1259
+ "eval_runtime": 2.1914,
1260
+ "eval_samples_per_second": 0.913,
1261
+ "eval_steps_per_second": 0.456,
1262
+ "step": 165
1263
+ },
1264
+ {
1265
+ "epoch": 1.98,
1266
+ "learning_rate": 5.611103504890444e-05,
1267
+ "loss": 1.5568,
1268
+ "step": 166
1269
+ },
1270
+ {
1271
+ "epoch": 1.99,
1272
+ "learning_rate": 5.4948299170917325e-05,
1273
+ "loss": 1.2441,
1274
+ "step": 167
1275
+ },
1276
+ {
1277
+ "epoch": 2.0,
1278
+ "learning_rate": 5.379315560596038e-05,
1279
+ "loss": 0.9717,
1280
+ "step": 168
1281
+ },
1282
+ {
1283
+ "epoch": 2.01,
1284
+ "learning_rate": 5.26457990239657e-05,
1285
+ "loss": 1.5905,
1286
+ "step": 169
1287
+ },
1288
+ {
1289
+ "epoch": 2.02,
1290
+ "learning_rate": 5.1506422782568345e-05,
1291
+ "loss": 1.4259,
1292
+ "step": 170
1293
+ },
1294
+ {
1295
+ "epoch": 2.02,
1296
+ "eval_loss": 1.5872297286987305,
1297
+ "eval_runtime": 2.1903,
1298
+ "eval_samples_per_second": 0.913,
1299
+ "eval_steps_per_second": 0.457,
1300
+ "step": 170
1301
+ },
1302
+ {
1303
+ "epoch": 2.04,
1304
+ "learning_rate": 5.0375218894520834e-05,
1305
+ "loss": 1.4877,
1306
+ "step": 171
1307
+ },
1308
+ {
1309
+ "epoch": 2.05,
1310
+ "learning_rate": 4.9252377995334444e-05,
1311
+ "loss": 1.4578,
1312
+ "step": 172
1313
+ },
1314
+ {
1315
+ "epoch": 2.06,
1316
+ "learning_rate": 4.813808931115228e-05,
1317
+ "loss": 1.0967,
1318
+ "step": 173
1319
+ },
1320
+ {
1321
+ "epoch": 2.07,
1322
+ "learning_rate": 4.703254062686017e-05,
1323
+ "loss": 1.3642,
1324
+ "step": 174
1325
+ },
1326
+ {
1327
+ "epoch": 2.08,
1328
+ "learning_rate": 4.593591825444028e-05,
1329
+ "loss": 1.4059,
1330
+ "step": 175
1331
+ },
1332
+ {
1333
+ "epoch": 2.08,
1334
+ "eval_loss": 1.5853421688079834,
1335
+ "eval_runtime": 2.1875,
1336
+ "eval_samples_per_second": 0.914,
1337
+ "eval_steps_per_second": 0.457,
1338
+ "step": 175
1339
+ },
1340
+ {
1341
+ "epoch": 2.1,
1342
+ "learning_rate": 4.484840700157295e-05,
1343
+ "loss": 1.3578,
1344
+ "step": 176
1345
+ },
1346
+ {
1347
+ "epoch": 2.11,
1348
+ "learning_rate": 4.377019014049223e-05,
1349
+ "loss": 1.178,
1350
+ "step": 177
1351
+ },
1352
+ {
1353
+ "epoch": 2.12,
1354
+ "learning_rate": 4.270144937709981e-05,
1355
+ "loss": 1.6276,
1356
+ "step": 178
1357
+ },
1358
+ {
1359
+ "epoch": 2.13,
1360
+ "learning_rate": 4.164236482034327e-05,
1361
+ "loss": 1.5173,
1362
+ "step": 179
1363
+ },
1364
+ {
1365
+ "epoch": 2.14,
1366
+ "learning_rate": 4.059311495186338e-05,
1367
+ "loss": 1.3487,
1368
+ "step": 180
1369
+ },
1370
+ {
1371
+ "epoch": 2.14,
1372
+ "eval_loss": 1.5867466926574707,
1373
+ "eval_runtime": 2.1894,
1374
+ "eval_samples_per_second": 0.913,
1375
+ "eval_steps_per_second": 0.457,
1376
+ "step": 180
1377
+ },
1378
+ {
1379
+ "epoch": 2.15,
1380
+ "learning_rate": 3.9553876595915375e-05,
1381
+ "loss": 1.0898,
1382
+ "step": 181
1383
+ },
1384
+ {
1385
+ "epoch": 2.17,
1386
+ "learning_rate": 3.852482488956992e-05,
1387
+ "loss": 0.8375,
1388
+ "step": 182
1389
+ },
1390
+ {
1391
+ "epoch": 2.18,
1392
+ "learning_rate": 3.750613325319817e-05,
1393
+ "loss": 1.1532,
1394
+ "step": 183
1395
+ },
1396
+ {
1397
+ "epoch": 2.19,
1398
+ "learning_rate": 3.649797336124615e-05,
1399
+ "loss": 1.603,
1400
+ "step": 184
1401
+ },
1402
+ {
1403
+ "epoch": 2.2,
1404
+ "learning_rate": 3.550051511330361e-05,
1405
+ "loss": 1.7306,
1406
+ "step": 185
1407
+ },
1408
+ {
1409
+ "epoch": 2.2,
1410
+ "eval_loss": 1.5883917808532715,
1411
+ "eval_runtime": 2.189,
1412
+ "eval_samples_per_second": 0.914,
1413
+ "eval_steps_per_second": 0.457,
1414
+ "step": 185
1415
+ },
1416
+ {
1417
+ "epoch": 2.21,
1418
+ "learning_rate": 3.45139266054715e-05,
1419
+ "loss": 1.4042,
1420
+ "step": 186
1421
+ },
1422
+ {
1423
+ "epoch": 2.23,
1424
+ "learning_rate": 3.3538374102033866e-05,
1425
+ "loss": 1.1013,
1426
+ "step": 187
1427
+ },
1428
+ {
1429
+ "epoch": 2.24,
1430
+ "learning_rate": 3.257402200743821e-05,
1431
+ "loss": 1.1465,
1432
+ "step": 188
1433
+ },
1434
+ {
1435
+ "epoch": 2.25,
1436
+ "learning_rate": 3.1621032838589305e-05,
1437
+ "loss": 1.7603,
1438
+ "step": 189
1439
+ },
1440
+ {
1441
+ "epoch": 2.26,
1442
+ "learning_rate": 3.0679567197461134e-05,
1443
+ "loss": 1.6117,
1444
+ "step": 190
1445
+ },
1446
+ {
1447
+ "epoch": 2.26,
1448
+ "eval_loss": 1.5858465433120728,
1449
+ "eval_runtime": 2.1875,
1450
+ "eval_samples_per_second": 0.914,
1451
+ "eval_steps_per_second": 0.457,
1452
+ "step": 190
1453
+ },
1454
+ {
1455
+ "epoch": 2.27,
1456
+ "learning_rate": 2.974978374403147e-05,
1457
+ "loss": 1.4448,
1458
+ "step": 191
1459
+ },
1460
+ {
1461
+ "epoch": 2.29,
1462
+ "learning_rate": 2.8831839169543996e-05,
1463
+ "loss": 1.2446,
1464
+ "step": 192
1465
+ },
1466
+ {
1467
+ "epoch": 2.3,
1468
+ "learning_rate": 2.7925888170101665e-05,
1469
+ "loss": 1.2843,
1470
+ "step": 193
1471
+ },
1472
+ {
1473
+ "epoch": 2.31,
1474
+ "learning_rate": 2.7032083420597e-05,
1475
+ "loss": 0.9528,
1476
+ "step": 194
1477
+ },
1478
+ {
1479
+ "epoch": 2.32,
1480
+ "learning_rate": 2.6150575548982292e-05,
1481
+ "loss": 1.1751,
1482
+ "step": 195
1483
+ },
1484
+ {
1485
+ "epoch": 2.32,
1486
+ "eval_loss": 1.5852700471878052,
1487
+ "eval_runtime": 2.1934,
1488
+ "eval_samples_per_second": 0.912,
1489
+ "eval_steps_per_second": 0.456,
1490
+ "step": 195
1491
+ },
1492
+ {
1493
+ "epoch": 2.33,
1494
+ "learning_rate": 2.528151311088537e-05,
1495
+ "loss": 1.2334,
1496
+ "step": 196
1497
+ },
1498
+ {
1499
+ "epoch": 2.35,
1500
+ "learning_rate": 2.4425042564574184e-05,
1501
+ "loss": 1.4127,
1502
+ "step": 197
1503
+ },
1504
+ {
1505
+ "epoch": 2.36,
1506
+ "learning_rate": 2.3581308246275103e-05,
1507
+ "loss": 1.1989,
1508
+ "step": 198
1509
+ },
1510
+ {
1511
+ "epoch": 2.37,
1512
+ "learning_rate": 2.2750452345848682e-05,
1513
+ "loss": 1.0506,
1514
+ "step": 199
1515
+ },
1516
+ {
1517
+ "epoch": 2.38,
1518
+ "learning_rate": 2.1932614882827197e-05,
1519
+ "loss": 1.5642,
1520
+ "step": 200
1521
+ },
1522
+ {
1523
+ "epoch": 2.38,
1524
+ "eval_loss": 1.5846655368804932,
1525
+ "eval_runtime": 2.1905,
1526
+ "eval_samples_per_second": 0.913,
1527
+ "eval_steps_per_second": 0.457,
1528
+ "step": 200
1529
+ }
1530
+ ],
1531
+ "max_steps": 252,
1532
+ "num_train_epochs": 3,
1533
+ "total_flos": 1.0096652091466526e+17,
1534
+ "trial_name": null,
1535
+ "trial_params": null
1536
+ }
checkpoint-200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6262ba37ace7774e5d22c32c9b42a5166ed6929715b00a62cbc99ddcea368d8
3
+ size 3899
checkpoint-220/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-220/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "EleutherAI/gpt-j-6b",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-220/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b4154da9f6b0dc38139e13e9d3ad6f2b8ca61c8534670423f1299eabbf04716
3
+ size 11069613
checkpoint-220/adapter_model/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-220/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "EleutherAI/gpt-j-6b",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-220/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b4154da9f6b0dc38139e13e9d3ad6f2b8ca61c8534670423f1299eabbf04716
3
+ size 11069613
checkpoint-220/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c70af14440579e8bbcf118a99a41243f3c736d94c77e4504969db0035d9a20f
3
+ size 2852293
checkpoint-220/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0549ffe3f8f11d23149dbabc9ee30eae5ffe34d8592388db70037341ac909988
3
+ size 14575
checkpoint-220/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bb65ceb68bd75978ba6d8455e4487b6cbe0ce6cfdb390e21273259762baa179
3
+ size 627
checkpoint-220/trainer_state.json ADDED
@@ -0,0 +1,1688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.619047619047619,
5
+ "global_step": 220,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 2e-05,
13
+ "loss": 1.6335,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 4e-05,
19
+ "loss": 1.5176,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.04,
24
+ "learning_rate": 6e-05,
25
+ "loss": 1.4883,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.05,
30
+ "learning_rate": 8e-05,
31
+ "loss": 1.6,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.06,
36
+ "learning_rate": 0.0001,
37
+ "loss": 1.5088,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.06,
42
+ "eval_loss": 1.7048434019088745,
43
+ "eval_runtime": 2.1875,
44
+ "eval_samples_per_second": 0.914,
45
+ "eval_steps_per_second": 0.457,
46
+ "step": 5
47
+ },
48
+ {
49
+ "epoch": 0.07,
50
+ "learning_rate": 0.00012,
51
+ "loss": 1.4985,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.08,
56
+ "learning_rate": 0.00014,
57
+ "loss": 1.4626,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.1,
62
+ "learning_rate": 0.00016,
63
+ "loss": 1.3285,
64
+ "step": 8
65
+ },
66
+ {
67
+ "epoch": 0.11,
68
+ "learning_rate": 0.00018,
69
+ "loss": 1.6476,
70
+ "step": 9
71
+ },
72
+ {
73
+ "epoch": 0.12,
74
+ "learning_rate": 0.0002,
75
+ "loss": 1.5266,
76
+ "step": 10
77
+ },
78
+ {
79
+ "epoch": 0.12,
80
+ "eval_loss": 1.692796230316162,
81
+ "eval_runtime": 2.1867,
82
+ "eval_samples_per_second": 0.915,
83
+ "eval_steps_per_second": 0.457,
84
+ "step": 10
85
+ },
86
+ {
87
+ "epoch": 0.13,
88
+ "learning_rate": 0.0001999915737775817,
89
+ "loss": 1.6152,
90
+ "step": 11
91
+ },
92
+ {
93
+ "epoch": 0.14,
94
+ "learning_rate": 0.00019996629653035126,
95
+ "loss": 1.505,
96
+ "step": 12
97
+ },
98
+ {
99
+ "epoch": 0.15,
100
+ "learning_rate": 0.00019992417251814282,
101
+ "loss": 1.3107,
102
+ "step": 13
103
+ },
104
+ {
105
+ "epoch": 0.17,
106
+ "learning_rate": 0.00019986520883988232,
107
+ "loss": 1.3979,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.18,
112
+ "learning_rate": 0.0001997894154323911,
113
+ "loss": 1.2276,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.18,
118
+ "eval_loss": 1.6662951707839966,
119
+ "eval_runtime": 2.186,
120
+ "eval_samples_per_second": 0.915,
121
+ "eval_steps_per_second": 0.457,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.19,
126
+ "learning_rate": 0.00019969680506871137,
127
+ "loss": 1.7369,
128
+ "step": 16
129
+ },
130
+ {
131
+ "epoch": 0.2,
132
+ "learning_rate": 0.0001995873933559535,
133
+ "loss": 1.6659,
134
+ "step": 17
135
+ },
136
+ {
137
+ "epoch": 0.21,
138
+ "learning_rate": 0.00019946119873266613,
139
+ "loss": 1.1324,
140
+ "step": 18
141
+ },
142
+ {
143
+ "epoch": 0.23,
144
+ "learning_rate": 0.0001993182424657285,
145
+ "loss": 1.9695,
146
+ "step": 19
147
+ },
148
+ {
149
+ "epoch": 0.24,
150
+ "learning_rate": 0.00019915854864676664,
151
+ "loss": 2.5525,
152
+ "step": 20
153
+ },
154
+ {
155
+ "epoch": 0.24,
156
+ "eval_loss": 1.6528061628341675,
157
+ "eval_runtime": 2.1857,
158
+ "eval_samples_per_second": 0.915,
159
+ "eval_steps_per_second": 0.458,
160
+ "step": 20
161
+ },
162
+ {
163
+ "epoch": 0.25,
164
+ "learning_rate": 0.0001989821441880933,
165
+ "loss": 1.3183,
166
+ "step": 21
167
+ },
168
+ {
169
+ "epoch": 0.26,
170
+ "learning_rate": 0.00019878905881817252,
171
+ "loss": 1.5486,
172
+ "step": 22
173
+ },
174
+ {
175
+ "epoch": 0.27,
176
+ "learning_rate": 0.0001985793250766098,
177
+ "loss": 1.5504,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.29,
182
+ "learning_rate": 0.00019835297830866826,
183
+ "loss": 1.4022,
184
+ "step": 24
185
+ },
186
+ {
187
+ "epoch": 0.3,
188
+ "learning_rate": 0.00019811005665931205,
189
+ "loss": 1.4385,
190
+ "step": 25
191
+ },
192
+ {
193
+ "epoch": 0.3,
194
+ "eval_loss": 1.6313893795013428,
195
+ "eval_runtime": 2.1922,
196
+ "eval_samples_per_second": 0.912,
197
+ "eval_steps_per_second": 0.456,
198
+ "step": 25
199
+ },
200
+ {
201
+ "epoch": 0.31,
202
+ "learning_rate": 0.00019785060106677818,
203
+ "loss": 1.4413,
204
+ "step": 26
205
+ },
206
+ {
207
+ "epoch": 0.32,
208
+ "learning_rate": 0.0001975746552556772,
209
+ "loss": 1.2569,
210
+ "step": 27
211
+ },
212
+ {
213
+ "epoch": 0.33,
214
+ "learning_rate": 0.00019728226572962473,
215
+ "loss": 1.4904,
216
+ "step": 28
217
+ },
218
+ {
219
+ "epoch": 0.35,
220
+ "learning_rate": 0.0001969734817634044,
221
+ "loss": 1.5558,
222
+ "step": 29
223
+ },
224
+ {
225
+ "epoch": 0.36,
226
+ "learning_rate": 0.0001966483553946637,
227
+ "loss": 1.2282,
228
+ "step": 30
229
+ },
230
+ {
231
+ "epoch": 0.36,
232
+ "eval_loss": 1.6210927963256836,
233
+ "eval_runtime": 2.1889,
234
+ "eval_samples_per_second": 0.914,
235
+ "eval_steps_per_second": 0.457,
236
+ "step": 30
237
+ },
238
+ {
239
+ "epoch": 0.37,
240
+ "learning_rate": 0.00019630694141514464,
241
+ "loss": 1.4598,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 0.38,
246
+ "learning_rate": 0.00019594929736144976,
247
+ "loss": 1.48,
248
+ "step": 32
249
+ },
250
+ {
251
+ "epoch": 0.39,
252
+ "learning_rate": 0.0001955754835053459,
253
+ "loss": 1.3934,
254
+ "step": 33
255
+ },
256
+ {
257
+ "epoch": 0.4,
258
+ "learning_rate": 0.00019518556284360696,
259
+ "loss": 1.1312,
260
+ "step": 34
261
+ },
262
+ {
263
+ "epoch": 0.42,
264
+ "learning_rate": 0.0001947796010873974,
265
+ "loss": 1.6493,
266
+ "step": 35
267
+ },
268
+ {
269
+ "epoch": 0.42,
270
+ "eval_loss": 1.6185638904571533,
271
+ "eval_runtime": 2.1911,
272
+ "eval_samples_per_second": 0.913,
273
+ "eval_steps_per_second": 0.456,
274
+ "step": 35
275
+ },
276
+ {
277
+ "epoch": 0.43,
278
+ "learning_rate": 0.0001943576666511982,
279
+ "loss": 1.587,
280
+ "step": 36
281
+ },
282
+ {
283
+ "epoch": 0.44,
284
+ "learning_rate": 0.0001939198306412775,
285
+ "loss": 1.5798,
286
+ "step": 37
287
+ },
288
+ {
289
+ "epoch": 0.45,
290
+ "learning_rate": 0.0001934661668437073,
291
+ "loss": 1.4308,
292
+ "step": 38
293
+ },
294
+ {
295
+ "epoch": 0.46,
296
+ "learning_rate": 0.0001929967517119289,
297
+ "loss": 1.0766,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 0.48,
302
+ "learning_rate": 0.0001925116643538684,
303
+ "loss": 2.082,
304
+ "step": 40
305
+ },
306
+ {
307
+ "epoch": 0.48,
308
+ "eval_loss": 1.633681297302246,
309
+ "eval_runtime": 2.1862,
310
+ "eval_samples_per_second": 0.915,
311
+ "eval_steps_per_second": 0.457,
312
+ "step": 40
313
+ },
314
+ {
315
+ "epoch": 0.49,
316
+ "learning_rate": 0.0001920109865186052,
317
+ "loss": 1.8061,
318
+ "step": 41
319
+ },
320
+ {
321
+ "epoch": 0.5,
322
+ "learning_rate": 0.00019149480258259533,
323
+ "loss": 1.4312,
324
+ "step": 42
325
+ },
326
+ {
327
+ "epoch": 0.51,
328
+ "learning_rate": 0.00019096319953545185,
329
+ "loss": 1.737,
330
+ "step": 43
331
+ },
332
+ {
333
+ "epoch": 0.52,
334
+ "learning_rate": 0.00019041626696528503,
335
+ "loss": 1.5035,
336
+ "step": 44
337
+ },
338
+ {
339
+ "epoch": 0.54,
340
+ "learning_rate": 0.00018985409704360456,
341
+ "loss": 1.4689,
342
+ "step": 45
343
+ },
344
+ {
345
+ "epoch": 0.54,
346
+ "eval_loss": 1.6150808334350586,
347
+ "eval_runtime": 2.1952,
348
+ "eval_samples_per_second": 0.911,
349
+ "eval_steps_per_second": 0.456,
350
+ "step": 45
351
+ },
352
+ {
353
+ "epoch": 0.55,
354
+ "learning_rate": 0.0001892767845097864,
355
+ "loss": 1.2483,
356
+ "step": 46
357
+ },
358
+ {
359
+ "epoch": 0.56,
360
+ "learning_rate": 0.00018868442665510678,
361
+ "loss": 1.1436,
362
+ "step": 47
363
+ },
364
+ {
365
+ "epoch": 0.57,
366
+ "learning_rate": 0.00018807712330634642,
367
+ "loss": 1.0488,
368
+ "step": 48
369
+ },
370
+ {
371
+ "epoch": 0.58,
372
+ "learning_rate": 0.00018745497680896722,
373
+ "loss": 1.3745,
374
+ "step": 49
375
+ },
376
+ {
377
+ "epoch": 0.6,
378
+ "learning_rate": 0.0001868180920098644,
379
+ "loss": 0.9061,
380
+ "step": 50
381
+ },
382
+ {
383
+ "epoch": 0.6,
384
+ "eval_loss": 1.6097811460494995,
385
+ "eval_runtime": 2.1875,
386
+ "eval_samples_per_second": 0.914,
387
+ "eval_steps_per_second": 0.457,
388
+ "step": 50
389
+ },
390
+ {
391
+ "epoch": 0.61,
392
+ "learning_rate": 0.0001861665762396974,
393
+ "loss": 1.1305,
394
+ "step": 51
395
+ },
396
+ {
397
+ "epoch": 0.62,
398
+ "learning_rate": 0.00018550053929480202,
399
+ "loss": 1.2315,
400
+ "step": 52
401
+ },
402
+ {
403
+ "epoch": 0.63,
404
+ "learning_rate": 0.00018482009341868697,
405
+ "loss": 1.4964,
406
+ "step": 53
407
+ },
408
+ {
409
+ "epoch": 0.64,
410
+ "learning_rate": 0.00018412535328311814,
411
+ "loss": 1.0928,
412
+ "step": 54
413
+ },
414
+ {
415
+ "epoch": 0.65,
416
+ "learning_rate": 0.00018341643596879367,
417
+ "loss": 0.9473,
418
+ "step": 55
419
+ },
420
+ {
421
+ "epoch": 0.65,
422
+ "eval_loss": 1.6084190607070923,
423
+ "eval_runtime": 2.1902,
424
+ "eval_samples_per_second": 0.913,
425
+ "eval_steps_per_second": 0.457,
426
+ "step": 55
427
+ },
428
+ {
429
+ "epoch": 0.67,
430
+ "learning_rate": 0.0001826934609456129,
431
+ "loss": 1.362,
432
+ "step": 56
433
+ },
434
+ {
435
+ "epoch": 0.68,
436
+ "learning_rate": 0.00018195655005254273,
437
+ "loss": 1.5478,
438
+ "step": 57
439
+ },
440
+ {
441
+ "epoch": 0.69,
442
+ "learning_rate": 0.00018120582747708502,
443
+ "loss": 1.4831,
444
+ "step": 58
445
+ },
446
+ {
447
+ "epoch": 0.7,
448
+ "learning_rate": 0.00018044141973434758,
449
+ "loss": 1.7483,
450
+ "step": 59
451
+ },
452
+ {
453
+ "epoch": 0.71,
454
+ "learning_rate": 0.0001796634556457236,
455
+ "loss": 1.4993,
456
+ "step": 60
457
+ },
458
+ {
459
+ "epoch": 0.71,
460
+ "eval_loss": 1.6235202550888062,
461
+ "eval_runtime": 2.1859,
462
+ "eval_samples_per_second": 0.915,
463
+ "eval_steps_per_second": 0.457,
464
+ "step": 60
465
+ },
466
+ {
467
+ "epoch": 0.73,
468
+ "learning_rate": 0.00017887206631718203,
469
+ "loss": 1.5076,
470
+ "step": 61
471
+ },
472
+ {
473
+ "epoch": 0.74,
474
+ "learning_rate": 0.0001780673851171728,
475
+ "loss": 1.6395,
476
+ "step": 62
477
+ },
478
+ {
479
+ "epoch": 0.75,
480
+ "learning_rate": 0.00017724954765415137,
481
+ "loss": 1.6389,
482
+ "step": 63
483
+ },
484
+ {
485
+ "epoch": 0.76,
486
+ "learning_rate": 0.00017641869175372493,
487
+ "loss": 1.7769,
488
+ "step": 64
489
+ },
490
+ {
491
+ "epoch": 0.77,
492
+ "learning_rate": 0.00017557495743542585,
493
+ "loss": 1.2022,
494
+ "step": 65
495
+ },
496
+ {
497
+ "epoch": 0.77,
498
+ "eval_loss": 1.6078369617462158,
499
+ "eval_runtime": 2.1883,
500
+ "eval_samples_per_second": 0.914,
501
+ "eval_steps_per_second": 0.457,
502
+ "step": 65
503
+ },
504
+ {
505
+ "epoch": 0.79,
506
+ "learning_rate": 0.00017471848688911464,
507
+ "loss": 1.5265,
508
+ "step": 66
509
+ },
510
+ {
511
+ "epoch": 0.8,
512
+ "learning_rate": 0.00017384942445101772,
513
+ "loss": 1.4065,
514
+ "step": 67
515
+ },
516
+ {
517
+ "epoch": 0.81,
518
+ "learning_rate": 0.000172967916579403,
519
+ "loss": 1.4326,
520
+ "step": 68
521
+ },
522
+ {
523
+ "epoch": 0.82,
524
+ "learning_rate": 0.00017207411182989832,
525
+ "loss": 1.571,
526
+ "step": 69
527
+ },
528
+ {
529
+ "epoch": 0.83,
530
+ "learning_rate": 0.00017116816083045602,
531
+ "loss": 1.5233,
532
+ "step": 70
533
+ },
534
+ {
535
+ "epoch": 0.83,
536
+ "eval_loss": 1.6035966873168945,
537
+ "eval_runtime": 2.1894,
538
+ "eval_samples_per_second": 0.914,
539
+ "eval_steps_per_second": 0.457,
540
+ "step": 70
541
+ },
542
+ {
543
+ "epoch": 0.85,
544
+ "learning_rate": 0.00017025021625596853,
545
+ "loss": 1.5745,
546
+ "step": 71
547
+ },
548
+ {
549
+ "epoch": 0.86,
550
+ "learning_rate": 0.0001693204328025389,
551
+ "loss": 1.608,
552
+ "step": 72
553
+ },
554
+ {
555
+ "epoch": 0.87,
556
+ "learning_rate": 0.0001683789671614107,
557
+ "loss": 1.4234,
558
+ "step": 73
559
+ },
560
+ {
561
+ "epoch": 0.88,
562
+ "learning_rate": 0.00016742597799256182,
563
+ "loss": 1.2839,
564
+ "step": 74
565
+ },
566
+ {
567
+ "epoch": 0.89,
568
+ "learning_rate": 0.00016646162589796615,
569
+ "loss": 1.3248,
570
+ "step": 75
571
+ },
572
+ {
573
+ "epoch": 0.89,
574
+ "eval_loss": 1.6052225828170776,
575
+ "eval_runtime": 2.1914,
576
+ "eval_samples_per_second": 0.913,
577
+ "eval_steps_per_second": 0.456,
578
+ "step": 75
579
+ },
580
+ {
581
+ "epoch": 0.9,
582
+ "learning_rate": 0.00016548607339452853,
583
+ "loss": 1.0683,
584
+ "step": 76
585
+ },
586
+ {
587
+ "epoch": 0.92,
588
+ "learning_rate": 0.00016449948488669639,
589
+ "loss": 1.5298,
590
+ "step": 77
591
+ },
592
+ {
593
+ "epoch": 0.93,
594
+ "learning_rate": 0.00016350202663875386,
595
+ "loss": 1.5696,
596
+ "step": 78
597
+ },
598
+ {
599
+ "epoch": 0.94,
600
+ "learning_rate": 0.00016249386674680184,
601
+ "loss": 1.1743,
602
+ "step": 79
603
+ },
604
+ {
605
+ "epoch": 0.95,
606
+ "learning_rate": 0.0001614751751104301,
607
+ "loss": 1.8626,
608
+ "step": 80
609
+ },
610
+ {
611
+ "epoch": 0.95,
612
+ "eval_loss": 1.6131467819213867,
613
+ "eval_runtime": 2.1907,
614
+ "eval_samples_per_second": 0.913,
615
+ "eval_steps_per_second": 0.456,
616
+ "step": 80
617
+ },
618
+ {
619
+ "epoch": 0.96,
620
+ "learning_rate": 0.00016044612340408466,
621
+ "loss": 1.4832,
622
+ "step": 81
623
+ },
624
+ {
625
+ "epoch": 0.98,
626
+ "learning_rate": 0.00015940688504813662,
627
+ "loss": 1.4476,
628
+ "step": 82
629
+ },
630
+ {
631
+ "epoch": 0.99,
632
+ "learning_rate": 0.00015835763517965673,
633
+ "loss": 1.3783,
634
+ "step": 83
635
+ },
636
+ {
637
+ "epoch": 1.0,
638
+ "learning_rate": 0.00015729855062290022,
639
+ "loss": 1.6671,
640
+ "step": 84
641
+ },
642
+ {
643
+ "epoch": 1.01,
644
+ "learning_rate": 0.0001562298098595078,
645
+ "loss": 1.4658,
646
+ "step": 85
647
+ },
648
+ {
649
+ "epoch": 1.01,
650
+ "eval_loss": 1.6062182188034058,
651
+ "eval_runtime": 2.1944,
652
+ "eval_samples_per_second": 0.911,
653
+ "eval_steps_per_second": 0.456,
654
+ "step": 85
655
+ },
656
+ {
657
+ "epoch": 1.02,
658
+ "learning_rate": 0.00015515159299842707,
659
+ "loss": 1.64,
660
+ "step": 86
661
+ },
662
+ {
663
+ "epoch": 1.04,
664
+ "learning_rate": 0.00015406408174555976,
665
+ "loss": 1.2125,
666
+ "step": 87
667
+ },
668
+ {
669
+ "epoch": 1.05,
670
+ "learning_rate": 0.00015296745937313987,
671
+ "loss": 1.5001,
672
+ "step": 88
673
+ },
674
+ {
675
+ "epoch": 1.06,
676
+ "learning_rate": 0.00015186191068884775,
677
+ "loss": 1.4294,
678
+ "step": 89
679
+ },
680
+ {
681
+ "epoch": 1.07,
682
+ "learning_rate": 0.00015074762200466556,
683
+ "loss": 1.3162,
684
+ "step": 90
685
+ },
686
+ {
687
+ "epoch": 1.07,
688
+ "eval_loss": 1.5980761051177979,
689
+ "eval_runtime": 2.191,
690
+ "eval_samples_per_second": 0.913,
691
+ "eval_steps_per_second": 0.456,
692
+ "step": 90
693
+ },
694
+ {
695
+ "epoch": 1.08,
696
+ "learning_rate": 0.00014962478110547918,
697
+ "loss": 1.3707,
698
+ "step": 91
699
+ },
700
+ {
701
+ "epoch": 1.1,
702
+ "learning_rate": 0.00014849357721743168,
703
+ "loss": 1.4644,
704
+ "step": 92
705
+ },
706
+ {
707
+ "epoch": 1.11,
708
+ "learning_rate": 0.0001473542009760343,
709
+ "loss": 1.427,
710
+ "step": 93
711
+ },
712
+ {
713
+ "epoch": 1.12,
714
+ "learning_rate": 0.00014620684439403962,
715
+ "loss": 1.3337,
716
+ "step": 94
717
+ },
718
+ {
719
+ "epoch": 1.13,
720
+ "learning_rate": 0.0001450517008290827,
721
+ "loss": 1.4111,
722
+ "step": 95
723
+ },
724
+ {
725
+ "epoch": 1.13,
726
+ "eval_loss": 1.5972555875778198,
727
+ "eval_runtime": 2.1901,
728
+ "eval_samples_per_second": 0.913,
729
+ "eval_steps_per_second": 0.457,
730
+ "step": 95
731
+ },
732
+ {
733
+ "epoch": 1.14,
734
+ "learning_rate": 0.0001438889649510956,
735
+ "loss": 1.441,
736
+ "step": 96
737
+ },
738
+ {
739
+ "epoch": 1.15,
740
+ "learning_rate": 0.00014271883270950073,
741
+ "loss": 0.7424,
742
+ "step": 97
743
+ },
744
+ {
745
+ "epoch": 1.17,
746
+ "learning_rate": 0.00014154150130018866,
747
+ "loss": 1.3582,
748
+ "step": 98
749
+ },
750
+ {
751
+ "epoch": 1.18,
752
+ "learning_rate": 0.00014035716913228568,
753
+ "loss": 1.3479,
754
+ "step": 99
755
+ },
756
+ {
757
+ "epoch": 1.19,
758
+ "learning_rate": 0.00013916603579471705,
759
+ "loss": 1.2211,
760
+ "step": 100
761
+ },
762
+ {
763
+ "epoch": 1.19,
764
+ "eval_loss": 1.6037462949752808,
765
+ "eval_runtime": 2.1932,
766
+ "eval_samples_per_second": 0.912,
767
+ "eval_steps_per_second": 0.456,
768
+ "step": 100
769
+ },
770
+ {
771
+ "epoch": 1.2,
772
+ "learning_rate": 0.0001379683020225714,
773
+ "loss": 2.0387,
774
+ "step": 101
775
+ },
776
+ {
777
+ "epoch": 1.21,
778
+ "learning_rate": 0.000136764169663272,
779
+ "loss": 1.3237,
780
+ "step": 102
781
+ },
782
+ {
783
+ "epoch": 1.23,
784
+ "learning_rate": 0.00013555384164256048,
785
+ "loss": 1.4286,
786
+ "step": 103
787
+ },
788
+ {
789
+ "epoch": 1.24,
790
+ "learning_rate": 0.00013433752193029886,
791
+ "loss": 1.7905,
792
+ "step": 104
793
+ },
794
+ {
795
+ "epoch": 1.25,
796
+ "learning_rate": 0.00013311541550609565,
797
+ "loss": 1.7277,
798
+ "step": 105
799
+ },
800
+ {
801
+ "epoch": 1.25,
802
+ "eval_loss": 1.5989317893981934,
803
+ "eval_runtime": 2.1896,
804
+ "eval_samples_per_second": 0.913,
805
+ "eval_steps_per_second": 0.457,
806
+ "step": 105
807
+ },
808
+ {
809
+ "epoch": 1.26,
810
+ "learning_rate": 0.00013188772832476188,
811
+ "loss": 1.5016,
812
+ "step": 106
813
+ },
814
+ {
815
+ "epoch": 1.27,
816
+ "learning_rate": 0.00013065466728160252,
817
+ "loss": 1.7159,
818
+ "step": 107
819
+ },
820
+ {
821
+ "epoch": 1.29,
822
+ "learning_rate": 0.00012941644017754964,
823
+ "loss": 1.2701,
824
+ "step": 108
825
+ },
826
+ {
827
+ "epoch": 1.3,
828
+ "learning_rate": 0.00012817325568414297,
829
+ "loss": 1.4085,
830
+ "step": 109
831
+ },
832
+ {
833
+ "epoch": 1.31,
834
+ "learning_rate": 0.00012692532330836346,
835
+ "loss": 1.246,
836
+ "step": 110
837
+ },
838
+ {
839
+ "epoch": 1.31,
840
+ "eval_loss": 1.597010850906372,
841
+ "eval_runtime": 2.1881,
842
+ "eval_samples_per_second": 0.914,
843
+ "eval_steps_per_second": 0.457,
844
+ "step": 110
845
+ },
846
+ {
847
+ "epoch": 1.32,
848
+ "learning_rate": 0.00012567285335732633,
849
+ "loss": 1.3382,
850
+ "step": 111
851
+ },
852
+ {
853
+ "epoch": 1.33,
854
+ "learning_rate": 0.00012441605690283915,
855
+ "loss": 0.9305,
856
+ "step": 112
857
+ },
858
+ {
859
+ "epoch": 1.35,
860
+ "learning_rate": 0.00012315514574583113,
861
+ "loss": 1.388,
862
+ "step": 113
863
+ },
864
+ {
865
+ "epoch": 1.36,
866
+ "learning_rate": 0.0001218903323806595,
867
+ "loss": 1.2634,
868
+ "step": 114
869
+ },
870
+ {
871
+ "epoch": 1.37,
872
+ "learning_rate": 0.00012062182995929882,
873
+ "loss": 1.1971,
874
+ "step": 115
875
+ },
876
+ {
877
+ "epoch": 1.37,
878
+ "eval_loss": 1.5930073261260986,
879
+ "eval_runtime": 2.1881,
880
+ "eval_samples_per_second": 0.914,
881
+ "eval_steps_per_second": 0.457,
882
+ "step": 115
883
+ },
884
+ {
885
+ "epoch": 1.38,
886
+ "learning_rate": 0.00011934985225541998,
887
+ "loss": 1.2645,
888
+ "step": 116
889
+ },
890
+ {
891
+ "epoch": 1.39,
892
+ "learning_rate": 0.0001180746136283638,
893
+ "loss": 1.6775,
894
+ "step": 117
895
+ },
896
+ {
897
+ "epoch": 1.4,
898
+ "learning_rate": 0.00011679632898701649,
899
+ "loss": 1.018,
900
+ "step": 118
901
+ },
902
+ {
903
+ "epoch": 1.42,
904
+ "learning_rate": 0.00011551521375359206,
905
+ "loss": 1.225,
906
+ "step": 119
907
+ },
908
+ {
909
+ "epoch": 1.43,
910
+ "learning_rate": 0.00011423148382732853,
911
+ "loss": 1.166,
912
+ "step": 120
913
+ },
914
+ {
915
+ "epoch": 1.43,
916
+ "eval_loss": 1.593321681022644,
917
+ "eval_runtime": 2.1858,
918
+ "eval_samples_per_second": 0.915,
919
+ "eval_steps_per_second": 0.457,
920
+ "step": 120
921
+ },
922
+ {
923
+ "epoch": 1.44,
924
+ "learning_rate": 0.00011294535554810354,
925
+ "loss": 1.7995,
926
+ "step": 121
927
+ },
928
+ {
929
+ "epoch": 1.45,
930
+ "learning_rate": 0.00011165704565997593,
931
+ "loss": 0.7254,
932
+ "step": 122
933
+ },
934
+ {
935
+ "epoch": 1.46,
936
+ "learning_rate": 0.00011036677127465889,
937
+ "loss": 1.4558,
938
+ "step": 123
939
+ },
940
+ {
941
+ "epoch": 1.48,
942
+ "learning_rate": 0.00010907474983493144,
943
+ "loss": 1.5358,
944
+ "step": 124
945
+ },
946
+ {
947
+ "epoch": 1.49,
948
+ "learning_rate": 0.00010778119907799398,
949
+ "loss": 1.5007,
950
+ "step": 125
951
+ },
952
+ {
953
+ "epoch": 1.49,
954
+ "eval_loss": 1.5938643217086792,
955
+ "eval_runtime": 2.189,
956
+ "eval_samples_per_second": 0.914,
957
+ "eval_steps_per_second": 0.457,
958
+ "step": 125
959
+ },
960
+ {
961
+ "epoch": 1.5,
962
+ "learning_rate": 0.0001064863369987743,
963
+ "loss": 1.6357,
964
+ "step": 126
965
+ },
966
+ {
967
+ "epoch": 1.51,
968
+ "learning_rate": 0.00010519038181318999,
969
+ "loss": 1.7524,
970
+ "step": 127
971
+ },
972
+ {
973
+ "epoch": 1.52,
974
+ "learning_rate": 0.00010389355192137377,
975
+ "loss": 1.6955,
976
+ "step": 128
977
+ },
978
+ {
979
+ "epoch": 1.54,
980
+ "learning_rate": 0.00010259606587086783,
981
+ "loss": 1.4174,
982
+ "step": 129
983
+ },
984
+ {
985
+ "epoch": 1.55,
986
+ "learning_rate": 0.0001012981423197931,
987
+ "loss": 1.2135,
988
+ "step": 130
989
+ },
990
+ {
991
+ "epoch": 1.55,
992
+ "eval_loss": 1.5910111665725708,
993
+ "eval_runtime": 2.1873,
994
+ "eval_samples_per_second": 0.914,
995
+ "eval_steps_per_second": 0.457,
996
+ "step": 130
997
+ },
998
+ {
999
+ "epoch": 1.56,
1000
+ "learning_rate": 0.0001,
1001
+ "loss": 1.0919,
1002
+ "step": 131
1003
+ },
1004
+ {
1005
+ "epoch": 1.57,
1006
+ "learning_rate": 9.870185768020693e-05,
1007
+ "loss": 1.4658,
1008
+ "step": 132
1009
+ },
1010
+ {
1011
+ "epoch": 1.58,
1012
+ "learning_rate": 9.740393412913219e-05,
1013
+ "loss": 1.1472,
1014
+ "step": 133
1015
+ },
1016
+ {
1017
+ "epoch": 1.6,
1018
+ "learning_rate": 9.610644807862625e-05,
1019
+ "loss": 1.2626,
1020
+ "step": 134
1021
+ },
1022
+ {
1023
+ "epoch": 1.61,
1024
+ "learning_rate": 9.480961818681004e-05,
1025
+ "loss": 1.3915,
1026
+ "step": 135
1027
+ },
1028
+ {
1029
+ "epoch": 1.61,
1030
+ "eval_loss": 1.5905121564865112,
1031
+ "eval_runtime": 2.1919,
1032
+ "eval_samples_per_second": 0.912,
1033
+ "eval_steps_per_second": 0.456,
1034
+ "step": 135
1035
+ },
1036
+ {
1037
+ "epoch": 1.62,
1038
+ "learning_rate": 9.35136630012257e-05,
1039
+ "loss": 1.8036,
1040
+ "step": 136
1041
+ },
1042
+ {
1043
+ "epoch": 1.63,
1044
+ "learning_rate": 9.221880092200601e-05,
1045
+ "loss": 1.1988,
1046
+ "step": 137
1047
+ },
1048
+ {
1049
+ "epoch": 1.64,
1050
+ "learning_rate": 9.092525016506858e-05,
1051
+ "loss": 1.1454,
1052
+ "step": 138
1053
+ },
1054
+ {
1055
+ "epoch": 1.65,
1056
+ "learning_rate": 8.963322872534114e-05,
1057
+ "loss": 1.3185,
1058
+ "step": 139
1059
+ },
1060
+ {
1061
+ "epoch": 1.67,
1062
+ "learning_rate": 8.83429543400241e-05,
1063
+ "loss": 1.6912,
1064
+ "step": 140
1065
+ },
1066
+ {
1067
+ "epoch": 1.67,
1068
+ "eval_loss": 1.5902668237686157,
1069
+ "eval_runtime": 2.1897,
1070
+ "eval_samples_per_second": 0.913,
1071
+ "eval_steps_per_second": 0.457,
1072
+ "step": 140
1073
+ },
1074
+ {
1075
+ "epoch": 1.68,
1076
+ "learning_rate": 8.705464445189647e-05,
1077
+ "loss": 1.6251,
1078
+ "step": 141
1079
+ },
1080
+ {
1081
+ "epoch": 1.69,
1082
+ "learning_rate": 8.57685161726715e-05,
1083
+ "loss": 1.4459,
1084
+ "step": 142
1085
+ },
1086
+ {
1087
+ "epoch": 1.7,
1088
+ "learning_rate": 8.448478624640797e-05,
1089
+ "loss": 1.3483,
1090
+ "step": 143
1091
+ },
1092
+ {
1093
+ "epoch": 1.71,
1094
+ "learning_rate": 8.320367101298351e-05,
1095
+ "loss": 1.7937,
1096
+ "step": 144
1097
+ },
1098
+ {
1099
+ "epoch": 1.73,
1100
+ "learning_rate": 8.192538637163621e-05,
1101
+ "loss": 1.6808,
1102
+ "step": 145
1103
+ },
1104
+ {
1105
+ "epoch": 1.73,
1106
+ "eval_loss": 1.587677240371704,
1107
+ "eval_runtime": 2.1912,
1108
+ "eval_samples_per_second": 0.913,
1109
+ "eval_steps_per_second": 0.456,
1110
+ "step": 145
1111
+ },
1112
+ {
1113
+ "epoch": 1.74,
1114
+ "learning_rate": 8.065014774458003e-05,
1115
+ "loss": 1.453,
1116
+ "step": 146
1117
+ },
1118
+ {
1119
+ "epoch": 1.75,
1120
+ "learning_rate": 7.93781700407012e-05,
1121
+ "loss": 1.3279,
1122
+ "step": 147
1123
+ },
1124
+ {
1125
+ "epoch": 1.76,
1126
+ "learning_rate": 7.810966761934053e-05,
1127
+ "loss": 1.6721,
1128
+ "step": 148
1129
+ },
1130
+ {
1131
+ "epoch": 1.77,
1132
+ "learning_rate": 7.684485425416888e-05,
1133
+ "loss": 1.1307,
1134
+ "step": 149
1135
+ },
1136
+ {
1137
+ "epoch": 1.79,
1138
+ "learning_rate": 7.558394309716088e-05,
1139
+ "loss": 1.249,
1140
+ "step": 150
1141
+ },
1142
+ {
1143
+ "epoch": 1.79,
1144
+ "eval_loss": 1.5859589576721191,
1145
+ "eval_runtime": 2.1868,
1146
+ "eval_samples_per_second": 0.915,
1147
+ "eval_steps_per_second": 0.457,
1148
+ "step": 150
1149
+ },
1150
+ {
1151
+ "epoch": 1.8,
1152
+ "learning_rate": 7.432714664267373e-05,
1153
+ "loss": 1.1872,
1154
+ "step": 151
1155
+ },
1156
+ {
1157
+ "epoch": 1.81,
1158
+ "learning_rate": 7.307467669163655e-05,
1159
+ "loss": 1.4116,
1160
+ "step": 152
1161
+ },
1162
+ {
1163
+ "epoch": 1.82,
1164
+ "learning_rate": 7.182674431585704e-05,
1165
+ "loss": 1.2309,
1166
+ "step": 153
1167
+ },
1168
+ {
1169
+ "epoch": 1.83,
1170
+ "learning_rate": 7.058355982245037e-05,
1171
+ "loss": 1.3953,
1172
+ "step": 154
1173
+ },
1174
+ {
1175
+ "epoch": 1.85,
1176
+ "learning_rate": 6.934533271839752e-05,
1177
+ "loss": 1.43,
1178
+ "step": 155
1179
+ },
1180
+ {
1181
+ "epoch": 1.85,
1182
+ "eval_loss": 1.5868343114852905,
1183
+ "eval_runtime": 2.1915,
1184
+ "eval_samples_per_second": 0.913,
1185
+ "eval_steps_per_second": 0.456,
1186
+ "step": 155
1187
+ },
1188
+ {
1189
+ "epoch": 1.86,
1190
+ "learning_rate": 6.811227167523815e-05,
1191
+ "loss": 1.9049,
1192
+ "step": 156
1193
+ },
1194
+ {
1195
+ "epoch": 1.87,
1196
+ "learning_rate": 6.688458449390437e-05,
1197
+ "loss": 0.8853,
1198
+ "step": 157
1199
+ },
1200
+ {
1201
+ "epoch": 1.88,
1202
+ "learning_rate": 6.566247806970119e-05,
1203
+ "loss": 1.6253,
1204
+ "step": 158
1205
+ },
1206
+ {
1207
+ "epoch": 1.89,
1208
+ "learning_rate": 6.444615835743955e-05,
1209
+ "loss": 1.3031,
1210
+ "step": 159
1211
+ },
1212
+ {
1213
+ "epoch": 1.9,
1214
+ "learning_rate": 6.323583033672799e-05,
1215
+ "loss": 0.8793,
1216
+ "step": 160
1217
+ },
1218
+ {
1219
+ "epoch": 1.9,
1220
+ "eval_loss": 1.5895787477493286,
1221
+ "eval_runtime": 2.1923,
1222
+ "eval_samples_per_second": 0.912,
1223
+ "eval_steps_per_second": 0.456,
1224
+ "step": 160
1225
+ },
1226
+ {
1227
+ "epoch": 1.92,
1228
+ "learning_rate": 6.203169797742861e-05,
1229
+ "loss": 1.3793,
1230
+ "step": 161
1231
+ },
1232
+ {
1233
+ "epoch": 1.93,
1234
+ "learning_rate": 6.083396420528298e-05,
1235
+ "loss": 1.5299,
1236
+ "step": 162
1237
+ },
1238
+ {
1239
+ "epoch": 1.94,
1240
+ "learning_rate": 5.964283086771435e-05,
1241
+ "loss": 1.3525,
1242
+ "step": 163
1243
+ },
1244
+ {
1245
+ "epoch": 1.95,
1246
+ "learning_rate": 5.845849869981137e-05,
1247
+ "loss": 1.4941,
1248
+ "step": 164
1249
+ },
1250
+ {
1251
+ "epoch": 1.96,
1252
+ "learning_rate": 5.728116729049928e-05,
1253
+ "loss": 1.1564,
1254
+ "step": 165
1255
+ },
1256
+ {
1257
+ "epoch": 1.96,
1258
+ "eval_loss": 1.5867228507995605,
1259
+ "eval_runtime": 2.1914,
1260
+ "eval_samples_per_second": 0.913,
1261
+ "eval_steps_per_second": 0.456,
1262
+ "step": 165
1263
+ },
1264
+ {
1265
+ "epoch": 1.98,
1266
+ "learning_rate": 5.611103504890444e-05,
1267
+ "loss": 1.5568,
1268
+ "step": 166
1269
+ },
1270
+ {
1271
+ "epoch": 1.99,
1272
+ "learning_rate": 5.4948299170917325e-05,
1273
+ "loss": 1.2441,
1274
+ "step": 167
1275
+ },
1276
+ {
1277
+ "epoch": 2.0,
1278
+ "learning_rate": 5.379315560596038e-05,
1279
+ "loss": 0.9717,
1280
+ "step": 168
1281
+ },
1282
+ {
1283
+ "epoch": 2.01,
1284
+ "learning_rate": 5.26457990239657e-05,
1285
+ "loss": 1.5905,
1286
+ "step": 169
1287
+ },
1288
+ {
1289
+ "epoch": 2.02,
1290
+ "learning_rate": 5.1506422782568345e-05,
1291
+ "loss": 1.4259,
1292
+ "step": 170
1293
+ },
1294
+ {
1295
+ "epoch": 2.02,
1296
+ "eval_loss": 1.5872297286987305,
1297
+ "eval_runtime": 2.1903,
1298
+ "eval_samples_per_second": 0.913,
1299
+ "eval_steps_per_second": 0.457,
1300
+ "step": 170
1301
+ },
1302
+ {
1303
+ "epoch": 2.04,
1304
+ "learning_rate": 5.0375218894520834e-05,
1305
+ "loss": 1.4877,
1306
+ "step": 171
1307
+ },
1308
+ {
1309
+ "epoch": 2.05,
1310
+ "learning_rate": 4.9252377995334444e-05,
1311
+ "loss": 1.4578,
1312
+ "step": 172
1313
+ },
1314
+ {
1315
+ "epoch": 2.06,
1316
+ "learning_rate": 4.813808931115228e-05,
1317
+ "loss": 1.0967,
1318
+ "step": 173
1319
+ },
1320
+ {
1321
+ "epoch": 2.07,
1322
+ "learning_rate": 4.703254062686017e-05,
1323
+ "loss": 1.3642,
1324
+ "step": 174
1325
+ },
1326
+ {
1327
+ "epoch": 2.08,
1328
+ "learning_rate": 4.593591825444028e-05,
1329
+ "loss": 1.4059,
1330
+ "step": 175
1331
+ },
1332
+ {
1333
+ "epoch": 2.08,
1334
+ "eval_loss": 1.5853421688079834,
1335
+ "eval_runtime": 2.1875,
1336
+ "eval_samples_per_second": 0.914,
1337
+ "eval_steps_per_second": 0.457,
1338
+ "step": 175
1339
+ },
1340
+ {
1341
+ "epoch": 2.1,
1342
+ "learning_rate": 4.484840700157295e-05,
1343
+ "loss": 1.3578,
1344
+ "step": 176
1345
+ },
1346
+ {
1347
+ "epoch": 2.11,
1348
+ "learning_rate": 4.377019014049223e-05,
1349
+ "loss": 1.178,
1350
+ "step": 177
1351
+ },
1352
+ {
1353
+ "epoch": 2.12,
1354
+ "learning_rate": 4.270144937709981e-05,
1355
+ "loss": 1.6276,
1356
+ "step": 178
1357
+ },
1358
+ {
1359
+ "epoch": 2.13,
1360
+ "learning_rate": 4.164236482034327e-05,
1361
+ "loss": 1.5173,
1362
+ "step": 179
1363
+ },
1364
+ {
1365
+ "epoch": 2.14,
1366
+ "learning_rate": 4.059311495186338e-05,
1367
+ "loss": 1.3487,
1368
+ "step": 180
1369
+ },
1370
+ {
1371
+ "epoch": 2.14,
1372
+ "eval_loss": 1.5867466926574707,
1373
+ "eval_runtime": 2.1894,
1374
+ "eval_samples_per_second": 0.913,
1375
+ "eval_steps_per_second": 0.457,
1376
+ "step": 180
1377
+ },
1378
+ {
1379
+ "epoch": 2.15,
1380
+ "learning_rate": 3.9553876595915375e-05,
1381
+ "loss": 1.0898,
1382
+ "step": 181
1383
+ },
1384
+ {
1385
+ "epoch": 2.17,
1386
+ "learning_rate": 3.852482488956992e-05,
1387
+ "loss": 0.8375,
1388
+ "step": 182
1389
+ },
1390
+ {
1391
+ "epoch": 2.18,
1392
+ "learning_rate": 3.750613325319817e-05,
1393
+ "loss": 1.1532,
1394
+ "step": 183
1395
+ },
1396
+ {
1397
+ "epoch": 2.19,
1398
+ "learning_rate": 3.649797336124615e-05,
1399
+ "loss": 1.603,
1400
+ "step": 184
1401
+ },
1402
+ {
1403
+ "epoch": 2.2,
1404
+ "learning_rate": 3.550051511330361e-05,
1405
+ "loss": 1.7306,
1406
+ "step": 185
1407
+ },
1408
+ {
1409
+ "epoch": 2.2,
1410
+ "eval_loss": 1.5883917808532715,
1411
+ "eval_runtime": 2.189,
1412
+ "eval_samples_per_second": 0.914,
1413
+ "eval_steps_per_second": 0.457,
1414
+ "step": 185
1415
+ },
1416
+ {
1417
+ "epoch": 2.21,
1418
+ "learning_rate": 3.45139266054715e-05,
1419
+ "loss": 1.4042,
1420
+ "step": 186
1421
+ },
1422
+ {
1423
+ "epoch": 2.23,
1424
+ "learning_rate": 3.3538374102033866e-05,
1425
+ "loss": 1.1013,
1426
+ "step": 187
1427
+ },
1428
+ {
1429
+ "epoch": 2.24,
1430
+ "learning_rate": 3.257402200743821e-05,
1431
+ "loss": 1.1465,
1432
+ "step": 188
1433
+ },
1434
+ {
1435
+ "epoch": 2.25,
1436
+ "learning_rate": 3.1621032838589305e-05,
1437
+ "loss": 1.7603,
1438
+ "step": 189
1439
+ },
1440
+ {
1441
+ "epoch": 2.26,
1442
+ "learning_rate": 3.0679567197461134e-05,
1443
+ "loss": 1.6117,
1444
+ "step": 190
1445
+ },
1446
+ {
1447
+ "epoch": 2.26,
1448
+ "eval_loss": 1.5858465433120728,
1449
+ "eval_runtime": 2.1875,
1450
+ "eval_samples_per_second": 0.914,
1451
+ "eval_steps_per_second": 0.457,
1452
+ "step": 190
1453
+ },
1454
+ {
1455
+ "epoch": 2.27,
1456
+ "learning_rate": 2.974978374403147e-05,
1457
+ "loss": 1.4448,
1458
+ "step": 191
1459
+ },
1460
+ {
1461
+ "epoch": 2.29,
1462
+ "learning_rate": 2.8831839169543996e-05,
1463
+ "loss": 1.2446,
1464
+ "step": 192
1465
+ },
1466
+ {
1467
+ "epoch": 2.3,
1468
+ "learning_rate": 2.7925888170101665e-05,
1469
+ "loss": 1.2843,
1470
+ "step": 193
1471
+ },
1472
+ {
1473
+ "epoch": 2.31,
1474
+ "learning_rate": 2.7032083420597e-05,
1475
+ "loss": 0.9528,
1476
+ "step": 194
1477
+ },
1478
+ {
1479
+ "epoch": 2.32,
1480
+ "learning_rate": 2.6150575548982292e-05,
1481
+ "loss": 1.1751,
1482
+ "step": 195
1483
+ },
1484
+ {
1485
+ "epoch": 2.32,
1486
+ "eval_loss": 1.5852700471878052,
1487
+ "eval_runtime": 2.1934,
1488
+ "eval_samples_per_second": 0.912,
1489
+ "eval_steps_per_second": 0.456,
1490
+ "step": 195
1491
+ },
1492
+ {
1493
+ "epoch": 2.33,
1494
+ "learning_rate": 2.528151311088537e-05,
1495
+ "loss": 1.2334,
1496
+ "step": 196
1497
+ },
1498
+ {
1499
+ "epoch": 2.35,
1500
+ "learning_rate": 2.4425042564574184e-05,
1501
+ "loss": 1.4127,
1502
+ "step": 197
1503
+ },
1504
+ {
1505
+ "epoch": 2.36,
1506
+ "learning_rate": 2.3581308246275103e-05,
1507
+ "loss": 1.1989,
1508
+ "step": 198
1509
+ },
1510
+ {
1511
+ "epoch": 2.37,
1512
+ "learning_rate": 2.2750452345848682e-05,
1513
+ "loss": 1.0506,
1514
+ "step": 199
1515
+ },
1516
+ {
1517
+ "epoch": 2.38,
1518
+ "learning_rate": 2.1932614882827197e-05,
1519
+ "loss": 1.5642,
1520
+ "step": 200
1521
+ },
1522
+ {
1523
+ "epoch": 2.38,
1524
+ "eval_loss": 1.5846655368804932,
1525
+ "eval_runtime": 2.1905,
1526
+ "eval_samples_per_second": 0.913,
1527
+ "eval_steps_per_second": 0.457,
1528
+ "step": 200
1529
+ },
1530
+ {
1531
+ "epoch": 2.39,
1532
+ "learning_rate": 2.112793368281799e-05,
1533
+ "loss": 0.9502,
1534
+ "step": 201
1535
+ },
1536
+ {
1537
+ "epoch": 2.4,
1538
+ "learning_rate": 2.03365443542764e-05,
1539
+ "loss": 1.5232,
1540
+ "step": 202
1541
+ },
1542
+ {
1543
+ "epoch": 2.42,
1544
+ "learning_rate": 1.9558580265652448e-05,
1545
+ "loss": 1.213,
1546
+ "step": 203
1547
+ },
1548
+ {
1549
+ "epoch": 2.43,
1550
+ "learning_rate": 1.879417252291502e-05,
1551
+ "loss": 0.7273,
1552
+ "step": 204
1553
+ },
1554
+ {
1555
+ "epoch": 2.44,
1556
+ "learning_rate": 1.804344994745727e-05,
1557
+ "loss": 1.6215,
1558
+ "step": 205
1559
+ },
1560
+ {
1561
+ "epoch": 2.44,
1562
+ "eval_loss": 1.5852676630020142,
1563
+ "eval_runtime": 2.1897,
1564
+ "eval_samples_per_second": 0.913,
1565
+ "eval_steps_per_second": 0.457,
1566
+ "step": 205
1567
+ },
1568
+ {
1569
+ "epoch": 2.45,
1570
+ "learning_rate": 1.730653905438714e-05,
1571
+ "loss": 1.5882,
1572
+ "step": 206
1573
+ },
1574
+ {
1575
+ "epoch": 2.46,
1576
+ "learning_rate": 1.6583564031206357e-05,
1577
+ "loss": 1.2459,
1578
+ "step": 207
1579
+ },
1580
+ {
1581
+ "epoch": 2.48,
1582
+ "learning_rate": 1.587464671688187e-05,
1583
+ "loss": 1.45,
1584
+ "step": 208
1585
+ },
1586
+ {
1587
+ "epoch": 2.49,
1588
+ "learning_rate": 1.5179906581313064e-05,
1589
+ "loss": 1.4953,
1590
+ "step": 209
1591
+ },
1592
+ {
1593
+ "epoch": 2.5,
1594
+ "learning_rate": 1.4499460705197998e-05,
1595
+ "loss": 0.9686,
1596
+ "step": 210
1597
+ },
1598
+ {
1599
+ "epoch": 2.5,
1600
+ "eval_loss": 1.5858163833618164,
1601
+ "eval_runtime": 2.1895,
1602
+ "eval_samples_per_second": 0.913,
1603
+ "eval_steps_per_second": 0.457,
1604
+ "step": 210
1605
+ },
1606
+ {
1607
+ "epoch": 2.51,
1608
+ "learning_rate": 1.3833423760302611e-05,
1609
+ "loss": 1.458,
1610
+ "step": 211
1611
+ },
1612
+ {
1613
+ "epoch": 2.52,
1614
+ "learning_rate": 1.3181907990135622e-05,
1615
+ "loss": 1.9958,
1616
+ "step": 212
1617
+ },
1618
+ {
1619
+ "epoch": 2.54,
1620
+ "learning_rate": 1.2545023191032801e-05,
1621
+ "loss": 1.2623,
1622
+ "step": 213
1623
+ },
1624
+ {
1625
+ "epoch": 2.55,
1626
+ "learning_rate": 1.1922876693653585e-05,
1627
+ "loss": 1.3237,
1628
+ "step": 214
1629
+ },
1630
+ {
1631
+ "epoch": 2.56,
1632
+ "learning_rate": 1.131557334489326e-05,
1633
+ "loss": 1.5978,
1634
+ "step": 215
1635
+ },
1636
+ {
1637
+ "epoch": 2.56,
1638
+ "eval_loss": 1.5844168663024902,
1639
+ "eval_runtime": 2.1926,
1640
+ "eval_samples_per_second": 0.912,
1641
+ "eval_steps_per_second": 0.456,
1642
+ "step": 215
1643
+ },
1644
+ {
1645
+ "epoch": 2.57,
1646
+ "learning_rate": 1.0723215490213634e-05,
1647
+ "loss": 1.5103,
1648
+ "step": 216
1649
+ },
1650
+ {
1651
+ "epoch": 2.58,
1652
+ "learning_rate": 1.0145902956395447e-05,
1653
+ "loss": 1.3013,
1654
+ "step": 217
1655
+ },
1656
+ {
1657
+ "epoch": 2.6,
1658
+ "learning_rate": 9.583733034714981e-06,
1659
+ "loss": 1.1485,
1660
+ "step": 218
1661
+ },
1662
+ {
1663
+ "epoch": 2.61,
1664
+ "learning_rate": 9.036800464548157e-06,
1665
+ "loss": 1.4783,
1666
+ "step": 219
1667
+ },
1668
+ {
1669
+ "epoch": 2.62,
1670
+ "learning_rate": 8.505197417404687e-06,
1671
+ "loss": 1.6576,
1672
+ "step": 220
1673
+ },
1674
+ {
1675
+ "epoch": 2.62,
1676
+ "eval_loss": 1.5853140354156494,
1677
+ "eval_runtime": 2.1872,
1678
+ "eval_samples_per_second": 0.914,
1679
+ "eval_steps_per_second": 0.457,
1680
+ "step": 220
1681
+ }
1682
+ ],
1683
+ "max_steps": 252,
1684
+ "num_train_epochs": 3,
1685
+ "total_flos": 1.1096609062255819e+17,
1686
+ "trial_name": null,
1687
+ "trial_params": null
1688
+ }
checkpoint-220/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6262ba37ace7774e5d22c32c9b42a5166ed6929715b00a62cbc99ddcea368d8
3
+ size 3899
checkpoint-240/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-240/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "EleutherAI/gpt-j-6b",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-240/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c241dbdac8932bf3d055e26d180c7696f742ff7c9f77d5f0b3938e7481965f1
3
+ size 11069613
checkpoint-240/adapter_model/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-240/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "EleutherAI/gpt-j-6b",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-240/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c241dbdac8932bf3d055e26d180c7696f742ff7c9f77d5f0b3938e7481965f1
3
+ size 11069613
checkpoint-240/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9635c43f72a8fba009dc8d9aea4d5a99ed33bf003bff94b9ff3a08cad5ffa58
3
+ size 2852293
checkpoint-240/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0dfb171332ffdf5185078b01e6c5599c2aa6657e6b96cd2596b1da1f86c2d1f
3
+ size 14575
checkpoint-240/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f54f6ddf6e298e00cf08b8aba69e868580139653d74aedbca7693d8a8deb566
3
+ size 627
checkpoint-240/trainer_state.json ADDED
@@ -0,0 +1,1840 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.857142857142857,
5
+ "global_step": 240,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.01,
12
+ "learning_rate": 2e-05,
13
+ "loss": 1.6335,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.02,
18
+ "learning_rate": 4e-05,
19
+ "loss": 1.5176,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.04,
24
+ "learning_rate": 6e-05,
25
+ "loss": 1.4883,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.05,
30
+ "learning_rate": 8e-05,
31
+ "loss": 1.6,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.06,
36
+ "learning_rate": 0.0001,
37
+ "loss": 1.5088,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.06,
42
+ "eval_loss": 1.7048434019088745,
43
+ "eval_runtime": 2.1875,
44
+ "eval_samples_per_second": 0.914,
45
+ "eval_steps_per_second": 0.457,
46
+ "step": 5
47
+ },
48
+ {
49
+ "epoch": 0.07,
50
+ "learning_rate": 0.00012,
51
+ "loss": 1.4985,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.08,
56
+ "learning_rate": 0.00014,
57
+ "loss": 1.4626,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.1,
62
+ "learning_rate": 0.00016,
63
+ "loss": 1.3285,
64
+ "step": 8
65
+ },
66
+ {
67
+ "epoch": 0.11,
68
+ "learning_rate": 0.00018,
69
+ "loss": 1.6476,
70
+ "step": 9
71
+ },
72
+ {
73
+ "epoch": 0.12,
74
+ "learning_rate": 0.0002,
75
+ "loss": 1.5266,
76
+ "step": 10
77
+ },
78
+ {
79
+ "epoch": 0.12,
80
+ "eval_loss": 1.692796230316162,
81
+ "eval_runtime": 2.1867,
82
+ "eval_samples_per_second": 0.915,
83
+ "eval_steps_per_second": 0.457,
84
+ "step": 10
85
+ },
86
+ {
87
+ "epoch": 0.13,
88
+ "learning_rate": 0.0001999915737775817,
89
+ "loss": 1.6152,
90
+ "step": 11
91
+ },
92
+ {
93
+ "epoch": 0.14,
94
+ "learning_rate": 0.00019996629653035126,
95
+ "loss": 1.505,
96
+ "step": 12
97
+ },
98
+ {
99
+ "epoch": 0.15,
100
+ "learning_rate": 0.00019992417251814282,
101
+ "loss": 1.3107,
102
+ "step": 13
103
+ },
104
+ {
105
+ "epoch": 0.17,
106
+ "learning_rate": 0.00019986520883988232,
107
+ "loss": 1.3979,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.18,
112
+ "learning_rate": 0.0001997894154323911,
113
+ "loss": 1.2276,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.18,
118
+ "eval_loss": 1.6662951707839966,
119
+ "eval_runtime": 2.186,
120
+ "eval_samples_per_second": 0.915,
121
+ "eval_steps_per_second": 0.457,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.19,
126
+ "learning_rate": 0.00019969680506871137,
127
+ "loss": 1.7369,
128
+ "step": 16
129
+ },
130
+ {
131
+ "epoch": 0.2,
132
+ "learning_rate": 0.0001995873933559535,
133
+ "loss": 1.6659,
134
+ "step": 17
135
+ },
136
+ {
137
+ "epoch": 0.21,
138
+ "learning_rate": 0.00019946119873266613,
139
+ "loss": 1.1324,
140
+ "step": 18
141
+ },
142
+ {
143
+ "epoch": 0.23,
144
+ "learning_rate": 0.0001993182424657285,
145
+ "loss": 1.9695,
146
+ "step": 19
147
+ },
148
+ {
149
+ "epoch": 0.24,
150
+ "learning_rate": 0.00019915854864676664,
151
+ "loss": 2.5525,
152
+ "step": 20
153
+ },
154
+ {
155
+ "epoch": 0.24,
156
+ "eval_loss": 1.6528061628341675,
157
+ "eval_runtime": 2.1857,
158
+ "eval_samples_per_second": 0.915,
159
+ "eval_steps_per_second": 0.458,
160
+ "step": 20
161
+ },
162
+ {
163
+ "epoch": 0.25,
164
+ "learning_rate": 0.0001989821441880933,
165
+ "loss": 1.3183,
166
+ "step": 21
167
+ },
168
+ {
169
+ "epoch": 0.26,
170
+ "learning_rate": 0.00019878905881817252,
171
+ "loss": 1.5486,
172
+ "step": 22
173
+ },
174
+ {
175
+ "epoch": 0.27,
176
+ "learning_rate": 0.0001985793250766098,
177
+ "loss": 1.5504,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.29,
182
+ "learning_rate": 0.00019835297830866826,
183
+ "loss": 1.4022,
184
+ "step": 24
185
+ },
186
+ {
187
+ "epoch": 0.3,
188
+ "learning_rate": 0.00019811005665931205,
189
+ "loss": 1.4385,
190
+ "step": 25
191
+ },
192
+ {
193
+ "epoch": 0.3,
194
+ "eval_loss": 1.6313893795013428,
195
+ "eval_runtime": 2.1922,
196
+ "eval_samples_per_second": 0.912,
197
+ "eval_steps_per_second": 0.456,
198
+ "step": 25
199
+ },
200
+ {
201
+ "epoch": 0.31,
202
+ "learning_rate": 0.00019785060106677818,
203
+ "loss": 1.4413,
204
+ "step": 26
205
+ },
206
+ {
207
+ "epoch": 0.32,
208
+ "learning_rate": 0.0001975746552556772,
209
+ "loss": 1.2569,
210
+ "step": 27
211
+ },
212
+ {
213
+ "epoch": 0.33,
214
+ "learning_rate": 0.00019728226572962473,
215
+ "loss": 1.4904,
216
+ "step": 28
217
+ },
218
+ {
219
+ "epoch": 0.35,
220
+ "learning_rate": 0.0001969734817634044,
221
+ "loss": 1.5558,
222
+ "step": 29
223
+ },
224
+ {
225
+ "epoch": 0.36,
226
+ "learning_rate": 0.0001966483553946637,
227
+ "loss": 1.2282,
228
+ "step": 30
229
+ },
230
+ {
231
+ "epoch": 0.36,
232
+ "eval_loss": 1.6210927963256836,
233
+ "eval_runtime": 2.1889,
234
+ "eval_samples_per_second": 0.914,
235
+ "eval_steps_per_second": 0.457,
236
+ "step": 30
237
+ },
238
+ {
239
+ "epoch": 0.37,
240
+ "learning_rate": 0.00019630694141514464,
241
+ "loss": 1.4598,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 0.38,
246
+ "learning_rate": 0.00019594929736144976,
247
+ "loss": 1.48,
248
+ "step": 32
249
+ },
250
+ {
251
+ "epoch": 0.39,
252
+ "learning_rate": 0.0001955754835053459,
253
+ "loss": 1.3934,
254
+ "step": 33
255
+ },
256
+ {
257
+ "epoch": 0.4,
258
+ "learning_rate": 0.00019518556284360696,
259
+ "loss": 1.1312,
260
+ "step": 34
261
+ },
262
+ {
263
+ "epoch": 0.42,
264
+ "learning_rate": 0.0001947796010873974,
265
+ "loss": 1.6493,
266
+ "step": 35
267
+ },
268
+ {
269
+ "epoch": 0.42,
270
+ "eval_loss": 1.6185638904571533,
271
+ "eval_runtime": 2.1911,
272
+ "eval_samples_per_second": 0.913,
273
+ "eval_steps_per_second": 0.456,
274
+ "step": 35
275
+ },
276
+ {
277
+ "epoch": 0.43,
278
+ "learning_rate": 0.0001943576666511982,
279
+ "loss": 1.587,
280
+ "step": 36
281
+ },
282
+ {
283
+ "epoch": 0.44,
284
+ "learning_rate": 0.0001939198306412775,
285
+ "loss": 1.5798,
286
+ "step": 37
287
+ },
288
+ {
289
+ "epoch": 0.45,
290
+ "learning_rate": 0.0001934661668437073,
291
+ "loss": 1.4308,
292
+ "step": 38
293
+ },
294
+ {
295
+ "epoch": 0.46,
296
+ "learning_rate": 0.0001929967517119289,
297
+ "loss": 1.0766,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 0.48,
302
+ "learning_rate": 0.0001925116643538684,
303
+ "loss": 2.082,
304
+ "step": 40
305
+ },
306
+ {
307
+ "epoch": 0.48,
308
+ "eval_loss": 1.633681297302246,
309
+ "eval_runtime": 2.1862,
310
+ "eval_samples_per_second": 0.915,
311
+ "eval_steps_per_second": 0.457,
312
+ "step": 40
313
+ },
314
+ {
315
+ "epoch": 0.49,
316
+ "learning_rate": 0.0001920109865186052,
317
+ "loss": 1.8061,
318
+ "step": 41
319
+ },
320
+ {
321
+ "epoch": 0.5,
322
+ "learning_rate": 0.00019149480258259533,
323
+ "loss": 1.4312,
324
+ "step": 42
325
+ },
326
+ {
327
+ "epoch": 0.51,
328
+ "learning_rate": 0.00019096319953545185,
329
+ "loss": 1.737,
330
+ "step": 43
331
+ },
332
+ {
333
+ "epoch": 0.52,
334
+ "learning_rate": 0.00019041626696528503,
335
+ "loss": 1.5035,
336
+ "step": 44
337
+ },
338
+ {
339
+ "epoch": 0.54,
340
+ "learning_rate": 0.00018985409704360456,
341
+ "loss": 1.4689,
342
+ "step": 45
343
+ },
344
+ {
345
+ "epoch": 0.54,
346
+ "eval_loss": 1.6150808334350586,
347
+ "eval_runtime": 2.1952,
348
+ "eval_samples_per_second": 0.911,
349
+ "eval_steps_per_second": 0.456,
350
+ "step": 45
351
+ },
352
+ {
353
+ "epoch": 0.55,
354
+ "learning_rate": 0.0001892767845097864,
355
+ "loss": 1.2483,
356
+ "step": 46
357
+ },
358
+ {
359
+ "epoch": 0.56,
360
+ "learning_rate": 0.00018868442665510678,
361
+ "loss": 1.1436,
362
+ "step": 47
363
+ },
364
+ {
365
+ "epoch": 0.57,
366
+ "learning_rate": 0.00018807712330634642,
367
+ "loss": 1.0488,
368
+ "step": 48
369
+ },
370
+ {
371
+ "epoch": 0.58,
372
+ "learning_rate": 0.00018745497680896722,
373
+ "loss": 1.3745,
374
+ "step": 49
375
+ },
376
+ {
377
+ "epoch": 0.6,
378
+ "learning_rate": 0.0001868180920098644,
379
+ "loss": 0.9061,
380
+ "step": 50
381
+ },
382
+ {
383
+ "epoch": 0.6,
384
+ "eval_loss": 1.6097811460494995,
385
+ "eval_runtime": 2.1875,
386
+ "eval_samples_per_second": 0.914,
387
+ "eval_steps_per_second": 0.457,
388
+ "step": 50
389
+ },
390
+ {
391
+ "epoch": 0.61,
392
+ "learning_rate": 0.0001861665762396974,
393
+ "loss": 1.1305,
394
+ "step": 51
395
+ },
396
+ {
397
+ "epoch": 0.62,
398
+ "learning_rate": 0.00018550053929480202,
399
+ "loss": 1.2315,
400
+ "step": 52
401
+ },
402
+ {
403
+ "epoch": 0.63,
404
+ "learning_rate": 0.00018482009341868697,
405
+ "loss": 1.4964,
406
+ "step": 53
407
+ },
408
+ {
409
+ "epoch": 0.64,
410
+ "learning_rate": 0.00018412535328311814,
411
+ "loss": 1.0928,
412
+ "step": 54
413
+ },
414
+ {
415
+ "epoch": 0.65,
416
+ "learning_rate": 0.00018341643596879367,
417
+ "loss": 0.9473,
418
+ "step": 55
419
+ },
420
+ {
421
+ "epoch": 0.65,
422
+ "eval_loss": 1.6084190607070923,
423
+ "eval_runtime": 2.1902,
424
+ "eval_samples_per_second": 0.913,
425
+ "eval_steps_per_second": 0.457,
426
+ "step": 55
427
+ },
428
+ {
429
+ "epoch": 0.67,
430
+ "learning_rate": 0.0001826934609456129,
431
+ "loss": 1.362,
432
+ "step": 56
433
+ },
434
+ {
435
+ "epoch": 0.68,
436
+ "learning_rate": 0.00018195655005254273,
437
+ "loss": 1.5478,
438
+ "step": 57
439
+ },
440
+ {
441
+ "epoch": 0.69,
442
+ "learning_rate": 0.00018120582747708502,
443
+ "loss": 1.4831,
444
+ "step": 58
445
+ },
446
+ {
447
+ "epoch": 0.7,
448
+ "learning_rate": 0.00018044141973434758,
449
+ "loss": 1.7483,
450
+ "step": 59
451
+ },
452
+ {
453
+ "epoch": 0.71,
454
+ "learning_rate": 0.0001796634556457236,
455
+ "loss": 1.4993,
456
+ "step": 60
457
+ },
458
+ {
459
+ "epoch": 0.71,
460
+ "eval_loss": 1.6235202550888062,
461
+ "eval_runtime": 2.1859,
462
+ "eval_samples_per_second": 0.915,
463
+ "eval_steps_per_second": 0.457,
464
+ "step": 60
465
+ },
466
+ {
467
+ "epoch": 0.73,
468
+ "learning_rate": 0.00017887206631718203,
469
+ "loss": 1.5076,
470
+ "step": 61
471
+ },
472
+ {
473
+ "epoch": 0.74,
474
+ "learning_rate": 0.0001780673851171728,
475
+ "loss": 1.6395,
476
+ "step": 62
477
+ },
478
+ {
479
+ "epoch": 0.75,
480
+ "learning_rate": 0.00017724954765415137,
481
+ "loss": 1.6389,
482
+ "step": 63
483
+ },
484
+ {
485
+ "epoch": 0.76,
486
+ "learning_rate": 0.00017641869175372493,
487
+ "loss": 1.7769,
488
+ "step": 64
489
+ },
490
+ {
491
+ "epoch": 0.77,
492
+ "learning_rate": 0.00017557495743542585,
493
+ "loss": 1.2022,
494
+ "step": 65
495
+ },
496
+ {
497
+ "epoch": 0.77,
498
+ "eval_loss": 1.6078369617462158,
499
+ "eval_runtime": 2.1883,
500
+ "eval_samples_per_second": 0.914,
501
+ "eval_steps_per_second": 0.457,
502
+ "step": 65
503
+ },
504
+ {
505
+ "epoch": 0.79,
506
+ "learning_rate": 0.00017471848688911464,
507
+ "loss": 1.5265,
508
+ "step": 66
509
+ },
510
+ {
511
+ "epoch": 0.8,
512
+ "learning_rate": 0.00017384942445101772,
513
+ "loss": 1.4065,
514
+ "step": 67
515
+ },
516
+ {
517
+ "epoch": 0.81,
518
+ "learning_rate": 0.000172967916579403,
519
+ "loss": 1.4326,
520
+ "step": 68
521
+ },
522
+ {
523
+ "epoch": 0.82,
524
+ "learning_rate": 0.00017207411182989832,
525
+ "loss": 1.571,
526
+ "step": 69
527
+ },
528
+ {
529
+ "epoch": 0.83,
530
+ "learning_rate": 0.00017116816083045602,
531
+ "loss": 1.5233,
532
+ "step": 70
533
+ },
534
+ {
535
+ "epoch": 0.83,
536
+ "eval_loss": 1.6035966873168945,
537
+ "eval_runtime": 2.1894,
538
+ "eval_samples_per_second": 0.914,
539
+ "eval_steps_per_second": 0.457,
540
+ "step": 70
541
+ },
542
+ {
543
+ "epoch": 0.85,
544
+ "learning_rate": 0.00017025021625596853,
545
+ "loss": 1.5745,
546
+ "step": 71
547
+ },
548
+ {
549
+ "epoch": 0.86,
550
+ "learning_rate": 0.0001693204328025389,
551
+ "loss": 1.608,
552
+ "step": 72
553
+ },
554
+ {
555
+ "epoch": 0.87,
556
+ "learning_rate": 0.0001683789671614107,
557
+ "loss": 1.4234,
558
+ "step": 73
559
+ },
560
+ {
561
+ "epoch": 0.88,
562
+ "learning_rate": 0.00016742597799256182,
563
+ "loss": 1.2839,
564
+ "step": 74
565
+ },
566
+ {
567
+ "epoch": 0.89,
568
+ "learning_rate": 0.00016646162589796615,
569
+ "loss": 1.3248,
570
+ "step": 75
571
+ },
572
+ {
573
+ "epoch": 0.89,
574
+ "eval_loss": 1.6052225828170776,
575
+ "eval_runtime": 2.1914,
576
+ "eval_samples_per_second": 0.913,
577
+ "eval_steps_per_second": 0.456,
578
+ "step": 75
579
+ },
580
+ {
581
+ "epoch": 0.9,
582
+ "learning_rate": 0.00016548607339452853,
583
+ "loss": 1.0683,
584
+ "step": 76
585
+ },
586
+ {
587
+ "epoch": 0.92,
588
+ "learning_rate": 0.00016449948488669639,
589
+ "loss": 1.5298,
590
+ "step": 77
591
+ },
592
+ {
593
+ "epoch": 0.93,
594
+ "learning_rate": 0.00016350202663875386,
595
+ "loss": 1.5696,
596
+ "step": 78
597
+ },
598
+ {
599
+ "epoch": 0.94,
600
+ "learning_rate": 0.00016249386674680184,
601
+ "loss": 1.1743,
602
+ "step": 79
603
+ },
604
+ {
605
+ "epoch": 0.95,
606
+ "learning_rate": 0.0001614751751104301,
607
+ "loss": 1.8626,
608
+ "step": 80
609
+ },
610
+ {
611
+ "epoch": 0.95,
612
+ "eval_loss": 1.6131467819213867,
613
+ "eval_runtime": 2.1907,
614
+ "eval_samples_per_second": 0.913,
615
+ "eval_steps_per_second": 0.456,
616
+ "step": 80
617
+ },
618
+ {
619
+ "epoch": 0.96,
620
+ "learning_rate": 0.00016044612340408466,
621
+ "loss": 1.4832,
622
+ "step": 81
623
+ },
624
+ {
625
+ "epoch": 0.98,
626
+ "learning_rate": 0.00015940688504813662,
627
+ "loss": 1.4476,
628
+ "step": 82
629
+ },
630
+ {
631
+ "epoch": 0.99,
632
+ "learning_rate": 0.00015835763517965673,
633
+ "loss": 1.3783,
634
+ "step": 83
635
+ },
636
+ {
637
+ "epoch": 1.0,
638
+ "learning_rate": 0.00015729855062290022,
639
+ "loss": 1.6671,
640
+ "step": 84
641
+ },
642
+ {
643
+ "epoch": 1.01,
644
+ "learning_rate": 0.0001562298098595078,
645
+ "loss": 1.4658,
646
+ "step": 85
647
+ },
648
+ {
649
+ "epoch": 1.01,
650
+ "eval_loss": 1.6062182188034058,
651
+ "eval_runtime": 2.1944,
652
+ "eval_samples_per_second": 0.911,
653
+ "eval_steps_per_second": 0.456,
654
+ "step": 85
655
+ },
656
+ {
657
+ "epoch": 1.02,
658
+ "learning_rate": 0.00015515159299842707,
659
+ "loss": 1.64,
660
+ "step": 86
661
+ },
662
+ {
663
+ "epoch": 1.04,
664
+ "learning_rate": 0.00015406408174555976,
665
+ "loss": 1.2125,
666
+ "step": 87
667
+ },
668
+ {
669
+ "epoch": 1.05,
670
+ "learning_rate": 0.00015296745937313987,
671
+ "loss": 1.5001,
672
+ "step": 88
673
+ },
674
+ {
675
+ "epoch": 1.06,
676
+ "learning_rate": 0.00015186191068884775,
677
+ "loss": 1.4294,
678
+ "step": 89
679
+ },
680
+ {
681
+ "epoch": 1.07,
682
+ "learning_rate": 0.00015074762200466556,
683
+ "loss": 1.3162,
684
+ "step": 90
685
+ },
686
+ {
687
+ "epoch": 1.07,
688
+ "eval_loss": 1.5980761051177979,
689
+ "eval_runtime": 2.191,
690
+ "eval_samples_per_second": 0.913,
691
+ "eval_steps_per_second": 0.456,
692
+ "step": 90
693
+ },
694
+ {
695
+ "epoch": 1.08,
696
+ "learning_rate": 0.00014962478110547918,
697
+ "loss": 1.3707,
698
+ "step": 91
699
+ },
700
+ {
701
+ "epoch": 1.1,
702
+ "learning_rate": 0.00014849357721743168,
703
+ "loss": 1.4644,
704
+ "step": 92
705
+ },
706
+ {
707
+ "epoch": 1.11,
708
+ "learning_rate": 0.0001473542009760343,
709
+ "loss": 1.427,
710
+ "step": 93
711
+ },
712
+ {
713
+ "epoch": 1.12,
714
+ "learning_rate": 0.00014620684439403962,
715
+ "loss": 1.3337,
716
+ "step": 94
717
+ },
718
+ {
719
+ "epoch": 1.13,
720
+ "learning_rate": 0.0001450517008290827,
721
+ "loss": 1.4111,
722
+ "step": 95
723
+ },
724
+ {
725
+ "epoch": 1.13,
726
+ "eval_loss": 1.5972555875778198,
727
+ "eval_runtime": 2.1901,
728
+ "eval_samples_per_second": 0.913,
729
+ "eval_steps_per_second": 0.457,
730
+ "step": 95
731
+ },
732
+ {
733
+ "epoch": 1.14,
734
+ "learning_rate": 0.0001438889649510956,
735
+ "loss": 1.441,
736
+ "step": 96
737
+ },
738
+ {
739
+ "epoch": 1.15,
740
+ "learning_rate": 0.00014271883270950073,
741
+ "loss": 0.7424,
742
+ "step": 97
743
+ },
744
+ {
745
+ "epoch": 1.17,
746
+ "learning_rate": 0.00014154150130018866,
747
+ "loss": 1.3582,
748
+ "step": 98
749
+ },
750
+ {
751
+ "epoch": 1.18,
752
+ "learning_rate": 0.00014035716913228568,
753
+ "loss": 1.3479,
754
+ "step": 99
755
+ },
756
+ {
757
+ "epoch": 1.19,
758
+ "learning_rate": 0.00013916603579471705,
759
+ "loss": 1.2211,
760
+ "step": 100
761
+ },
762
+ {
763
+ "epoch": 1.19,
764
+ "eval_loss": 1.6037462949752808,
765
+ "eval_runtime": 2.1932,
766
+ "eval_samples_per_second": 0.912,
767
+ "eval_steps_per_second": 0.456,
768
+ "step": 100
769
+ },
770
+ {
771
+ "epoch": 1.2,
772
+ "learning_rate": 0.0001379683020225714,
773
+ "loss": 2.0387,
774
+ "step": 101
775
+ },
776
+ {
777
+ "epoch": 1.21,
778
+ "learning_rate": 0.000136764169663272,
779
+ "loss": 1.3237,
780
+ "step": 102
781
+ },
782
+ {
783
+ "epoch": 1.23,
784
+ "learning_rate": 0.00013555384164256048,
785
+ "loss": 1.4286,
786
+ "step": 103
787
+ },
788
+ {
789
+ "epoch": 1.24,
790
+ "learning_rate": 0.00013433752193029886,
791
+ "loss": 1.7905,
792
+ "step": 104
793
+ },
794
+ {
795
+ "epoch": 1.25,
796
+ "learning_rate": 0.00013311541550609565,
797
+ "loss": 1.7277,
798
+ "step": 105
799
+ },
800
+ {
801
+ "epoch": 1.25,
802
+ "eval_loss": 1.5989317893981934,
803
+ "eval_runtime": 2.1896,
804
+ "eval_samples_per_second": 0.913,
805
+ "eval_steps_per_second": 0.457,
806
+ "step": 105
807
+ },
808
+ {
809
+ "epoch": 1.26,
810
+ "learning_rate": 0.00013188772832476188,
811
+ "loss": 1.5016,
812
+ "step": 106
813
+ },
814
+ {
815
+ "epoch": 1.27,
816
+ "learning_rate": 0.00013065466728160252,
817
+ "loss": 1.7159,
818
+ "step": 107
819
+ },
820
+ {
821
+ "epoch": 1.29,
822
+ "learning_rate": 0.00012941644017754964,
823
+ "loss": 1.2701,
824
+ "step": 108
825
+ },
826
+ {
827
+ "epoch": 1.3,
828
+ "learning_rate": 0.00012817325568414297,
829
+ "loss": 1.4085,
830
+ "step": 109
831
+ },
832
+ {
833
+ "epoch": 1.31,
834
+ "learning_rate": 0.00012692532330836346,
835
+ "loss": 1.246,
836
+ "step": 110
837
+ },
838
+ {
839
+ "epoch": 1.31,
840
+ "eval_loss": 1.597010850906372,
841
+ "eval_runtime": 2.1881,
842
+ "eval_samples_per_second": 0.914,
843
+ "eval_steps_per_second": 0.457,
844
+ "step": 110
845
+ },
846
+ {
847
+ "epoch": 1.32,
848
+ "learning_rate": 0.00012567285335732633,
849
+ "loss": 1.3382,
850
+ "step": 111
851
+ },
852
+ {
853
+ "epoch": 1.33,
854
+ "learning_rate": 0.00012441605690283915,
855
+ "loss": 0.9305,
856
+ "step": 112
857
+ },
858
+ {
859
+ "epoch": 1.35,
860
+ "learning_rate": 0.00012315514574583113,
861
+ "loss": 1.388,
862
+ "step": 113
863
+ },
864
+ {
865
+ "epoch": 1.36,
866
+ "learning_rate": 0.0001218903323806595,
867
+ "loss": 1.2634,
868
+ "step": 114
869
+ },
870
+ {
871
+ "epoch": 1.37,
872
+ "learning_rate": 0.00012062182995929882,
873
+ "loss": 1.1971,
874
+ "step": 115
875
+ },
876
+ {
877
+ "epoch": 1.37,
878
+ "eval_loss": 1.5930073261260986,
879
+ "eval_runtime": 2.1881,
880
+ "eval_samples_per_second": 0.914,
881
+ "eval_steps_per_second": 0.457,
882
+ "step": 115
883
+ },
884
+ {
885
+ "epoch": 1.38,
886
+ "learning_rate": 0.00011934985225541998,
887
+ "loss": 1.2645,
888
+ "step": 116
889
+ },
890
+ {
891
+ "epoch": 1.39,
892
+ "learning_rate": 0.0001180746136283638,
893
+ "loss": 1.6775,
894
+ "step": 117
895
+ },
896
+ {
897
+ "epoch": 1.4,
898
+ "learning_rate": 0.00011679632898701649,
899
+ "loss": 1.018,
900
+ "step": 118
901
+ },
902
+ {
903
+ "epoch": 1.42,
904
+ "learning_rate": 0.00011551521375359206,
905
+ "loss": 1.225,
906
+ "step": 119
907
+ },
908
+ {
909
+ "epoch": 1.43,
910
+ "learning_rate": 0.00011423148382732853,
911
+ "loss": 1.166,
912
+ "step": 120
913
+ },
914
+ {
915
+ "epoch": 1.43,
916
+ "eval_loss": 1.593321681022644,
917
+ "eval_runtime": 2.1858,
918
+ "eval_samples_per_second": 0.915,
919
+ "eval_steps_per_second": 0.457,
920
+ "step": 120
921
+ },
922
+ {
923
+ "epoch": 1.44,
924
+ "learning_rate": 0.00011294535554810354,
925
+ "loss": 1.7995,
926
+ "step": 121
927
+ },
928
+ {
929
+ "epoch": 1.45,
930
+ "learning_rate": 0.00011165704565997593,
931
+ "loss": 0.7254,
932
+ "step": 122
933
+ },
934
+ {
935
+ "epoch": 1.46,
936
+ "learning_rate": 0.00011036677127465889,
937
+ "loss": 1.4558,
938
+ "step": 123
939
+ },
940
+ {
941
+ "epoch": 1.48,
942
+ "learning_rate": 0.00010907474983493144,
943
+ "loss": 1.5358,
944
+ "step": 124
945
+ },
946
+ {
947
+ "epoch": 1.49,
948
+ "learning_rate": 0.00010778119907799398,
949
+ "loss": 1.5007,
950
+ "step": 125
951
+ },
952
+ {
953
+ "epoch": 1.49,
954
+ "eval_loss": 1.5938643217086792,
955
+ "eval_runtime": 2.189,
956
+ "eval_samples_per_second": 0.914,
957
+ "eval_steps_per_second": 0.457,
958
+ "step": 125
959
+ },
960
+ {
961
+ "epoch": 1.5,
962
+ "learning_rate": 0.0001064863369987743,
963
+ "loss": 1.6357,
964
+ "step": 126
965
+ },
966
+ {
967
+ "epoch": 1.51,
968
+ "learning_rate": 0.00010519038181318999,
969
+ "loss": 1.7524,
970
+ "step": 127
971
+ },
972
+ {
973
+ "epoch": 1.52,
974
+ "learning_rate": 0.00010389355192137377,
975
+ "loss": 1.6955,
976
+ "step": 128
977
+ },
978
+ {
979
+ "epoch": 1.54,
980
+ "learning_rate": 0.00010259606587086783,
981
+ "loss": 1.4174,
982
+ "step": 129
983
+ },
984
+ {
985
+ "epoch": 1.55,
986
+ "learning_rate": 0.0001012981423197931,
987
+ "loss": 1.2135,
988
+ "step": 130
989
+ },
990
+ {
991
+ "epoch": 1.55,
992
+ "eval_loss": 1.5910111665725708,
993
+ "eval_runtime": 2.1873,
994
+ "eval_samples_per_second": 0.914,
995
+ "eval_steps_per_second": 0.457,
996
+ "step": 130
997
+ },
998
+ {
999
+ "epoch": 1.56,
1000
+ "learning_rate": 0.0001,
1001
+ "loss": 1.0919,
1002
+ "step": 131
1003
+ },
1004
+ {
1005
+ "epoch": 1.57,
1006
+ "learning_rate": 9.870185768020693e-05,
1007
+ "loss": 1.4658,
1008
+ "step": 132
1009
+ },
1010
+ {
1011
+ "epoch": 1.58,
1012
+ "learning_rate": 9.740393412913219e-05,
1013
+ "loss": 1.1472,
1014
+ "step": 133
1015
+ },
1016
+ {
1017
+ "epoch": 1.6,
1018
+ "learning_rate": 9.610644807862625e-05,
1019
+ "loss": 1.2626,
1020
+ "step": 134
1021
+ },
1022
+ {
1023
+ "epoch": 1.61,
1024
+ "learning_rate": 9.480961818681004e-05,
1025
+ "loss": 1.3915,
1026
+ "step": 135
1027
+ },
1028
+ {
1029
+ "epoch": 1.61,
1030
+ "eval_loss": 1.5905121564865112,
1031
+ "eval_runtime": 2.1919,
1032
+ "eval_samples_per_second": 0.912,
1033
+ "eval_steps_per_second": 0.456,
1034
+ "step": 135
1035
+ },
1036
+ {
1037
+ "epoch": 1.62,
1038
+ "learning_rate": 9.35136630012257e-05,
1039
+ "loss": 1.8036,
1040
+ "step": 136
1041
+ },
1042
+ {
1043
+ "epoch": 1.63,
1044
+ "learning_rate": 9.221880092200601e-05,
1045
+ "loss": 1.1988,
1046
+ "step": 137
1047
+ },
1048
+ {
1049
+ "epoch": 1.64,
1050
+ "learning_rate": 9.092525016506858e-05,
1051
+ "loss": 1.1454,
1052
+ "step": 138
1053
+ },
1054
+ {
1055
+ "epoch": 1.65,
1056
+ "learning_rate": 8.963322872534114e-05,
1057
+ "loss": 1.3185,
1058
+ "step": 139
1059
+ },
1060
+ {
1061
+ "epoch": 1.67,
1062
+ "learning_rate": 8.83429543400241e-05,
1063
+ "loss": 1.6912,
1064
+ "step": 140
1065
+ },
1066
+ {
1067
+ "epoch": 1.67,
1068
+ "eval_loss": 1.5902668237686157,
1069
+ "eval_runtime": 2.1897,
1070
+ "eval_samples_per_second": 0.913,
1071
+ "eval_steps_per_second": 0.457,
1072
+ "step": 140
1073
+ },
1074
+ {
1075
+ "epoch": 1.68,
1076
+ "learning_rate": 8.705464445189647e-05,
1077
+ "loss": 1.6251,
1078
+ "step": 141
1079
+ },
1080
+ {
1081
+ "epoch": 1.69,
1082
+ "learning_rate": 8.57685161726715e-05,
1083
+ "loss": 1.4459,
1084
+ "step": 142
1085
+ },
1086
+ {
1087
+ "epoch": 1.7,
1088
+ "learning_rate": 8.448478624640797e-05,
1089
+ "loss": 1.3483,
1090
+ "step": 143
1091
+ },
1092
+ {
1093
+ "epoch": 1.71,
1094
+ "learning_rate": 8.320367101298351e-05,
1095
+ "loss": 1.7937,
1096
+ "step": 144
1097
+ },
1098
+ {
1099
+ "epoch": 1.73,
1100
+ "learning_rate": 8.192538637163621e-05,
1101
+ "loss": 1.6808,
1102
+ "step": 145
1103
+ },
1104
+ {
1105
+ "epoch": 1.73,
1106
+ "eval_loss": 1.587677240371704,
1107
+ "eval_runtime": 2.1912,
1108
+ "eval_samples_per_second": 0.913,
1109
+ "eval_steps_per_second": 0.456,
1110
+ "step": 145
1111
+ },
1112
+ {
1113
+ "epoch": 1.74,
1114
+ "learning_rate": 8.065014774458003e-05,
1115
+ "loss": 1.453,
1116
+ "step": 146
1117
+ },
1118
+ {
1119
+ "epoch": 1.75,
1120
+ "learning_rate": 7.93781700407012e-05,
1121
+ "loss": 1.3279,
1122
+ "step": 147
1123
+ },
1124
+ {
1125
+ "epoch": 1.76,
1126
+ "learning_rate": 7.810966761934053e-05,
1127
+ "loss": 1.6721,
1128
+ "step": 148
1129
+ },
1130
+ {
1131
+ "epoch": 1.77,
1132
+ "learning_rate": 7.684485425416888e-05,
1133
+ "loss": 1.1307,
1134
+ "step": 149
1135
+ },
1136
+ {
1137
+ "epoch": 1.79,
1138
+ "learning_rate": 7.558394309716088e-05,
1139
+ "loss": 1.249,
1140
+ "step": 150
1141
+ },
1142
+ {
1143
+ "epoch": 1.79,
1144
+ "eval_loss": 1.5859589576721191,
1145
+ "eval_runtime": 2.1868,
1146
+ "eval_samples_per_second": 0.915,
1147
+ "eval_steps_per_second": 0.457,
1148
+ "step": 150
1149
+ },
1150
+ {
1151
+ "epoch": 1.8,
1152
+ "learning_rate": 7.432714664267373e-05,
1153
+ "loss": 1.1872,
1154
+ "step": 151
1155
+ },
1156
+ {
1157
+ "epoch": 1.81,
1158
+ "learning_rate": 7.307467669163655e-05,
1159
+ "loss": 1.4116,
1160
+ "step": 152
1161
+ },
1162
+ {
1163
+ "epoch": 1.82,
1164
+ "learning_rate": 7.182674431585704e-05,
1165
+ "loss": 1.2309,
1166
+ "step": 153
1167
+ },
1168
+ {
1169
+ "epoch": 1.83,
1170
+ "learning_rate": 7.058355982245037e-05,
1171
+ "loss": 1.3953,
1172
+ "step": 154
1173
+ },
1174
+ {
1175
+ "epoch": 1.85,
1176
+ "learning_rate": 6.934533271839752e-05,
1177
+ "loss": 1.43,
1178
+ "step": 155
1179
+ },
1180
+ {
1181
+ "epoch": 1.85,
1182
+ "eval_loss": 1.5868343114852905,
1183
+ "eval_runtime": 2.1915,
1184
+ "eval_samples_per_second": 0.913,
1185
+ "eval_steps_per_second": 0.456,
1186
+ "step": 155
1187
+ },
1188
+ {
1189
+ "epoch": 1.86,
1190
+ "learning_rate": 6.811227167523815e-05,
1191
+ "loss": 1.9049,
1192
+ "step": 156
1193
+ },
1194
+ {
1195
+ "epoch": 1.87,
1196
+ "learning_rate": 6.688458449390437e-05,
1197
+ "loss": 0.8853,
1198
+ "step": 157
1199
+ },
1200
+ {
1201
+ "epoch": 1.88,
1202
+ "learning_rate": 6.566247806970119e-05,
1203
+ "loss": 1.6253,
1204
+ "step": 158
1205
+ },
1206
+ {
1207
+ "epoch": 1.89,
1208
+ "learning_rate": 6.444615835743955e-05,
1209
+ "loss": 1.3031,
1210
+ "step": 159
1211
+ },
1212
+ {
1213
+ "epoch": 1.9,
1214
+ "learning_rate": 6.323583033672799e-05,
1215
+ "loss": 0.8793,
1216
+ "step": 160
1217
+ },
1218
+ {
1219
+ "epoch": 1.9,
1220
+ "eval_loss": 1.5895787477493286,
1221
+ "eval_runtime": 2.1923,
1222
+ "eval_samples_per_second": 0.912,
1223
+ "eval_steps_per_second": 0.456,
1224
+ "step": 160
1225
+ },
1226
+ {
1227
+ "epoch": 1.92,
1228
+ "learning_rate": 6.203169797742861e-05,
1229
+ "loss": 1.3793,
1230
+ "step": 161
1231
+ },
1232
+ {
1233
+ "epoch": 1.93,
1234
+ "learning_rate": 6.083396420528298e-05,
1235
+ "loss": 1.5299,
1236
+ "step": 162
1237
+ },
1238
+ {
1239
+ "epoch": 1.94,
1240
+ "learning_rate": 5.964283086771435e-05,
1241
+ "loss": 1.3525,
1242
+ "step": 163
1243
+ },
1244
+ {
1245
+ "epoch": 1.95,
1246
+ "learning_rate": 5.845849869981137e-05,
1247
+ "loss": 1.4941,
1248
+ "step": 164
1249
+ },
1250
+ {
1251
+ "epoch": 1.96,
1252
+ "learning_rate": 5.728116729049928e-05,
1253
+ "loss": 1.1564,
1254
+ "step": 165
1255
+ },
1256
+ {
1257
+ "epoch": 1.96,
1258
+ "eval_loss": 1.5867228507995605,
1259
+ "eval_runtime": 2.1914,
1260
+ "eval_samples_per_second": 0.913,
1261
+ "eval_steps_per_second": 0.456,
1262
+ "step": 165
1263
+ },
1264
+ {
1265
+ "epoch": 1.98,
1266
+ "learning_rate": 5.611103504890444e-05,
1267
+ "loss": 1.5568,
1268
+ "step": 166
1269
+ },
1270
+ {
1271
+ "epoch": 1.99,
1272
+ "learning_rate": 5.4948299170917325e-05,
1273
+ "loss": 1.2441,
1274
+ "step": 167
1275
+ },
1276
+ {
1277
+ "epoch": 2.0,
1278
+ "learning_rate": 5.379315560596038e-05,
1279
+ "loss": 0.9717,
1280
+ "step": 168
1281
+ },
1282
+ {
1283
+ "epoch": 2.01,
1284
+ "learning_rate": 5.26457990239657e-05,
1285
+ "loss": 1.5905,
1286
+ "step": 169
1287
+ },
1288
+ {
1289
+ "epoch": 2.02,
1290
+ "learning_rate": 5.1506422782568345e-05,
1291
+ "loss": 1.4259,
1292
+ "step": 170
1293
+ },
1294
+ {
1295
+ "epoch": 2.02,
1296
+ "eval_loss": 1.5872297286987305,
1297
+ "eval_runtime": 2.1903,
1298
+ "eval_samples_per_second": 0.913,
1299
+ "eval_steps_per_second": 0.457,
1300
+ "step": 170
1301
+ },
1302
+ {
1303
+ "epoch": 2.04,
1304
+ "learning_rate": 5.0375218894520834e-05,
1305
+ "loss": 1.4877,
1306
+ "step": 171
1307
+ },
1308
+ {
1309
+ "epoch": 2.05,
1310
+ "learning_rate": 4.9252377995334444e-05,
1311
+ "loss": 1.4578,
1312
+ "step": 172
1313
+ },
1314
+ {
1315
+ "epoch": 2.06,
1316
+ "learning_rate": 4.813808931115228e-05,
1317
+ "loss": 1.0967,
1318
+ "step": 173
1319
+ },
1320
+ {
1321
+ "epoch": 2.07,
1322
+ "learning_rate": 4.703254062686017e-05,
1323
+ "loss": 1.3642,
1324
+ "step": 174
1325
+ },
1326
+ {
1327
+ "epoch": 2.08,
1328
+ "learning_rate": 4.593591825444028e-05,
1329
+ "loss": 1.4059,
1330
+ "step": 175
1331
+ },
1332
+ {
1333
+ "epoch": 2.08,
1334
+ "eval_loss": 1.5853421688079834,
1335
+ "eval_runtime": 2.1875,
1336
+ "eval_samples_per_second": 0.914,
1337
+ "eval_steps_per_second": 0.457,
1338
+ "step": 175
1339
+ },
1340
+ {
1341
+ "epoch": 2.1,
1342
+ "learning_rate": 4.484840700157295e-05,
1343
+ "loss": 1.3578,
1344
+ "step": 176
1345
+ },
1346
+ {
1347
+ "epoch": 2.11,
1348
+ "learning_rate": 4.377019014049223e-05,
1349
+ "loss": 1.178,
1350
+ "step": 177
1351
+ },
1352
+ {
1353
+ "epoch": 2.12,
1354
+ "learning_rate": 4.270144937709981e-05,
1355
+ "loss": 1.6276,
1356
+ "step": 178
1357
+ },
1358
+ {
1359
+ "epoch": 2.13,
1360
+ "learning_rate": 4.164236482034327e-05,
1361
+ "loss": 1.5173,
1362
+ "step": 179
1363
+ },
1364
+ {
1365
+ "epoch": 2.14,
1366
+ "learning_rate": 4.059311495186338e-05,
1367
+ "loss": 1.3487,
1368
+ "step": 180
1369
+ },
1370
+ {
1371
+ "epoch": 2.14,
1372
+ "eval_loss": 1.5867466926574707,
1373
+ "eval_runtime": 2.1894,
1374
+ "eval_samples_per_second": 0.913,
1375
+ "eval_steps_per_second": 0.457,
1376
+ "step": 180
1377
+ },
1378
+ {
1379
+ "epoch": 2.15,
1380
+ "learning_rate": 3.9553876595915375e-05,
1381
+ "loss": 1.0898,
1382
+ "step": 181
1383
+ },
1384
+ {
1385
+ "epoch": 2.17,
1386
+ "learning_rate": 3.852482488956992e-05,
1387
+ "loss": 0.8375,
1388
+ "step": 182
1389
+ },
1390
+ {
1391
+ "epoch": 2.18,
1392
+ "learning_rate": 3.750613325319817e-05,
1393
+ "loss": 1.1532,
1394
+ "step": 183
1395
+ },
1396
+ {
1397
+ "epoch": 2.19,
1398
+ "learning_rate": 3.649797336124615e-05,
1399
+ "loss": 1.603,
1400
+ "step": 184
1401
+ },
1402
+ {
1403
+ "epoch": 2.2,
1404
+ "learning_rate": 3.550051511330361e-05,
1405
+ "loss": 1.7306,
1406
+ "step": 185
1407
+ },
1408
+ {
1409
+ "epoch": 2.2,
1410
+ "eval_loss": 1.5883917808532715,
1411
+ "eval_runtime": 2.189,
1412
+ "eval_samples_per_second": 0.914,
1413
+ "eval_steps_per_second": 0.457,
1414
+ "step": 185
1415
+ },
1416
+ {
1417
+ "epoch": 2.21,
1418
+ "learning_rate": 3.45139266054715e-05,
1419
+ "loss": 1.4042,
1420
+ "step": 186
1421
+ },
1422
+ {
1423
+ "epoch": 2.23,
1424
+ "learning_rate": 3.3538374102033866e-05,
1425
+ "loss": 1.1013,
1426
+ "step": 187
1427
+ },
1428
+ {
1429
+ "epoch": 2.24,
1430
+ "learning_rate": 3.257402200743821e-05,
1431
+ "loss": 1.1465,
1432
+ "step": 188
1433
+ },
1434
+ {
1435
+ "epoch": 2.25,
1436
+ "learning_rate": 3.1621032838589305e-05,
1437
+ "loss": 1.7603,
1438
+ "step": 189
1439
+ },
1440
+ {
1441
+ "epoch": 2.26,
1442
+ "learning_rate": 3.0679567197461134e-05,
1443
+ "loss": 1.6117,
1444
+ "step": 190
1445
+ },
1446
+ {
1447
+ "epoch": 2.26,
1448
+ "eval_loss": 1.5858465433120728,
1449
+ "eval_runtime": 2.1875,
1450
+ "eval_samples_per_second": 0.914,
1451
+ "eval_steps_per_second": 0.457,
1452
+ "step": 190
1453
+ },
1454
+ {
1455
+ "epoch": 2.27,
1456
+ "learning_rate": 2.974978374403147e-05,
1457
+ "loss": 1.4448,
1458
+ "step": 191
1459
+ },
1460
+ {
1461
+ "epoch": 2.29,
1462
+ "learning_rate": 2.8831839169543996e-05,
1463
+ "loss": 1.2446,
1464
+ "step": 192
1465
+ },
1466
+ {
1467
+ "epoch": 2.3,
1468
+ "learning_rate": 2.7925888170101665e-05,
1469
+ "loss": 1.2843,
1470
+ "step": 193
1471
+ },
1472
+ {
1473
+ "epoch": 2.31,
1474
+ "learning_rate": 2.7032083420597e-05,
1475
+ "loss": 0.9528,
1476
+ "step": 194
1477
+ },
1478
+ {
1479
+ "epoch": 2.32,
1480
+ "learning_rate": 2.6150575548982292e-05,
1481
+ "loss": 1.1751,
1482
+ "step": 195
1483
+ },
1484
+ {
1485
+ "epoch": 2.32,
1486
+ "eval_loss": 1.5852700471878052,
1487
+ "eval_runtime": 2.1934,
1488
+ "eval_samples_per_second": 0.912,
1489
+ "eval_steps_per_second": 0.456,
1490
+ "step": 195
1491
+ },
1492
+ {
1493
+ "epoch": 2.33,
1494
+ "learning_rate": 2.528151311088537e-05,
1495
+ "loss": 1.2334,
1496
+ "step": 196
1497
+ },
1498
+ {
1499
+ "epoch": 2.35,
1500
+ "learning_rate": 2.4425042564574184e-05,
1501
+ "loss": 1.4127,
1502
+ "step": 197
1503
+ },
1504
+ {
1505
+ "epoch": 2.36,
1506
+ "learning_rate": 2.3581308246275103e-05,
1507
+ "loss": 1.1989,
1508
+ "step": 198
1509
+ },
1510
+ {
1511
+ "epoch": 2.37,
1512
+ "learning_rate": 2.2750452345848682e-05,
1513
+ "loss": 1.0506,
1514
+ "step": 199
1515
+ },
1516
+ {
1517
+ "epoch": 2.38,
1518
+ "learning_rate": 2.1932614882827197e-05,
1519
+ "loss": 1.5642,
1520
+ "step": 200
1521
+ },
1522
+ {
1523
+ "epoch": 2.38,
1524
+ "eval_loss": 1.5846655368804932,
1525
+ "eval_runtime": 2.1905,
1526
+ "eval_samples_per_second": 0.913,
1527
+ "eval_steps_per_second": 0.457,
1528
+ "step": 200
1529
+ },
1530
+ {
1531
+ "epoch": 2.39,
1532
+ "learning_rate": 2.112793368281799e-05,
1533
+ "loss": 0.9502,
1534
+ "step": 201
1535
+ },
1536
+ {
1537
+ "epoch": 2.4,
1538
+ "learning_rate": 2.03365443542764e-05,
1539
+ "loss": 1.5232,
1540
+ "step": 202
1541
+ },
1542
+ {
1543
+ "epoch": 2.42,
1544
+ "learning_rate": 1.9558580265652448e-05,
1545
+ "loss": 1.213,
1546
+ "step": 203
1547
+ },
1548
+ {
1549
+ "epoch": 2.43,
1550
+ "learning_rate": 1.879417252291502e-05,
1551
+ "loss": 0.7273,
1552
+ "step": 204
1553
+ },
1554
+ {
1555
+ "epoch": 2.44,
1556
+ "learning_rate": 1.804344994745727e-05,
1557
+ "loss": 1.6215,
1558
+ "step": 205
1559
+ },
1560
+ {
1561
+ "epoch": 2.44,
1562
+ "eval_loss": 1.5852676630020142,
1563
+ "eval_runtime": 2.1897,
1564
+ "eval_samples_per_second": 0.913,
1565
+ "eval_steps_per_second": 0.457,
1566
+ "step": 205
1567
+ },
1568
+ {
1569
+ "epoch": 2.45,
1570
+ "learning_rate": 1.730653905438714e-05,
1571
+ "loss": 1.5882,
1572
+ "step": 206
1573
+ },
1574
+ {
1575
+ "epoch": 2.46,
1576
+ "learning_rate": 1.6583564031206357e-05,
1577
+ "loss": 1.2459,
1578
+ "step": 207
1579
+ },
1580
+ {
1581
+ "epoch": 2.48,
1582
+ "learning_rate": 1.587464671688187e-05,
1583
+ "loss": 1.45,
1584
+ "step": 208
1585
+ },
1586
+ {
1587
+ "epoch": 2.49,
1588
+ "learning_rate": 1.5179906581313064e-05,
1589
+ "loss": 1.4953,
1590
+ "step": 209
1591
+ },
1592
+ {
1593
+ "epoch": 2.5,
1594
+ "learning_rate": 1.4499460705197998e-05,
1595
+ "loss": 0.9686,
1596
+ "step": 210
1597
+ },
1598
+ {
1599
+ "epoch": 2.5,
1600
+ "eval_loss": 1.5858163833618164,
1601
+ "eval_runtime": 2.1895,
1602
+ "eval_samples_per_second": 0.913,
1603
+ "eval_steps_per_second": 0.457,
1604
+ "step": 210
1605
+ },
1606
+ {
1607
+ "epoch": 2.51,
1608
+ "learning_rate": 1.3833423760302611e-05,
1609
+ "loss": 1.458,
1610
+ "step": 211
1611
+ },
1612
+ {
1613
+ "epoch": 2.52,
1614
+ "learning_rate": 1.3181907990135622e-05,
1615
+ "loss": 1.9958,
1616
+ "step": 212
1617
+ },
1618
+ {
1619
+ "epoch": 2.54,
1620
+ "learning_rate": 1.2545023191032801e-05,
1621
+ "loss": 1.2623,
1622
+ "step": 213
1623
+ },
1624
+ {
1625
+ "epoch": 2.55,
1626
+ "learning_rate": 1.1922876693653585e-05,
1627
+ "loss": 1.3237,
1628
+ "step": 214
1629
+ },
1630
+ {
1631
+ "epoch": 2.56,
1632
+ "learning_rate": 1.131557334489326e-05,
1633
+ "loss": 1.5978,
1634
+ "step": 215
1635
+ },
1636
+ {
1637
+ "epoch": 2.56,
1638
+ "eval_loss": 1.5844168663024902,
1639
+ "eval_runtime": 2.1926,
1640
+ "eval_samples_per_second": 0.912,
1641
+ "eval_steps_per_second": 0.456,
1642
+ "step": 215
1643
+ },
1644
+ {
1645
+ "epoch": 2.57,
1646
+ "learning_rate": 1.0723215490213634e-05,
1647
+ "loss": 1.5103,
1648
+ "step": 216
1649
+ },
1650
+ {
1651
+ "epoch": 2.58,
1652
+ "learning_rate": 1.0145902956395447e-05,
1653
+ "loss": 1.3013,
1654
+ "step": 217
1655
+ },
1656
+ {
1657
+ "epoch": 2.6,
1658
+ "learning_rate": 9.583733034714981e-06,
1659
+ "loss": 1.1485,
1660
+ "step": 218
1661
+ },
1662
+ {
1663
+ "epoch": 2.61,
1664
+ "learning_rate": 9.036800464548157e-06,
1665
+ "loss": 1.4783,
1666
+ "step": 219
1667
+ },
1668
+ {
1669
+ "epoch": 2.62,
1670
+ "learning_rate": 8.505197417404687e-06,
1671
+ "loss": 1.6576,
1672
+ "step": 220
1673
+ },
1674
+ {
1675
+ "epoch": 2.62,
1676
+ "eval_loss": 1.5853140354156494,
1677
+ "eval_runtime": 2.1872,
1678
+ "eval_samples_per_second": 0.914,
1679
+ "eval_steps_per_second": 0.457,
1680
+ "step": 220
1681
+ },
1682
+ {
1683
+ "epoch": 2.63,
1684
+ "learning_rate": 7.989013481394814e-06,
1685
+ "loss": 1.424,
1686
+ "step": 221
1687
+ },
1688
+ {
1689
+ "epoch": 2.64,
1690
+ "learning_rate": 7.488335646131628e-06,
1691
+ "loss": 1.0807,
1692
+ "step": 222
1693
+ },
1694
+ {
1695
+ "epoch": 2.65,
1696
+ "learning_rate": 7.003248288071118e-06,
1697
+ "loss": 1.2306,
1698
+ "step": 223
1699
+ },
1700
+ {
1701
+ "epoch": 2.67,
1702
+ "learning_rate": 6.533833156292679e-06,
1703
+ "loss": 1.0481,
1704
+ "step": 224
1705
+ },
1706
+ {
1707
+ "epoch": 2.68,
1708
+ "learning_rate": 6.08016935872251e-06,
1709
+ "loss": 1.6427,
1710
+ "step": 225
1711
+ },
1712
+ {
1713
+ "epoch": 2.68,
1714
+ "eval_loss": 1.5852783918380737,
1715
+ "eval_runtime": 2.1896,
1716
+ "eval_samples_per_second": 0.913,
1717
+ "eval_steps_per_second": 0.457,
1718
+ "step": 225
1719
+ },
1720
+ {
1721
+ "epoch": 2.69,
1722
+ "learning_rate": 5.6423333488018095e-06,
1723
+ "loss": 0.9116,
1724
+ "step": 226
1725
+ },
1726
+ {
1727
+ "epoch": 2.7,
1728
+ "learning_rate": 5.22039891260262e-06,
1729
+ "loss": 1.2006,
1730
+ "step": 227
1731
+ },
1732
+ {
1733
+ "epoch": 2.71,
1734
+ "learning_rate": 4.8144371563930476e-06,
1735
+ "loss": 2.1991,
1736
+ "step": 228
1737
+ },
1738
+ {
1739
+ "epoch": 2.73,
1740
+ "learning_rate": 4.424516494654118e-06,
1741
+ "loss": 1.6851,
1742
+ "step": 229
1743
+ },
1744
+ {
1745
+ "epoch": 2.74,
1746
+ "learning_rate": 4.050702638550275e-06,
1747
+ "loss": 1.4142,
1748
+ "step": 230
1749
+ },
1750
+ {
1751
+ "epoch": 2.74,
1752
+ "eval_loss": 1.5846909284591675,
1753
+ "eval_runtime": 2.1882,
1754
+ "eval_samples_per_second": 0.914,
1755
+ "eval_steps_per_second": 0.457,
1756
+ "step": 230
1757
+ },
1758
+ {
1759
+ "epoch": 2.75,
1760
+ "learning_rate": 3.693058584855369e-06,
1761
+ "loss": 1.7414,
1762
+ "step": 231
1763
+ },
1764
+ {
1765
+ "epoch": 2.76,
1766
+ "learning_rate": 3.3516446053363015e-06,
1767
+ "loss": 1.6624,
1768
+ "step": 232
1769
+ },
1770
+ {
1771
+ "epoch": 2.77,
1772
+ "learning_rate": 3.026518236595621e-06,
1773
+ "loss": 1.1288,
1774
+ "step": 233
1775
+ },
1776
+ {
1777
+ "epoch": 2.79,
1778
+ "learning_rate": 2.717734270375272e-06,
1779
+ "loss": 1.0813,
1780
+ "step": 234
1781
+ },
1782
+ {
1783
+ "epoch": 2.8,
1784
+ "learning_rate": 2.4253447443228106e-06,
1785
+ "loss": 1.0923,
1786
+ "step": 235
1787
+ },
1788
+ {
1789
+ "epoch": 2.8,
1790
+ "eval_loss": 1.5842970609664917,
1791
+ "eval_runtime": 2.1911,
1792
+ "eval_samples_per_second": 0.913,
1793
+ "eval_steps_per_second": 0.456,
1794
+ "step": 235
1795
+ },
1796
+ {
1797
+ "epoch": 2.81,
1798
+ "learning_rate": 2.1493989332218468e-06,
1799
+ "loss": 1.3077,
1800
+ "step": 236
1801
+ },
1802
+ {
1803
+ "epoch": 2.82,
1804
+ "learning_rate": 1.8899433406879608e-06,
1805
+ "loss": 1.1672,
1806
+ "step": 237
1807
+ },
1808
+ {
1809
+ "epoch": 2.83,
1810
+ "learning_rate": 1.6470216913317626e-06,
1811
+ "loss": 1.2432,
1812
+ "step": 238
1813
+ },
1814
+ {
1815
+ "epoch": 2.85,
1816
+ "learning_rate": 1.4206749233902084e-06,
1817
+ "loss": 1.4491,
1818
+ "step": 239
1819
+ },
1820
+ {
1821
+ "epoch": 2.86,
1822
+ "learning_rate": 1.2109411818274852e-06,
1823
+ "loss": 1.5509,
1824
+ "step": 240
1825
+ },
1826
+ {
1827
+ "epoch": 2.86,
1828
+ "eval_loss": 1.5852304697036743,
1829
+ "eval_runtime": 2.1903,
1830
+ "eval_samples_per_second": 0.913,
1831
+ "eval_steps_per_second": 0.457,
1832
+ "step": 240
1833
+ }
1834
+ ],
1835
+ "max_steps": 252,
1836
+ "num_train_epochs": 3,
1837
+ "total_flos": 1.2057721852974034e+17,
1838
+ "trial_name": null,
1839
+ "trial_params": null
1840
+ }
checkpoint-240/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6262ba37ace7774e5d22c32c9b42a5166ed6929715b00a62cbc99ddcea368d8
3
+ size 3899