robertou2 commited on
Commit
38c4f5d
·
verified ·
1 Parent(s): 1066b49

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -23,15 +23,15 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "gate_proj",
27
- "q_proj",
28
- "down_proj",
29
- "o_proj",
30
  "v_proj",
31
  "k_proj",
32
- "up_proj"
 
 
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
36
- "use_rslora": false
37
  }
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
 
 
26
  "v_proj",
27
  "k_proj",
28
+ "down_proj",
29
+ "gate_proj",
30
+ "up_proj",
31
+ "q_proj",
32
+ "o_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
36
+ "use_rslora": true
37
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:862b443d2450a20b1969cb13018bb8da9e546d83e7a72d7b2bd47a1c01915e78
3
  size 359270696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cfa09893426ca5c80e5e2808b3bf5162a00330603bce40998c1bcbaaf85fbd4
3
  size 359270696
chat_template.jinja CHANGED
@@ -1,54 +1,54 @@
1
  {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0]['role'] == 'system' %}
4
- {{- messages[0]['content'] }}
5
- {%- else %}
6
- {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
- {%- endif %}
8
- {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
- {%- for tool in tools %}
10
- {{- "\n" }}
11
- {{- tool | tojson }}
12
- {%- endfor %}
13
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
- {%- else %}
15
- {%- if messages[0]['role'] == 'system' %}
16
- {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
- {%- else %}
18
- {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
- {%- endif %}
20
- {%- endif %}
21
- {%- for message in messages %}
22
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
- {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
- {%- elif message.role == "assistant" %}
25
- {{- '<|im_start|>' + message.role }}
26
- {%- if message.content %}
27
- {{- '\n' + message.content }}
28
  {%- endif %}
29
- {%- for tool_call in message.tool_calls %}
30
- {%- if tool_call.function is defined %}
31
- {%- set tool_call = tool_call.function %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  {%- endif %}
33
- {{- '\n<tool_call>\n{"name": "' }}
34
- {{- tool_call.name }}
35
- {{- '", "arguments": ' }}
36
- {{- tool_call.arguments | tojson }}
37
- {{- '}\n</tool_call>' }}
38
  {%- endfor %}
39
- {{- '<|im_end|>\n' }}
40
- {%- elif message.role == "tool" %}
41
- {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
- {{- '<|im_start|>user' }}
43
- {%- endif %}
44
- {{- '\n<tool_response>\n' }}
45
- {{- message.content }}
46
- {{- '\n</tool_response>' }}
47
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
- {{- '<|im_end|>\n' }}
49
- {%- endif %}
50
- {%- endif %}
51
- {%- endfor %}
52
- {%- if add_generation_prompt %}
53
- {{- '<|im_start|>assistant\n' }}
54
- {%- endif %}
 
1
  {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
 
 
 
 
 
 
 
 
20
  {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
  {%- endif %}
 
 
 
 
 
51
  {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e98c64ffd468a5f8da406fd5dc414c5741d65a4905df84235391c6878ed2d08
3
  size 718831691
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:049df29a3abfd9172bca3bfb88a156988cc0cd764359c0a8a1a4576bb303e165
3
  size 718831691
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd8886924146c94ca2c895faa5a5126787af796176eddb058f7b8a6d9788a808
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f4130be9f8a4a1e2287e97f0f05456d51bf10d246bfde8a5ae0ae5d4681327e
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d930a4283cac06b311f5bf6385eb10f11a487cd4fc36bfbeeadba03f011f80ef
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:740fa2b047673c71a73647826d1867382ae011b9a31714631a50f15909a18934
3
  size 1465
trainer_state.json CHANGED
@@ -2,1069 +2,1034 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 37.61538461538461,
6
  "eval_steps": 500,
7
- "global_step": 150,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.3076923076923077,
14
- "grad_norm": 1.6579457521438599,
15
  "learning_rate": 0.0,
16
  "loss": 2.0013,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.6153846153846154,
21
- "grad_norm": 2.071979522705078,
22
- "learning_rate": 3.3333333333333335e-05,
23
  "loss": 2.512,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.9230769230769231,
28
- "grad_norm": 1.5595824718475342,
29
- "learning_rate": 6.666666666666667e-05,
30
- "loss": 2.4665,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 1.0,
35
- "grad_norm": 1.4536011219024658,
36
- "learning_rate": 0.0001,
37
- "loss": 2.4421,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 1.3076923076923077,
42
- "grad_norm": 1.0175144672393799,
43
- "learning_rate": 0.00013333333333333334,
44
- "loss": 2.2619,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 1.6153846153846154,
49
- "grad_norm": 0.6594896912574768,
50
- "learning_rate": 0.00016666666666666666,
51
- "loss": 2.3238,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 1.9230769230769231,
56
- "grad_norm": 0.4992673993110657,
57
- "learning_rate": 0.0002,
58
- "loss": 1.7963,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 2.0,
63
- "grad_norm": 1.0101423263549805,
64
- "learning_rate": 0.00023333333333333333,
65
- "loss": 2.2351,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 2.3076923076923075,
70
- "grad_norm": 0.6238862872123718,
71
- "learning_rate": 0.0002666666666666667,
72
- "loss": 2.1242,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 2.6153846153846154,
77
- "grad_norm": 0.49207818508148193,
78
- "learning_rate": 0.0003,
79
- "loss": 1.3944,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 2.9230769230769234,
84
- "grad_norm": 0.5449758768081665,
85
- "learning_rate": 0.0003333333333333333,
86
- "loss": 1.9957,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 3.0,
91
- "grad_norm": 1.2987996339797974,
92
- "learning_rate": 0.00036666666666666667,
93
- "loss": 1.7485,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 3.3076923076923075,
98
- "grad_norm": 0.7771750688552856,
99
- "learning_rate": 0.0004,
100
- "loss": 1.7729,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 3.6153846153846154,
105
- "grad_norm": 0.5679973363876343,
106
- "learning_rate": 0.00043333333333333337,
107
- "loss": 1.7897,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 3.9230769230769234,
112
- "grad_norm": 0.5018688440322876,
113
- "learning_rate": 0.00046666666666666666,
114
- "loss": 1.0134,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 4.0,
119
- "grad_norm": 1.9658831357955933,
120
- "learning_rate": 0.0005,
121
- "loss": 1.6495,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 4.3076923076923075,
126
- "grad_norm": 0.5439109206199646,
127
- "learning_rate": 0.0004999848114735858,
128
- "loss": 0.7469,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 4.615384615384615,
133
- "grad_norm": 0.7974035143852234,
134
- "learning_rate": 0.0004999392477398737,
135
- "loss": 1.1807,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 4.923076923076923,
140
- "grad_norm": 0.6797612309455872,
141
- "learning_rate": 0.0004998633143352315,
142
- "loss": 1.3357,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 5.0,
147
- "grad_norm": 1.6918320655822754,
148
- "learning_rate": 0.0004997570204861915,
149
- "loss": 1.176,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 5.3076923076923075,
154
- "grad_norm": 0.8325671553611755,
155
- "learning_rate": 0.000499620379108329,
156
- "loss": 0.9282,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 5.615384615384615,
161
- "grad_norm": 0.6552557945251465,
162
- "learning_rate": 0.0004994534068046936,
163
- "loss": 0.649,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 5.923076923076923,
168
- "grad_norm": 1.1335406303405762,
169
- "learning_rate": 0.0004992561238637912,
170
- "loss": 0.7929,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 6.0,
175
- "grad_norm": 2.5653879642486572,
176
- "learning_rate": 0.000499028554257119,
177
- "loss": 0.6364,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 6.3076923076923075,
182
- "grad_norm": 1.0426138639450073,
183
- "learning_rate": 0.0004987707256362529,
184
- "loss": 0.6463,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 6.615384615384615,
189
- "grad_norm": 0.8789294958114624,
190
- "learning_rate": 0.0004984826693294874,
191
- "loss": 0.5646,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 6.923076923076923,
196
- "grad_norm": 0.8288877606391907,
197
- "learning_rate": 0.0004981644203380291,
198
- "loss": 0.5083,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 7.0,
203
- "grad_norm": 3.075854778289795,
204
- "learning_rate": 0.0004978160173317438,
205
- "loss": 0.463,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 7.3076923076923075,
210
- "grad_norm": 1.1487135887145996,
211
- "learning_rate": 0.0004974375026444575,
212
- "loss": 0.3834,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 7.615384615384615,
217
- "grad_norm": 1.2224117517471313,
218
- "learning_rate": 0.0004970289222688129,
219
- "loss": 0.5554,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 7.923076923076923,
224
- "grad_norm": 0.5958878993988037,
225
- "learning_rate": 0.0004965903258506806,
226
- "loss": 0.264,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 8.0,
231
- "grad_norm": 1.5285398960113525,
232
- "learning_rate": 0.0004961217666831268,
233
- "loss": 0.1209,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 8.307692307692308,
238
- "grad_norm": 0.8856319785118103,
239
- "learning_rate": 0.0004956233016999379,
240
- "loss": 0.3155,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 8.615384615384615,
245
- "grad_norm": 0.7143183350563049,
246
- "learning_rate": 0.0004950949914687023,
247
- "loss": 0.2577,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 8.923076923076923,
252
- "grad_norm": 0.6463532447814941,
253
- "learning_rate": 0.0004945369001834514,
254
- "loss": 0.2509,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 9.0,
259
- "grad_norm": 1.5944372415542603,
260
- "learning_rate": 0.0004939490956568589,
261
- "loss": 0.2712,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 9.307692307692308,
266
- "grad_norm": 0.6741620302200317,
267
- "learning_rate": 0.0004933316493120015,
268
- "loss": 0.1793,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 9.615384615384615,
273
- "grad_norm": 0.8169341683387756,
274
- "learning_rate": 0.00049268463617368,
275
- "loss": 0.2251,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 9.923076923076923,
280
- "grad_norm": 0.47695809602737427,
281
- "learning_rate": 0.0004920081348593038,
282
- "loss": 0.1291,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 10.0,
287
- "grad_norm": 0.9237103462219238,
288
- "learning_rate": 0.0004913022275693372,
289
- "loss": 0.1339,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 10.307692307692308,
294
- "grad_norm": 0.48380130529403687,
295
- "learning_rate": 0.0004905670000773126,
296
- "loss": 0.105,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 10.615384615384615,
301
- "grad_norm": 0.8409221768379211,
302
- "learning_rate": 0.0004898025417194075,
303
- "loss": 0.1608,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 10.923076923076923,
308
- "grad_norm": 0.36892759799957275,
309
- "learning_rate": 0.0004890089453835894,
310
- "loss": 0.0887,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 11.0,
315
- "grad_norm": 1.5985524654388428,
316
- "learning_rate": 0.00048818630749832974,
317
- "loss": 0.1622,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 11.307692307692308,
322
- "grad_norm": 0.5602894425392151,
323
- "learning_rate": 0.00048733472802088654,
324
- "loss": 0.0832,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 11.615384615384615,
329
- "grad_norm": 0.6063843369483948,
330
- "learning_rate": 0.00048645431042515866,
331
- "loss": 0.1001,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 11.923076923076923,
336
- "grad_norm": 0.6315516233444214,
337
- "learning_rate": 0.00048554516168911364,
338
- "loss": 0.118,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 12.0,
343
- "grad_norm": 0.10256657004356384,
344
- "learning_rate": 0.000484607392281788,
345
- "loss": 0.0136,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 12.307692307692308,
350
- "grad_norm": 0.38051220774650574,
351
- "learning_rate": 0.0004836411161498652,
352
- "loss": 0.0501,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 12.615384615384615,
357
- "grad_norm": 0.3163893222808838,
358
- "learning_rate": 0.0004826464507038296,
359
- "loss": 0.0453,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 12.923076923076923,
364
- "grad_norm": 0.41999366879463196,
365
- "learning_rate": 0.0004816235168037004,
366
- "loss": 0.0556,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 13.0,
371
- "grad_norm": 0.46970197558403015,
372
- "learning_rate": 0.0004805724387443462,
373
- "loss": 0.0511,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 13.307692307692308,
378
- "grad_norm": 0.19861207902431488,
379
- "learning_rate": 0.00047949334424038175,
380
- "loss": 0.024,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 13.615384615384615,
385
- "grad_norm": 0.26438194513320923,
386
- "learning_rate": 0.0004783863644106502,
387
- "loss": 0.0339,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 13.923076923076923,
392
- "grad_norm": 0.15319757163524628,
393
- "learning_rate": 0.00047725163376229063,
394
- "loss": 0.0217,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 14.0,
399
- "grad_norm": 3.167978286743164,
400
- "learning_rate": 0.0004760892901743944,
401
- "loss": 0.0537,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 14.307692307692308,
406
- "grad_norm": 0.19109131395816803,
407
- "learning_rate": 0.00047489947488125176,
408
- "loss": 0.0184,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 14.615384615384615,
413
- "grad_norm": 0.13836173713207245,
414
- "learning_rate": 0.0004736823324551909,
415
- "loss": 0.0131,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 14.923076923076923,
420
- "grad_norm": 0.19333595037460327,
421
- "learning_rate": 0.00047243801078901084,
422
- "loss": 0.0213,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 15.0,
427
- "grad_norm": 0.5175873041152954,
428
- "learning_rate": 0.0004711666610780115,
429
- "loss": 0.0192,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 15.307692307692308,
434
- "grad_norm": 0.102613165974617,
435
- "learning_rate": 0.00046986843780162223,
436
- "loss": 0.0125,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 15.615384615384615,
441
- "grad_norm": 0.19265614449977875,
442
- "learning_rate": 0.00046854349870463144,
443
- "loss": 0.0149,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 15.923076923076923,
448
- "grad_norm": 0.08247430622577667,
449
- "learning_rate": 0.0004671920047780186,
450
- "loss": 0.0088,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 16.0,
455
- "grad_norm": 0.1660240739583969,
456
- "learning_rate": 0.0004658141202393935,
457
- "loss": 0.0136,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 16.307692307692307,
462
- "grad_norm": 0.09002210944890976,
463
- "learning_rate": 0.00046441001251304177,
464
- "loss": 0.0085,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 16.615384615384617,
469
- "grad_norm": 0.14277450740337372,
470
- "learning_rate": 0.0004629798522095818,
471
- "loss": 0.0145,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 16.923076923076923,
476
- "grad_norm": 0.0343237966299057,
477
- "learning_rate": 0.00046152381310523384,
478
- "loss": 0.0045,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 17.0,
483
- "grad_norm": 0.11968665570020676,
484
- "learning_rate": 0.00046004207212070527,
485
- "loss": 0.0078,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 17.307692307692307,
490
- "grad_norm": 0.035490620881319046,
491
- "learning_rate": 0.0004585348092996925,
492
- "loss": 0.0047,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 17.615384615384617,
497
- "grad_norm": 0.06462971121072769,
498
- "learning_rate": 0.000457002207787005,
499
- "loss": 0.0081,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 17.923076923076923,
504
- "grad_norm": 0.08005011081695557,
505
- "learning_rate": 0.00045544445380631127,
506
- "loss": 0.0071,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 18.0,
511
- "grad_norm": 0.08014772832393646,
512
- "learning_rate": 0.0004538617366375112,
513
- "loss": 0.0043,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 18.307692307692307,
518
- "grad_norm": 0.045388974249362946,
519
- "learning_rate": 0.0004522542485937369,
520
- "loss": 0.006,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 18.615384615384617,
525
- "grad_norm": 0.027296222746372223,
526
- "learning_rate": 0.0004506221849979852,
527
  "loss": 0.0042,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 18.923076923076923,
532
- "grad_norm": 0.04167836531996727,
533
- "learning_rate": 0.0004489657441593846,
534
- "loss": 0.0036,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 19.0,
539
- "grad_norm": 0.09567083418369293,
540
- "learning_rate": 0.00044728512734909845,
541
- "loss": 0.0051,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 19.307692307692307,
546
- "grad_norm": 0.017897693440318108,
547
- "learning_rate": 0.00044558053877586913,
548
- "loss": 0.0022,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 19.615384615384617,
553
- "grad_norm": 0.024916386231780052,
554
- "learning_rate": 0.0004438521855612054,
555
- "loss": 0.0035,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 19.923076923076923,
560
- "grad_norm": 0.016562633216381073,
561
- "learning_rate": 0.0004421002777142148,
562
- "loss": 0.0036,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 20.0,
567
- "grad_norm": 2.793907880783081,
568
- "learning_rate": 0.0004403250281060862,
569
- "loss": 0.0357,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 20.307692307692307,
574
- "grad_norm": 0.015240938402712345,
575
- "learning_rate": 0.0004385266524442241,
576
- "loss": 0.0019,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 20.615384615384617,
581
- "grad_norm": 0.01802099496126175,
582
- "learning_rate": 0.0004367053692460385,
583
- "loss": 0.0028,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 20.923076923076923,
588
- "grad_norm": 0.029564393684267998,
589
- "learning_rate": 0.00043486139981239303,
590
- "loss": 0.0036,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 21.0,
595
- "grad_norm": 0.10444993525743484,
596
- "learning_rate": 0.0004329949682007154,
597
- "loss": 0.0036,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 21.307692307692307,
602
- "grad_norm": 0.022473925724625587,
603
- "learning_rate": 0.0004311063011977723,
604
- "loss": 0.0028,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 21.615384615384617,
609
- "grad_norm": 0.00846798438578844,
610
- "learning_rate": 0.00042919562829211283,
611
- "loss": 0.0021,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 21.923076923076923,
616
- "grad_norm": 0.019519299268722534,
617
- "learning_rate": 0.0004272631816461843,
618
- "loss": 0.0025,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 22.0,
623
- "grad_norm": 0.02333137020468712,
624
- "learning_rate": 0.00042530919606812215,
625
- "loss": 0.0026,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 22.307692307692307,
630
- "grad_norm": 0.008181007578969002,
631
- "learning_rate": 0.0004233339089832189,
632
- "loss": 0.0022,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 22.615384615384617,
637
- "grad_norm": 0.024723347276449203,
638
- "learning_rate": 0.000421337560405075,
639
- "loss": 0.0019,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 22.923076923076923,
644
- "grad_norm": 0.01907787285745144,
645
- "learning_rate": 0.0004193203929064353,
646
  "loss": 0.0023,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 23.0,
651
- "grad_norm": 0.023070262745022774,
652
- "learning_rate": 0.0004172826515897146,
653
- "loss": 0.002,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 23.307692307692307,
658
- "grad_norm": 0.008314975537359715,
659
- "learning_rate": 0.0004152245840572153,
660
- "loss": 0.0019,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 23.615384615384617,
665
- "grad_norm": 0.008056281134486198,
666
- "learning_rate": 0.00041314644038104216,
667
- "loss": 0.0022,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 23.923076923076923,
672
- "grad_norm": 0.01323688868433237,
673
- "learning_rate": 0.0004110484730727161,
674
- "loss": 0.0015,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 24.0,
679
- "grad_norm": 0.009244530461728573,
680
- "learning_rate": 0.0004089309370524921,
681
- "loss": 0.0017,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 24.307692307692307,
686
- "grad_norm": 0.006018282379955053,
687
- "learning_rate": 0.00040679408961838426,
688
  "loss": 0.0017,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 24.615384615384617,
693
- "grad_norm": 0.005748262628912926,
694
- "learning_rate": 0.00040463819041490235,
695
  "loss": 0.0019,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 24.923076923076923,
700
- "grad_norm": 0.005269130691885948,
701
- "learning_rate": 0.0004024635014015023,
702
- "loss": 0.0013,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 25.0,
707
- "grad_norm": 0.18421179056167603,
708
- "learning_rate": 0.00040027028682075626,
709
- "loss": 0.005,
710
  "step": 100
711
  },
712
  {
713
  "epoch": 25.307692307692307,
714
- "grad_norm": 0.004292176570743322,
715
- "learning_rate": 0.000398058813166245,
716
  "loss": 0.0012,
717
  "step": 101
718
  },
719
  {
720
  "epoch": 25.615384615384617,
721
- "grad_norm": 0.13048163056373596,
722
- "learning_rate": 0.00039582934915017665,
723
- "loss": 0.0078,
724
  "step": 102
725
  },
726
  {
727
  "epoch": 25.923076923076923,
728
- "grad_norm": 0.006738720927387476,
729
- "learning_rate": 0.0003935821656707359,
730
- "loss": 0.0017,
731
  "step": 103
732
  },
733
  {
734
  "epoch": 26.0,
735
- "grad_norm": 0.006725949235260487,
736
- "learning_rate": 0.00039131753577916796,
737
- "loss": 0.0015,
738
  "step": 104
739
  },
740
  {
741
  "epoch": 26.307692307692307,
742
- "grad_norm": 0.03537844121456146,
743
- "learning_rate": 0.00038903573464660015,
744
- "loss": 0.0031,
745
  "step": 105
746
  },
747
  {
748
  "epoch": 26.615384615384617,
749
- "grad_norm": 0.0065828608348965645,
750
- "learning_rate": 0.00038673703953060677,
751
- "loss": 0.0015,
752
  "step": 106
753
  },
754
  {
755
  "epoch": 26.923076923076923,
756
- "grad_norm": 0.005749302916228771,
757
- "learning_rate": 0.00038442172974151957,
758
- "loss": 0.0015,
759
  "step": 107
760
  },
761
  {
762
  "epoch": 27.0,
763
- "grad_norm": 0.012811440974473953,
764
- "learning_rate": 0.00038209008660848977,
765
  "loss": 0.0014,
766
  "step": 108
767
  },
768
  {
769
  "epoch": 27.307692307692307,
770
- "grad_norm": 0.016203908249735832,
771
- "learning_rate": 0.0003797423934453038,
772
- "loss": 0.0013,
773
  "step": 109
774
  },
775
  {
776
  "epoch": 27.615384615384617,
777
- "grad_norm": 0.00788620300590992,
778
- "learning_rate": 0.0003773789355159587,
779
- "loss": 0.0014,
780
  "step": 110
781
  },
782
  {
783
  "epoch": 27.923076923076923,
784
- "grad_norm": 0.004134251270443201,
785
- "learning_rate": 0.000375,
786
  "loss": 0.0014,
787
  "step": 111
788
  },
789
  {
790
  "epoch": 28.0,
791
- "grad_norm": 0.0072965058498084545,
792
- "learning_rate": 0.00037260587595762705,
793
- "loss": 0.0012,
794
  "step": 112
795
  },
796
  {
797
  "epoch": 28.307692307692307,
798
- "grad_norm": 0.004400895908474922,
799
- "learning_rate": 0.00037019685429456986,
800
  "loss": 0.0014,
801
  "step": 113
802
  },
803
  {
804
  "epoch": 28.615384615384617,
805
- "grad_norm": 0.0042018345557153225,
806
- "learning_rate": 0.0003677732277267418,
807
- "loss": 0.0011,
808
  "step": 114
809
  },
810
  {
811
  "epoch": 28.923076923076923,
812
- "grad_norm": 0.02602436952292919,
813
- "learning_rate": 0.000365335290744672,
814
- "loss": 0.0016,
815
  "step": 115
816
  },
817
  {
818
  "epoch": 29.0,
819
- "grad_norm": 0.008391394279897213,
820
- "learning_rate": 0.0003628833395777224,
821
  "loss": 0.0011,
822
  "step": 116
823
  },
824
  {
825
  "epoch": 29.307692307692307,
826
- "grad_norm": 0.009821321815252304,
827
- "learning_rate": 0.00036041767215809354,
828
- "loss": 0.0011,
829
  "step": 117
830
  },
831
  {
832
  "epoch": 29.615384615384617,
833
- "grad_norm": 0.005064834374934435,
834
- "learning_rate": 0.0003579385880846232,
835
- "loss": 0.0014,
836
  "step": 118
837
  },
838
  {
839
  "epoch": 29.923076923076923,
840
- "grad_norm": 0.003741420805454254,
841
- "learning_rate": 0.00035544638858638305,
842
  "loss": 0.0013,
843
  "step": 119
844
  },
845
  {
846
  "epoch": 30.0,
847
- "grad_norm": 0.005992499180138111,
848
- "learning_rate": 0.00035294137648607626,
849
- "loss": 0.0011,
850
  "step": 120
851
  },
852
  {
853
  "epoch": 30.307692307692307,
854
- "grad_norm": 0.0025831812527030706,
855
- "learning_rate": 0.0003504238561632424,
856
- "loss": 0.0009,
857
  "step": 121
858
  },
859
  {
860
  "epoch": 30.615384615384617,
861
- "grad_norm": 0.024449031800031662,
862
- "learning_rate": 0.0003478941335172729,
863
  "loss": 0.0012,
864
  "step": 122
865
  },
866
  {
867
  "epoch": 30.923076923076923,
868
- "grad_norm": 0.00358560006134212,
869
- "learning_rate": 0.0003453525159302415,
870
  "loss": 0.0011,
871
  "step": 123
872
  },
873
  {
874
  "epoch": 31.0,
875
- "grad_norm": 0.005536896176636219,
876
- "learning_rate": 0.00034279931222955517,
877
- "loss": 0.0014,
878
  "step": 124
879
  },
880
  {
881
  "epoch": 31.307692307692307,
882
- "grad_norm": 0.0029202536679804325,
883
- "learning_rate": 0.0003402348326504287,
884
- "loss": 0.0011,
885
  "step": 125
886
  },
887
  {
888
  "epoch": 31.615384615384617,
889
- "grad_norm": 0.0022518346086144447,
890
- "learning_rate": 0.00033765938879818866,
891
- "loss": 0.0009,
892
  "step": 126
893
  },
894
  {
895
  "epoch": 31.923076923076923,
896
- "grad_norm": 0.01685263216495514,
897
- "learning_rate": 0.0003350732936104108,
898
- "loss": 0.0011,
899
  "step": 127
900
  },
901
  {
902
  "epoch": 32.0,
903
- "grad_norm": 0.00378896901383996,
904
- "learning_rate": 0.0003324768613188957,
905
- "loss": 0.0011,
906
  "step": 128
907
  },
908
  {
909
  "epoch": 32.30769230769231,
910
- "grad_norm": 0.004914000164717436,
911
- "learning_rate": 0.00032987040741148704,
912
- "loss": 0.0008,
913
  "step": 129
914
  },
915
  {
916
  "epoch": 32.61538461538461,
917
- "grad_norm": 0.003984972834587097,
918
- "learning_rate": 0.00032725424859373687,
919
- "loss": 0.001,
920
  "step": 130
921
  },
922
  {
923
  "epoch": 32.92307692307692,
924
- "grad_norm": 0.002909860573709011,
925
- "learning_rate": 0.00032462870275042365,
926
- "loss": 0.0012,
927
  "step": 131
928
  },
929
  {
930
  "epoch": 33.0,
931
- "grad_norm": 0.008864074014127254,
932
- "learning_rate": 0.00032199408890692656,
933
- "loss": 0.0008,
934
  "step": 132
935
  },
936
  {
937
  "epoch": 33.30769230769231,
938
- "grad_norm": 0.0025989420246332884,
939
- "learning_rate": 0.00031935072719046115,
940
- "loss": 0.001,
941
  "step": 133
942
  },
943
  {
944
  "epoch": 33.61538461538461,
945
- "grad_norm": 0.0020333165302872658,
946
- "learning_rate": 0.00031669893879118153,
947
- "loss": 0.0008,
948
  "step": 134
949
  },
950
  {
951
  "epoch": 33.92307692307692,
952
- "grad_norm": 0.005124307703226805,
953
- "learning_rate": 0.0003140390459231529,
954
- "loss": 0.001,
955
  "step": 135
956
  },
957
  {
958
  "epoch": 34.0,
959
- "grad_norm": 0.005078117363154888,
960
- "learning_rate": 0.0003113713717851998,
961
- "loss": 0.0007,
962
  "step": 136
963
  },
964
  {
965
  "epoch": 34.30769230769231,
966
- "grad_norm": 0.0023773626890033484,
967
- "learning_rate": 0.00030869624052163523,
968
- "loss": 0.001,
969
  "step": 137
970
  },
971
  {
972
  "epoch": 34.61538461538461,
973
- "grad_norm": 0.0029469747096300125,
974
- "learning_rate": 0.000306013977182874,
975
- "loss": 0.0006,
976
  "step": 138
977
  },
978
  {
979
  "epoch": 34.92307692307692,
980
- "grad_norm": 0.002441684016957879,
981
- "learning_rate": 0.0003033249076859367,
982
- "loss": 0.0009,
983
  "step": 139
984
  },
985
  {
986
  "epoch": 35.0,
987
- "grad_norm": 0.005331180989742279,
988
- "learning_rate": 0.00030062935877484806,
989
- "loss": 0.0012,
990
  "step": 140
991
  },
992
  {
993
  "epoch": 35.30769230769231,
994
- "grad_norm": 0.0024844056461006403,
995
- "learning_rate": 0.0002979276579809346,
996
- "loss": 0.0009,
997
  "step": 141
998
  },
999
  {
1000
  "epoch": 35.61538461538461,
1001
- "grad_norm": 0.002123428974300623,
1002
- "learning_rate": 0.0002952201335830275,
1003
- "loss": 0.0008,
1004
  "step": 142
1005
  },
1006
  {
1007
  "epoch": 35.92307692307692,
1008
- "grad_norm": 0.0018413775833323598,
1009
- "learning_rate": 0.00029250711456757327,
1010
- "loss": 0.0008,
1011
  "step": 143
1012
  },
1013
  {
1014
  "epoch": 36.0,
1015
- "grad_norm": 0.007370408158749342,
1016
- "learning_rate": 0.00028978893058865987,
1017
- "loss": 0.0009,
1018
  "step": 144
1019
  },
1020
  {
1021
  "epoch": 36.30769230769231,
1022
- "grad_norm": 0.0015665763057768345,
1023
- "learning_rate": 0.0002870659119279605,
1024
- "loss": 0.0006,
1025
- "step": 145
1026
- },
1027
- {
1028
- "epoch": 36.61538461538461,
1029
- "grad_norm": 0.002923523774370551,
1030
- "learning_rate": 0.00028433838945460206,
1031
- "loss": 0.0009,
1032
- "step": 146
1033
- },
1034
- {
1035
- "epoch": 36.92307692307692,
1036
- "grad_norm": 0.002343183383345604,
1037
- "learning_rate": 0.0002816066945849616,
1038
- "loss": 0.001,
1039
- "step": 147
1040
- },
1041
- {
1042
- "epoch": 37.0,
1043
- "grad_norm": 0.0035140886902809143,
1044
- "learning_rate": 0.0002788711592423966,
1045
  "loss": 0.0008,
1046
- "step": 148
1047
- },
1048
- {
1049
- "epoch": 37.30769230769231,
1050
- "grad_norm": 0.0017854305915534496,
1051
- "learning_rate": 0.0002761321158169134,
1052
- "loss": 0.0006,
1053
- "step": 149
1054
- },
1055
- {
1056
- "epoch": 37.61538461538461,
1057
- "grad_norm": 0.002159240422770381,
1058
- "learning_rate": 0.0002733898971247795,
1059
- "loss": 0.0009,
1060
- "step": 150
1061
  }
1062
  ],
1063
  "logging_steps": 1,
1064
- "max_steps": 300,
1065
  "num_input_tokens_seen": 0,
1066
- "num_train_epochs": 75,
1067
- "save_steps": 10,
1068
  "stateful_callbacks": {
1069
  "TrainerControl": {
1070
  "args": {
@@ -1077,7 +1042,7 @@
1077
  "attributes": {}
1078
  }
1079
  },
1080
- "total_flos": 4.437176574055219e+16,
1081
  "train_batch_size": 2,
1082
  "trial_name": null,
1083
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 36.30769230769231,
6
  "eval_steps": 500,
7
+ "global_step": 145,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.3076923076923077,
14
+ "grad_norm": 11.430416107177734,
15
  "learning_rate": 0.0,
16
  "loss": 2.0013,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.6153846153846154,
21
+ "grad_norm": 14.13573169708252,
22
+ "learning_rate": 6.666666666666667e-06,
23
  "loss": 2.512,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.9230769230769231,
28
+ "grad_norm": 9.26183032989502,
29
+ "learning_rate": 1.3333333333333333e-05,
30
+ "loss": 2.4408,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 1.0,
35
+ "grad_norm": 8.897106170654297,
36
+ "learning_rate": 2e-05,
37
+ "loss": 2.4201,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 1.3076923076923077,
42
+ "grad_norm": 6.057820796966553,
43
+ "learning_rate": 2.6666666666666667e-05,
44
+ "loss": 2.2292,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 1.6153846153846154,
49
+ "grad_norm": 4.151742935180664,
50
+ "learning_rate": 3.3333333333333335e-05,
51
+ "loss": 2.2984,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 1.9230769230769231,
56
+ "grad_norm": 2.9261631965637207,
57
+ "learning_rate": 4e-05,
58
+ "loss": 1.7644,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 2.0,
63
+ "grad_norm": 6.626205921173096,
64
+ "learning_rate": 4.666666666666667e-05,
65
+ "loss": 2.1861,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 2.3076923076923075,
70
+ "grad_norm": 3.6231133937835693,
71
+ "learning_rate": 5.333333333333333e-05,
72
+ "loss": 2.0493,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 2.6153846153846154,
77
+ "grad_norm": 2.6322238445281982,
78
+ "learning_rate": 6e-05,
79
+ "loss": 1.3266,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 2.9230769230769234,
84
+ "grad_norm": 3.063889265060425,
85
+ "learning_rate": 6.666666666666667e-05,
86
+ "loss": 1.9172,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 3.0,
91
+ "grad_norm": 6.935006141662598,
92
+ "learning_rate": 7.333333333333333e-05,
93
+ "loss": 1.6738,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 3.3076923076923075,
98
+ "grad_norm": 3.876340866088867,
99
+ "learning_rate": 8e-05,
100
+ "loss": 1.659,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 3.6153846153846154,
105
+ "grad_norm": 3.144028663635254,
106
+ "learning_rate": 8.666666666666667e-05,
107
+ "loss": 1.6267,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 3.9230769230769234,
112
+ "grad_norm": 2.40108060836792,
113
+ "learning_rate": 9.333333333333334e-05,
114
+ "loss": 0.9194,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 4.0,
119
+ "grad_norm": 13.895914077758789,
120
+ "learning_rate": 0.0001,
121
+ "loss": 1.5071,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 4.3076923076923075,
126
+ "grad_norm": 3.016589879989624,
127
+ "learning_rate": 9.998646205897309e-05,
128
+ "loss": 0.6631,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 4.615384615384615,
133
+ "grad_norm": 3.7720727920532227,
134
+ "learning_rate": 9.994585556692624e-05,
135
+ "loss": 1.0356,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 4.923076923076923,
140
+ "grad_norm": 3.657879114151001,
141
+ "learning_rate": 9.987820251299122e-05,
142
+ "loss": 1.1876,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 5.0,
147
+ "grad_norm": 6.603457450866699,
148
+ "learning_rate": 9.978353953249022e-05,
149
+ "loss": 1.0117,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 5.3076923076923075,
154
+ "grad_norm": 3.7631995677948,
155
+ "learning_rate": 9.966191788709716e-05,
156
+ "loss": 0.8773,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 5.615384615384615,
161
+ "grad_norm": 2.5895802974700928,
162
+ "learning_rate": 9.951340343707852e-05,
163
+ "loss": 0.524,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 5.923076923076923,
168
+ "grad_norm": 3.9228901863098145,
169
+ "learning_rate": 9.933807660562898e-05,
170
+ "loss": 0.639,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 6.0,
175
+ "grad_norm": 10.968963623046875,
176
+ "learning_rate": 9.913603233532067e-05,
177
+ "loss": 0.5014,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 6.3076923076923075,
182
+ "grad_norm": 3.60386061668396,
183
+ "learning_rate": 9.890738003669029e-05,
184
+ "loss": 0.4728,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 6.615384615384615,
189
+ "grad_norm": 3.794558525085449,
190
+ "learning_rate": 9.865224352899119e-05,
191
+ "loss": 0.4594,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 6.923076923076923,
196
+ "grad_norm": 3.044400691986084,
197
+ "learning_rate": 9.837076097314319e-05,
198
+ "loss": 0.3733,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 7.0,
203
+ "grad_norm": 10.606956481933594,
204
+ "learning_rate": 9.806308479691595e-05,
205
+ "loss": 0.2499,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 7.3076923076923075,
210
+ "grad_norm": 2.988124132156372,
211
+ "learning_rate": 9.77293816123866e-05,
212
+ "loss": 0.2711,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 7.615384615384615,
217
+ "grad_norm": 3.3052446842193604,
218
+ "learning_rate": 9.736983212571646e-05,
219
+ "loss": 0.3466,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 7.923076923076923,
224
+ "grad_norm": 2.315810441970825,
225
+ "learning_rate": 9.698463103929542e-05,
226
+ "loss": 0.185,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 8.0,
231
+ "grad_norm": 8.904762268066406,
232
+ "learning_rate": 9.657398694630712e-05,
233
+ "loss": 0.0841,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 8.307692307692308,
238
+ "grad_norm": 2.4080655574798584,
239
+ "learning_rate": 9.613812221777212e-05,
240
+ "loss": 0.155,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 8.615384615384615,
245
+ "grad_norm": 2.4143807888031006,
246
+ "learning_rate": 9.567727288213005e-05,
247
+ "loss": 0.1114,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 8.923076923076923,
252
+ "grad_norm": 2.5390758514404297,
253
+ "learning_rate": 9.519168849742604e-05,
254
+ "loss": 0.1484,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 9.0,
259
+ "grad_norm": 4.964494705200195,
260
+ "learning_rate": 9.468163201617062e-05,
261
+ "loss": 0.1483,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 9.307692307692308,
266
+ "grad_norm": 2.1613352298736572,
267
+ "learning_rate": 9.414737964294636e-05,
268
+ "loss": 0.0918,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 9.615384615384615,
273
+ "grad_norm": 2.3066835403442383,
274
+ "learning_rate": 9.358922068483812e-05,
275
+ "loss": 0.094,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 9.923076923076923,
280
+ "grad_norm": 1.2779909372329712,
281
+ "learning_rate": 9.300745739476829e-05,
282
+ "loss": 0.0516,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 10.0,
287
+ "grad_norm": 2.4075942039489746,
288
+ "learning_rate": 9.24024048078213e-05,
289
+ "loss": 0.0477,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 10.307692307692308,
294
+ "grad_norm": 1.0170810222625732,
295
+ "learning_rate": 9.177439057064683e-05,
296
+ "loss": 0.0312,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 10.615384615384615,
301
+ "grad_norm": 2.0403945446014404,
302
+ "learning_rate": 9.112375476403312e-05,
303
+ "loss": 0.0624,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 10.923076923076923,
308
+ "grad_norm": 0.8645662665367126,
309
+ "learning_rate": 9.045084971874738e-05,
310
+ "loss": 0.0262,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 11.0,
315
+ "grad_norm": 5.61713171005249,
316
+ "learning_rate": 8.97560398247424e-05,
317
+ "loss": 0.0711,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 11.307692307692308,
322
+ "grad_norm": 0.8560450673103333,
323
+ "learning_rate": 8.903970133383297e-05,
324
+ "loss": 0.0232,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 11.615384615384615,
329
+ "grad_norm": 1.1219335794448853,
330
+ "learning_rate": 8.83022221559489e-05,
331
+ "loss": 0.0251,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 11.923076923076923,
336
+ "grad_norm": 1.4238826036453247,
337
+ "learning_rate": 8.754400164907497e-05,
338
+ "loss": 0.0402,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 12.0,
343
+ "grad_norm": 0.6608924269676208,
344
+ "learning_rate": 8.676545040299145e-05,
345
+ "loss": 0.0029,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 12.307692307692308,
350
+ "grad_norm": 2.1574368476867676,
351
+ "learning_rate": 8.596699001693255e-05,
352
+ "loss": 0.014,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 12.615384615384615,
357
+ "grad_norm": 0.9417891502380371,
358
+ "learning_rate": 8.51490528712831e-05,
359
+ "loss": 0.0173,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 12.923076923076923,
364
+ "grad_norm": 1.281510353088379,
365
+ "learning_rate": 8.43120818934367e-05,
366
+ "loss": 0.0275,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 13.0,
371
+ "grad_norm": 2.0283124446868896,
372
+ "learning_rate": 8.345653031794292e-05,
373
+ "loss": 0.0311,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 13.307692307692308,
378
+ "grad_norm": 0.5412298440933228,
379
+ "learning_rate": 8.258286144107276e-05,
380
+ "loss": 0.0094,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 13.615384615384615,
385
+ "grad_norm": 0.9629742503166199,
386
+ "learning_rate": 8.169154836993551e-05,
387
+ "loss": 0.0144,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 13.923076923076923,
392
+ "grad_norm": 0.6250266432762146,
393
+ "learning_rate": 8.07830737662829e-05,
394
+ "loss": 0.0121,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 14.0,
399
+ "grad_norm": 2.4746816158294678,
400
+ "learning_rate": 7.985792958513931e-05,
401
+ "loss": 0.0212,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 14.307692307692308,
406
+ "grad_norm": 0.6302680373191833,
407
+ "learning_rate": 7.891661680839932e-05,
408
+ "loss": 0.0092,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 14.615384615384615,
413
+ "grad_norm": 0.462685227394104,
414
+ "learning_rate": 7.795964517353735e-05,
415
+ "loss": 0.0066,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 14.923076923076923,
420
+ "grad_norm": 0.8047600388526917,
421
+ "learning_rate": 7.698753289757565e-05,
422
+ "loss": 0.0125,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 15.0,
427
+ "grad_norm": 1.3532655239105225,
428
+ "learning_rate": 7.600080639646077e-05,
429
+ "loss": 0.0094,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 15.307692307692308,
434
+ "grad_norm": 0.8325954079627991,
435
+ "learning_rate": 7.500000000000001e-05,
436
+ "loss": 0.0078,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 15.615384615384615,
441
+ "grad_norm": 0.5373042225837708,
442
+ "learning_rate": 7.398565566251232e-05,
443
+ "loss": 0.0065,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 15.923076923076923,
448
+ "grad_norm": 0.6049899458885193,
449
+ "learning_rate": 7.295832266935059e-05,
450
+ "loss": 0.0072,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 16.0,
455
+ "grad_norm": 0.6376262903213501,
456
+ "learning_rate": 7.191855733945387e-05,
457
+ "loss": 0.0061,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 16.307692307692307,
462
+ "grad_norm": 0.4146476089954376,
463
+ "learning_rate": 7.08669227240909e-05,
464
+ "loss": 0.0067,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 16.615384615384617,
469
+ "grad_norm": 0.901756763458252,
470
+ "learning_rate": 6.980398830195785e-05,
471
+ "loss": 0.0111,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 16.923076923076923,
476
+ "grad_norm": 0.1660463660955429,
477
+ "learning_rate": 6.873032967079561e-05,
478
+ "loss": 0.0031,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 17.0,
483
+ "grad_norm": 1.6091344356536865,
484
+ "learning_rate": 6.764652823569344e-05,
485
+ "loss": 0.0114,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 17.307692307692307,
490
+ "grad_norm": 0.17023883759975433,
491
+ "learning_rate": 6.65531708942479e-05,
492
+ "loss": 0.0028,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 17.615384615384617,
497
+ "grad_norm": 1.0435467958450317,
498
+ "learning_rate": 6.545084971874738e-05,
499
+ "loss": 0.0096,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 17.923076923076923,
504
+ "grad_norm": 0.6580948233604431,
505
+ "learning_rate": 6.434016163555452e-05,
506
+ "loss": 0.006,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 18.0,
511
+ "grad_norm": 1.8953274488449097,
512
+ "learning_rate": 6.322170810186012e-05,
513
+ "loss": 0.0119,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 18.307692307692307,
518
+ "grad_norm": 0.39621683955192566,
519
+ "learning_rate": 6.209609477998338e-05,
520
+ "loss": 0.0042,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 18.615384615384617,
525
+ "grad_norm": 0.3594362437725067,
526
+ "learning_rate": 6.096393120939516e-05,
527
  "loss": 0.0042,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 18.923076923076923,
532
+ "grad_norm": 0.8800605535507202,
533
+ "learning_rate": 5.982583047664151e-05,
534
+ "loss": 0.0096,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 19.0,
539
+ "grad_norm": 0.41055458784103394,
540
+ "learning_rate": 5.868240888334653e-05,
541
+ "loss": 0.0036,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 19.307692307692307,
546
+ "grad_norm": 0.3029349446296692,
547
+ "learning_rate": 5.753428561247416e-05,
548
+ "loss": 0.0026,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 19.615384615384617,
553
+ "grad_norm": 0.5970585346221924,
554
+ "learning_rate": 5.6382082393029746e-05,
555
+ "loss": 0.0078,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 19.923076923076923,
560
+ "grad_norm": 0.27087146043777466,
561
+ "learning_rate": 5.522642316338268e-05,
562
+ "loss": 0.0039,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 20.0,
567
+ "grad_norm": 0.8190112113952637,
568
+ "learning_rate": 5.4067933733392915e-05,
569
+ "loss": 0.0055,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 20.307692307692307,
574
+ "grad_norm": 0.19041061401367188,
575
+ "learning_rate": 5.290724144552379e-05,
576
+ "loss": 0.0016,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 20.615384615384617,
581
+ "grad_norm": 0.7743979692459106,
582
+ "learning_rate": 5.174497483512506e-05,
583
+ "loss": 0.0045,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 20.923076923076923,
588
+ "grad_norm": 0.21607236564159393,
589
+ "learning_rate": 5.0581763290069865e-05,
590
+ "loss": 0.0035,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 21.0,
595
+ "grad_norm": 1.6706500053405762,
596
+ "learning_rate": 4.941823670993016e-05,
597
+ "loss": 0.0133,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 21.307692307692307,
602
+ "grad_norm": 0.48242342472076416,
603
+ "learning_rate": 4.825502516487497e-05,
604
+ "loss": 0.0059,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 21.615384615384617,
609
+ "grad_norm": 0.11222591251134872,
610
+ "learning_rate": 4.709275855447621e-05,
611
+ "loss": 0.0025,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 21.923076923076923,
616
+ "grad_norm": 0.22460472583770752,
617
+ "learning_rate": 4.593206626660709e-05,
618
+ "loss": 0.0033,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 22.0,
623
+ "grad_norm": 0.39556336402893066,
624
+ "learning_rate": 4.477357683661734e-05,
625
+ "loss": 0.0024,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 22.307692307692307,
630
+ "grad_norm": 0.2808006703853607,
631
+ "learning_rate": 4.361791760697027e-05,
632
+ "loss": 0.0028,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 22.615384615384617,
637
+ "grad_norm": 0.3281514048576355,
638
+ "learning_rate": 4.246571438752585e-05,
639
+ "loss": 0.0029,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 22.923076923076923,
644
+ "grad_norm": 0.4327070116996765,
645
+ "learning_rate": 4.131759111665349e-05,
646
  "loss": 0.0023,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 23.0,
651
+ "grad_norm": 0.6825558543205261,
652
+ "learning_rate": 4.017416952335849e-05,
653
+ "loss": 0.0038,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 23.307692307692307,
658
+ "grad_norm": 0.5522281527519226,
659
+ "learning_rate": 3.903606879060483e-05,
660
+ "loss": 0.002,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 23.615384615384617,
665
+ "grad_norm": 0.06093262881040573,
666
+ "learning_rate": 3.790390522001662e-05,
667
+ "loss": 0.002,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 23.923076923076923,
672
+ "grad_norm": 0.1554577350616455,
673
+ "learning_rate": 3.67782918981399e-05,
674
+ "loss": 0.0021,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 24.0,
679
+ "grad_norm": 0.5829846858978271,
680
+ "learning_rate": 3.5659838364445505e-05,
681
+ "loss": 0.0031,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 24.307692307692307,
686
+ "grad_norm": 0.10798896104097366,
687
+ "learning_rate": 3.4549150281252636e-05,
688
  "loss": 0.0017,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 24.615384615384617,
693
+ "grad_norm": 0.0642886683344841,
694
+ "learning_rate": 3.34468291057521e-05,
695
  "loss": 0.0019,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 24.923076923076923,
700
+ "grad_norm": 0.08898573368787766,
701
+ "learning_rate": 3.235347176430656e-05,
702
+ "loss": 0.0016,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 25.0,
707
+ "grad_norm": 2.9850921630859375,
708
+ "learning_rate": 3.12696703292044e-05,
709
+ "loss": 0.0109,
710
  "step": 100
711
  },
712
  {
713
  "epoch": 25.307692307692307,
714
+ "grad_norm": 0.15037357807159424,
715
+ "learning_rate": 3.019601169804216e-05,
716
  "loss": 0.0012,
717
  "step": 101
718
  },
719
  {
720
  "epoch": 25.615384615384617,
721
+ "grad_norm": 0.45115193724632263,
722
+ "learning_rate": 2.9133077275909108e-05,
723
+ "loss": 0.0029,
724
  "step": 102
725
  },
726
  {
727
  "epoch": 25.923076923076923,
728
+ "grad_norm": 0.12441351264715195,
729
+ "learning_rate": 2.8081442660546125e-05,
730
+ "loss": 0.0021,
731
  "step": 103
732
  },
733
  {
734
  "epoch": 26.0,
735
+ "grad_norm": 0.03845607116818428,
736
+ "learning_rate": 2.7041677330649407e-05,
737
+ "loss": 0.0014,
738
  "step": 104
739
  },
740
  {
741
  "epoch": 26.307692307692307,
742
+ "grad_norm": 0.3710884153842926,
743
+ "learning_rate": 2.6014344337487707e-05,
744
+ "loss": 0.0024,
745
  "step": 105
746
  },
747
  {
748
  "epoch": 26.615384615384617,
749
+ "grad_norm": 0.0669671967625618,
750
+ "learning_rate": 2.500000000000001e-05,
751
+ "loss": 0.0016,
752
  "step": 106
753
  },
754
  {
755
  "epoch": 26.923076923076923,
756
+ "grad_norm": 0.0413970872759819,
757
+ "learning_rate": 2.399919360353923e-05,
758
+ "loss": 0.0016,
759
  "step": 107
760
  },
761
  {
762
  "epoch": 27.0,
763
+ "grad_norm": 0.08645425736904144,
764
+ "learning_rate": 2.3012467102424373e-05,
765
  "loss": 0.0014,
766
  "step": 108
767
  },
768
  {
769
  "epoch": 27.307692307692307,
770
+ "grad_norm": 0.18081574141979218,
771
+ "learning_rate": 2.2040354826462668e-05,
772
+ "loss": 0.0011,
773
  "step": 109
774
  },
775
  {
776
  "epoch": 27.615384615384617,
777
+ "grad_norm": 0.06019139289855957,
778
+ "learning_rate": 2.1083383191600674e-05,
779
+ "loss": 0.0017,
780
  "step": 110
781
  },
782
  {
783
  "epoch": 27.923076923076923,
784
+ "grad_norm": 0.03454792872071266,
785
+ "learning_rate": 2.0142070414860704e-05,
786
  "loss": 0.0014,
787
  "step": 111
788
  },
789
  {
790
  "epoch": 28.0,
791
+ "grad_norm": 0.08351138234138489,
792
+ "learning_rate": 1.9216926233717085e-05,
793
+ "loss": 0.0013,
794
  "step": 112
795
  },
796
  {
797
  "epoch": 28.307692307692307,
798
+ "grad_norm": 0.02913900464773178,
799
+ "learning_rate": 1.8308451630064484e-05,
800
  "loss": 0.0014,
801
  "step": 113
802
  },
803
  {
804
  "epoch": 28.615384615384617,
805
+ "grad_norm": 0.03412294760346413,
806
+ "learning_rate": 1.7417138558927244e-05,
807
+ "loss": 0.0012,
808
  "step": 114
809
  },
810
  {
811
  "epoch": 28.923076923076923,
812
+ "grad_norm": 0.0328608974814415,
813
+ "learning_rate": 1.6543469682057106e-05,
814
+ "loss": 0.0013,
815
  "step": 115
816
  },
817
  {
818
  "epoch": 29.0,
819
+ "grad_norm": 0.050466686487197876,
820
+ "learning_rate": 1.5687918106563326e-05,
821
  "loss": 0.0011,
822
  "step": 116
823
  },
824
  {
825
  "epoch": 29.307692307692307,
826
+ "grad_norm": 0.02442150004208088,
827
+ "learning_rate": 1.4850947128716913e-05,
828
+ "loss": 0.001,
829
  "step": 117
830
  },
831
  {
832
  "epoch": 29.615384615384617,
833
+ "grad_norm": 0.027474530041217804,
834
+ "learning_rate": 1.4033009983067452e-05,
835
+ "loss": 0.0015,
836
  "step": 118
837
  },
838
  {
839
  "epoch": 29.923076923076923,
840
+ "grad_norm": 0.024797696620225906,
841
+ "learning_rate": 1.3234549597008571e-05,
842
  "loss": 0.0013,
843
  "step": 119
844
  },
845
  {
846
  "epoch": 30.0,
847
+ "grad_norm": 0.029813682660460472,
848
+ "learning_rate": 1.245599835092504e-05,
849
+ "loss": 0.0012,
850
  "step": 120
851
  },
852
  {
853
  "epoch": 30.307692307692307,
854
+ "grad_norm": 0.01790359988808632,
855
+ "learning_rate": 1.1697777844051105e-05,
856
+ "loss": 0.001,
857
  "step": 121
858
  },
859
  {
860
  "epoch": 30.615384615384617,
861
+ "grad_norm": 0.021541791036725044,
862
+ "learning_rate": 1.096029866616704e-05,
863
  "loss": 0.0012,
864
  "step": 122
865
  },
866
  {
867
  "epoch": 30.923076923076923,
868
+ "grad_norm": 0.023409120738506317,
869
+ "learning_rate": 1.0243960175257606e-05,
870
  "loss": 0.0011,
871
  "step": 123
872
  },
873
  {
874
  "epoch": 31.0,
875
+ "grad_norm": 0.04634522646665573,
876
+ "learning_rate": 9.549150281252633e-06,
877
+ "loss": 0.0016,
878
  "step": 124
879
  },
880
  {
881
  "epoch": 31.307692307692307,
882
+ "grad_norm": 0.0230031069368124,
883
+ "learning_rate": 8.876245235966885e-06,
884
+ "loss": 0.0012,
885
  "step": 125
886
  },
887
  {
888
  "epoch": 31.615384615384617,
889
+ "grad_norm": 0.050953831523656845,
890
+ "learning_rate": 8.225609429353187e-06,
891
+ "loss": 0.001,
892
  "step": 126
893
  },
894
  {
895
  "epoch": 31.923076923076923,
896
+ "grad_norm": 0.03163566812872887,
897
+ "learning_rate": 7.597595192178702e-06,
898
+ "loss": 0.0012,
899
  "step": 127
900
  },
901
  {
902
  "epoch": 32.0,
903
+ "grad_norm": 0.03247598558664322,
904
+ "learning_rate": 6.992542605231739e-06,
905
+ "loss": 0.0014,
906
  "step": 128
907
  },
908
  {
909
  "epoch": 32.30769230769231,
910
+ "grad_norm": 0.013677124865353107,
911
+ "learning_rate": 6.410779315161886e-06,
912
+ "loss": 0.0009,
913
  "step": 129
914
  },
915
  {
916
  "epoch": 32.61538461538461,
917
+ "grad_norm": 0.026261860504746437,
918
+ "learning_rate": 5.852620357053651e-06,
919
+ "loss": 0.0011,
920
  "step": 130
921
  },
922
  {
923
  "epoch": 32.92307692307692,
924
+ "grad_norm": 0.021554652601480484,
925
+ "learning_rate": 5.318367983829392e-06,
926
+ "loss": 0.0014,
927
  "step": 131
928
  },
929
  {
930
  "epoch": 33.0,
931
+ "grad_norm": 0.048853594809770584,
932
+ "learning_rate": 4.8083115025739756e-06,
933
+ "loss": 0.0009,
934
  "step": 132
935
  },
936
  {
937
  "epoch": 33.30769230769231,
938
+ "grad_norm": 0.1052127406001091,
939
+ "learning_rate": 4.322727117869951e-06,
940
+ "loss": 0.0013,
941
  "step": 133
942
  },
943
  {
944
  "epoch": 33.61538461538461,
945
+ "grad_norm": 0.014833934605121613,
946
+ "learning_rate": 3.861877782227885e-06,
947
+ "loss": 0.0009,
948
  "step": 134
949
  },
950
  {
951
  "epoch": 33.92307692307692,
952
+ "grad_norm": 0.022332781925797462,
953
+ "learning_rate": 3.426013053692878e-06,
954
+ "loss": 0.0012,
955
  "step": 135
956
  },
957
  {
958
  "epoch": 34.0,
959
+ "grad_norm": 0.034213513135910034,
960
+ "learning_rate": 3.0153689607045845e-06,
961
+ "loss": 0.001,
962
  "step": 136
963
  },
964
  {
965
  "epoch": 34.30769230769231,
966
+ "grad_norm": 0.020968670025467873,
967
+ "learning_rate": 2.63016787428354e-06,
968
+ "loss": 0.0013,
969
  "step": 137
970
  },
971
  {
972
  "epoch": 34.61538461538461,
973
+ "grad_norm": 0.013379962183535099,
974
+ "learning_rate": 2.2706183876134045e-06,
975
+ "loss": 0.0007,
976
  "step": 138
977
  },
978
  {
979
  "epoch": 34.92307692307692,
980
+ "grad_norm": 0.020913399755954742,
981
+ "learning_rate": 1.9369152030840556e-06,
982
+ "loss": 0.0011,
983
  "step": 139
984
  },
985
  {
986
  "epoch": 35.0,
987
+ "grad_norm": 0.032495055347681046,
988
+ "learning_rate": 1.6292390268568104e-06,
989
+ "loss": 0.0014,
990
  "step": 140
991
  },
992
  {
993
  "epoch": 35.30769230769231,
994
+ "grad_norm": 0.01754385605454445,
995
+ "learning_rate": 1.3477564710088098e-06,
996
+ "loss": 0.0011,
997
  "step": 141
998
  },
999
  {
1000
  "epoch": 35.61538461538461,
1001
+ "grad_norm": 0.021996742114424706,
1002
+ "learning_rate": 1.0926199633097157e-06,
1003
+ "loss": 0.0011,
1004
  "step": 142
1005
  },
1006
  {
1007
  "epoch": 35.92307692307692,
1008
+ "grad_norm": 0.013828999362885952,
1009
+ "learning_rate": 8.639676646793382e-07,
1010
+ "loss": 0.0011,
1011
  "step": 143
1012
  },
1013
  {
1014
  "epoch": 36.0,
1015
+ "grad_norm": 0.0670580044388771,
1016
+ "learning_rate": 6.61923394371039e-07,
1017
+ "loss": 0.0012,
1018
  "step": 144
1019
  },
1020
  {
1021
  "epoch": 36.30769230769231,
1022
+ "grad_norm": 0.01171192154288292,
1023
+ "learning_rate": 4.865965629214819e-07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1024
  "loss": 0.0008,
1025
+ "step": 145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1026
  }
1027
  ],
1028
  "logging_steps": 1,
1029
+ "max_steps": 150,
1030
  "num_input_tokens_seen": 0,
1031
+ "num_train_epochs": 38,
1032
+ "save_steps": 5,
1033
  "stateful_callbacks": {
1034
  "TrainerControl": {
1035
  "args": {
 
1042
  "attributes": {}
1043
  }
1044
  },
1045
+ "total_flos": 4.279650186790502e+16,
1046
  "train_batch_size": 2,
1047
  "trial_name": null,
1048
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d0602db10057a2c9b0fdf15117d239f18bc4bbff9fed96a819f068221c63f8a
3
  size 6033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f19b144ff6256052340a4caed3d3358b46d4b6c87b9238f7d8f792f8dda85ea
3
  size 6033