nt-van-khanh commited on
Commit
865469d
·
verified ·
1 Parent(s): e8fa522

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "gate_proj",
34
- "up_proj",
35
  "down_proj",
36
- "o_proj",
 
 
37
  "k_proj",
38
- "q_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
 
32
  "down_proj",
33
+ "q_proj",
34
+ "up_proj",
35
+ "v_proj",
36
  "k_proj",
37
+ "o_proj",
38
+ "gate_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e39c5190b449964d6320622caa9cedc9fbf478ef850d1b88a909ea119aebf579
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83080fb8e776556dccbd8566a149036b04b680b1f7df6595087306d141fdf23b
3
  size 167832240
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:872ae2fbee06480034ccbfa235079202342578802cf8fff7f3dd693c59d9bba0
3
- size 335812922
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61f579989cf726fb56ac8be5726f4f2a782044d114d09e570f66648d859e60f2
3
+ size 85733206
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cebaa66b205ddfb8b31f5eb5a32b25b2bf5b2b20793922b5ca0b959d1b26d3ec
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebd8221843f67894d180101c2a23325b58062bbda224ad38cb35fd657d1b50d6
3
  size 14244
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac4a558c5b93581a5c41e2922404490319bcd15a10296c927a72a41012ff7f27
3
+ size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dce1dba4991ed6328501072e92b13c1d75d1f773bde8a373c913b4ccf6b6c264
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0587f905a8c9b2be59055a995476e795d7c0ee0915ef354e4295be6940ca474
3
  size 1064
trainer_state.json CHANGED
@@ -2,216 +2,216 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.0560695262125035,
6
- "eval_steps": 150,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.590625,
14
- "epoch": 0.0028034763106251754,
15
- "grad_norm": 0.9896485805511475,
16
  "learning_rate": 0.00016,
17
- "loss": 2.0673,
18
- "mean_token_accuracy": 0.5922202587127685,
19
- "num_tokens": 4193.0,
20
  "step": 5
21
  },
22
  {
23
- "entropy": 2.00361328125,
24
- "epoch": 0.005606952621250351,
25
- "grad_norm": 0.9075812101364136,
26
- "learning_rate": 0.0001989261744966443,
27
- "loss": 1.9308,
28
- "mean_token_accuracy": 0.6079224228858948,
29
- "num_tokens": 8218.0,
30
  "step": 10
31
  },
32
  {
33
- "entropy": 1.76513671875,
34
- "epoch": 0.008410428931875526,
35
- "grad_norm": 0.9663995504379272,
36
- "learning_rate": 0.00019758389261744966,
37
- "loss": 1.482,
38
- "mean_token_accuracy": 0.6631967008113862,
39
- "num_tokens": 14014.0,
40
  "step": 15
41
  },
42
  {
43
- "entropy": 1.4521484375,
44
- "epoch": 0.011213905242500702,
45
- "grad_norm": 0.6758019328117371,
46
- "learning_rate": 0.00019624161073825505,
47
- "loss": 1.4712,
48
- "mean_token_accuracy": 0.6659617304801941,
49
- "num_tokens": 19584.0,
50
  "step": 20
51
  },
52
  {
53
- "entropy": 1.56689453125,
54
- "epoch": 0.014017381553125876,
55
- "grad_norm": 0.9318982362747192,
56
- "learning_rate": 0.0001948993288590604,
57
- "loss": 1.3829,
58
- "mean_token_accuracy": 0.6853179931640625,
59
- "num_tokens": 24194.0,
60
  "step": 25
61
  },
62
  {
63
- "entropy": 1.5326171875,
64
- "epoch": 0.01682085786375105,
65
- "grad_norm": 0.934631884098053,
66
- "learning_rate": 0.0001935570469798658,
67
- "loss": 1.4176,
68
- "mean_token_accuracy": 0.6775492548942565,
69
- "num_tokens": 28392.0,
70
  "step": 30
71
  },
72
  {
73
- "entropy": 1.56787109375,
74
- "epoch": 0.019624334174376225,
75
- "grad_norm": 1.0512644052505493,
76
- "learning_rate": 0.00019221476510067115,
77
- "loss": 1.4598,
78
- "mean_token_accuracy": 0.6650450527667999,
79
- "num_tokens": 34574.0,
80
  "step": 35
81
  },
82
  {
83
- "entropy": 1.63740234375,
84
- "epoch": 0.022427810485001403,
85
- "grad_norm": 0.8699678182601929,
86
- "learning_rate": 0.00019087248322147653,
87
- "loss": 1.515,
88
- "mean_token_accuracy": 0.6554247856140136,
89
- "num_tokens": 40558.0,
90
  "step": 40
91
  },
92
  {
93
- "entropy": 1.60029296875,
94
- "epoch": 0.025231286795626577,
95
- "grad_norm": 0.7725480198860168,
96
- "learning_rate": 0.0001895302013422819,
97
- "loss": 1.4544,
98
- "mean_token_accuracy": 0.6661459267139435,
99
- "num_tokens": 46416.0,
100
  "step": 45
101
  },
102
  {
103
- "entropy": 1.53515625,
104
- "epoch": 0.02803476310625175,
105
- "grad_norm": 1.1531834602355957,
106
- "learning_rate": 0.00018818791946308724,
107
- "loss": 1.3171,
108
- "mean_token_accuracy": 0.6909306168556213,
109
- "num_tokens": 50005.0,
110
  "step": 50
111
  },
112
  {
113
- "entropy": 1.40400390625,
114
- "epoch": 0.03083823941687693,
115
- "grad_norm": 0.8206179738044739,
116
- "learning_rate": 0.00018684563758389263,
117
- "loss": 1.3265,
118
- "mean_token_accuracy": 0.6994331538677215,
119
- "num_tokens": 54550.0,
120
  "step": 55
121
  },
122
  {
123
- "entropy": 1.37587890625,
124
- "epoch": 0.0336417157275021,
125
- "grad_norm": 0.9528921246528625,
126
- "learning_rate": 0.00018550335570469799,
127
- "loss": 1.2327,
128
- "mean_token_accuracy": 0.6945347368717194,
129
- "num_tokens": 58738.0,
130
  "step": 60
131
  },
132
  {
133
- "entropy": 1.498828125,
134
- "epoch": 0.03644519203812728,
135
- "grad_norm": 0.7587556838989258,
136
- "learning_rate": 0.00018416107382550337,
137
- "loss": 1.3617,
138
- "mean_token_accuracy": 0.6782361149787903,
139
- "num_tokens": 62775.0,
140
  "step": 65
141
  },
142
  {
143
- "entropy": 1.47353515625,
144
- "epoch": 0.03924866834875245,
145
- "grad_norm": 0.817040741443634,
146
- "learning_rate": 0.00018281879194630873,
147
- "loss": 1.3694,
148
- "mean_token_accuracy": 0.6701848864555359,
149
- "num_tokens": 68528.0,
150
  "step": 70
151
  },
152
  {
153
- "entropy": 1.48798828125,
154
- "epoch": 0.04205214465937763,
155
- "grad_norm": 0.9585862159729004,
156
- "learning_rate": 0.0001814765100671141,
157
- "loss": 1.3025,
158
- "mean_token_accuracy": 0.6777489185333252,
159
- "num_tokens": 72845.0,
160
  "step": 75
161
  },
162
  {
163
- "entropy": 1.509765625,
164
- "epoch": 0.044855620970002806,
165
- "grad_norm": 0.9537568688392639,
166
- "learning_rate": 0.00018013422818791947,
167
- "loss": 1.4538,
168
- "mean_token_accuracy": 0.6687214255332947,
169
- "num_tokens": 77986.0,
170
  "step": 80
171
  },
172
  {
173
- "entropy": 1.43916015625,
174
- "epoch": 0.04765909728062798,
175
- "grad_norm": 0.6577709913253784,
176
- "learning_rate": 0.00017879194630872485,
177
- "loss": 1.2956,
178
- "mean_token_accuracy": 0.6882057845592499,
179
- "num_tokens": 83043.0,
180
  "step": 85
181
  },
182
  {
183
- "entropy": 1.31279296875,
184
- "epoch": 0.050462573591253154,
185
- "grad_norm": 1.0970784425735474,
186
- "learning_rate": 0.0001774496644295302,
187
- "loss": 1.2447,
188
- "mean_token_accuracy": 0.709147822856903,
189
- "num_tokens": 86893.0,
190
  "step": 90
191
  },
192
  {
193
- "entropy": 1.53671875,
194
- "epoch": 0.05326604990187833,
195
- "grad_norm": 0.8757184147834778,
196
- "learning_rate": 0.0001761073825503356,
197
- "loss": 1.4292,
198
- "mean_token_accuracy": 0.6783790171146393,
199
- "num_tokens": 91659.0,
200
  "step": 95
201
  },
202
  {
203
- "entropy": 1.52294921875,
204
- "epoch": 0.0560695262125035,
205
- "grad_norm": 0.858314037322998,
206
- "learning_rate": 0.00017476510067114095,
207
- "loss": 1.4,
208
- "mean_token_accuracy": 0.6753414094448089,
209
- "num_tokens": 95692.0,
210
  "step": 100
211
  }
212
  ],
213
  "logging_steps": 5,
214
- "max_steps": 750,
215
  "num_input_tokens_seen": 0,
216
  "num_train_epochs": 1,
217
  "save_steps": 100,
@@ -227,7 +227,7 @@
227
  "attributes": {}
228
  }
229
  },
230
- "total_flos": 4333049190580224.0,
231
  "train_batch_size": 1,
232
  "trial_name": null,
233
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.112139052425007,
6
+ "eval_steps": 179,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.696909672021866,
14
+ "epoch": 0.005606952621250351,
15
+ "grad_norm": 0.7663310170173645,
16
  "learning_rate": 0.00016,
17
+ "loss": 2.1596,
18
+ "mean_token_accuracy": 0.5812449663877487,
19
+ "num_tokens": 8218.0,
20
  "step": 5
21
  },
22
  {
23
+ "entropy": 1.875484037399292,
24
+ "epoch": 0.011213905242500702,
25
+ "grad_norm": 0.8613530397415161,
26
+ "learning_rate": 0.00019909808342728297,
27
+ "loss": 1.6298,
28
+ "mean_token_accuracy": 0.6346893429756164,
29
+ "num_tokens": 19584.0,
30
  "step": 10
31
  },
32
  {
33
+ "entropy": 1.6492740571498872,
34
+ "epoch": 0.01682085786375105,
35
+ "grad_norm": 0.8438306450843811,
36
+ "learning_rate": 0.0001979706877113867,
37
+ "loss": 1.448,
38
+ "mean_token_accuracy": 0.667498791217804,
39
+ "num_tokens": 28392.0,
40
  "step": 15
41
  },
42
  {
43
+ "entropy": 1.5424207150936127,
44
+ "epoch": 0.022427810485001403,
45
+ "grad_norm": 0.5755366086959839,
46
+ "learning_rate": 0.00019684329199549043,
47
+ "loss": 1.5714,
48
+ "mean_token_accuracy": 0.6525402277708053,
49
+ "num_tokens": 40558.0,
50
  "step": 20
51
  },
52
  {
53
+ "entropy": 1.6563467800617218,
54
+ "epoch": 0.02803476310625175,
55
+ "grad_norm": 0.640796422958374,
56
+ "learning_rate": 0.00019571589627959414,
57
+ "loss": 1.4843,
58
+ "mean_token_accuracy": 0.6737909287214279,
59
+ "num_tokens": 50005.0,
60
  "step": 25
61
  },
62
  {
63
+ "entropy": 1.477371919155121,
64
+ "epoch": 0.0336417157275021,
65
+ "grad_norm": 0.7493678331375122,
66
+ "learning_rate": 0.00019458850056369787,
67
+ "loss": 1.3474,
68
+ "mean_token_accuracy": 0.6935703039169312,
69
+ "num_tokens": 58738.0,
70
  "step": 30
71
  },
72
  {
73
+ "entropy": 1.464887660741806,
74
+ "epoch": 0.03924866834875245,
75
+ "grad_norm": 0.6396933794021606,
76
+ "learning_rate": 0.00019346110484780158,
77
+ "loss": 1.3963,
78
+ "mean_token_accuracy": 0.6689792603254319,
79
+ "num_tokens": 68528.0,
80
  "step": 35
81
  },
82
  {
83
+ "entropy": 1.4757700502872466,
84
+ "epoch": 0.044855620970002806,
85
+ "grad_norm": 0.5516763925552368,
86
+ "learning_rate": 0.0001923337091319053,
87
+ "loss": 1.4236,
88
+ "mean_token_accuracy": 0.6675747632980347,
89
+ "num_tokens": 77986.0,
90
  "step": 40
91
  },
92
  {
93
+ "entropy": 1.4118095993995667,
94
+ "epoch": 0.050462573591253154,
95
+ "grad_norm": 0.6395580172538757,
96
+ "learning_rate": 0.00019120631341600902,
97
+ "loss": 1.2766,
98
+ "mean_token_accuracy": 0.6935016334056854,
99
+ "num_tokens": 86893.0,
100
  "step": 45
101
  },
102
  {
103
+ "entropy": 1.4825987100601197,
104
+ "epoch": 0.0560695262125035,
105
+ "grad_norm": 0.7649742960929871,
106
+ "learning_rate": 0.00019007891770011275,
107
+ "loss": 1.4255,
108
+ "mean_token_accuracy": 0.6750761657953263,
109
+ "num_tokens": 95692.0,
110
  "step": 50
111
  },
112
  {
113
+ "entropy": 1.3713403642177582,
114
+ "epoch": 0.06167647883375386,
115
+ "grad_norm": 0.6055657863616943,
116
+ "learning_rate": 0.00018895152198421646,
117
+ "loss": 1.2919,
118
+ "mean_token_accuracy": 0.6923940628767014,
119
+ "num_tokens": 104810.0,
120
  "step": 55
121
  },
122
  {
123
+ "entropy": 1.3107981920242309,
124
+ "epoch": 0.0672834314550042,
125
+ "grad_norm": 0.932307243347168,
126
+ "learning_rate": 0.0001878241262683202,
127
+ "loss": 1.2072,
128
+ "mean_token_accuracy": 0.7068106323480606,
129
+ "num_tokens": 112767.0,
130
  "step": 60
131
  },
132
  {
133
+ "entropy": 1.290432232618332,
134
+ "epoch": 0.07289038407625456,
135
+ "grad_norm": 0.657538115978241,
136
+ "learning_rate": 0.00018669673055242392,
137
+ "loss": 1.1683,
138
+ "mean_token_accuracy": 0.714971786737442,
139
+ "num_tokens": 122104.0,
140
  "step": 65
141
  },
142
  {
143
+ "entropy": 1.3310194253921508,
144
+ "epoch": 0.0784973366975049,
145
+ "grad_norm": 0.5447025299072266,
146
+ "learning_rate": 0.00018556933483652763,
147
+ "loss": 1.3566,
148
+ "mean_token_accuracy": 0.6949202805757523,
149
+ "num_tokens": 132347.0,
150
  "step": 70
151
  },
152
  {
153
+ "entropy": 1.3567017048597336,
154
+ "epoch": 0.08410428931875526,
155
+ "grad_norm": 0.6126067042350769,
156
+ "learning_rate": 0.00018444193912063134,
157
+ "loss": 1.2616,
158
+ "mean_token_accuracy": 0.6886427521705627,
159
+ "num_tokens": 140294.0,
160
  "step": 75
161
  },
162
  {
163
+ "entropy": 1.3231679052114487,
164
+ "epoch": 0.08971124194000561,
165
+ "grad_norm": 0.5827459096908569,
166
+ "learning_rate": 0.00018331454340473507,
167
+ "loss": 1.2312,
168
+ "mean_token_accuracy": 0.6998110383749008,
169
+ "num_tokens": 149796.0,
170
  "step": 80
171
  },
172
  {
173
+ "entropy": 1.3795920431613922,
174
+ "epoch": 0.09531819456125595,
175
+ "grad_norm": 0.6522558331489563,
176
+ "learning_rate": 0.0001821871476888388,
177
+ "loss": 1.3163,
178
+ "mean_token_accuracy": 0.6797463029623032,
179
+ "num_tokens": 160094.0,
180
  "step": 85
181
  },
182
  {
183
+ "entropy": 1.4354715049266815,
184
+ "epoch": 0.10092514718250631,
185
+ "grad_norm": 0.5437538623809814,
186
+ "learning_rate": 0.0001810597519729425,
187
+ "loss": 1.4219,
188
+ "mean_token_accuracy": 0.6741667121648789,
189
+ "num_tokens": 167520.0,
190
  "step": 90
191
  },
192
  {
193
+ "entropy": 1.3719047516584397,
194
+ "epoch": 0.10653209980375666,
195
+ "grad_norm": 0.6490810513496399,
196
+ "learning_rate": 0.00017993235625704624,
197
+ "loss": 1.3259,
198
+ "mean_token_accuracy": 0.69256811439991,
199
+ "num_tokens": 177631.0,
200
  "step": 95
201
  },
202
  {
203
+ "entropy": 1.381699651479721,
204
+ "epoch": 0.112139052425007,
205
+ "grad_norm": 0.6738480925559998,
206
+ "learning_rate": 0.00017880496054114995,
207
+ "loss": 1.3181,
208
+ "mean_token_accuracy": 0.696441325545311,
209
+ "num_tokens": 186948.0,
210
  "step": 100
211
  }
212
  ],
213
  "logging_steps": 5,
214
+ "max_steps": 892,
215
  "num_input_tokens_seen": 0,
216
  "num_train_epochs": 1,
217
  "save_steps": 100,
 
227
  "attributes": {}
228
  }
229
  },
230
+ "total_flos": 8465230950137856.0,
231
  "train_batch_size": 1,
232
  "trial_name": null,
233
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8792b8c492ea3ce975f08186d26490bce1a0e166ad9cf7331c22520bc661bf75
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d41fa02013525705cb7a82d4f608a53737fdbc7baa1d76305c242ebd4e870e
3
  size 5816