nt-van-khanh commited on
Commit
1aa03a8
·
verified ·
1 Parent(s): 865469d

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "down_proj",
33
- "q_proj",
34
- "up_proj",
35
  "v_proj",
36
- "k_proj",
 
 
37
  "o_proj",
38
- "gate_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
 
32
  "v_proj",
33
+ "gate_proj",
34
+ "up_proj",
35
+ "down_proj",
36
  "o_proj",
37
+ "k_proj",
38
+ "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83080fb8e776556dccbd8566a149036b04b680b1f7df6595087306d141fdf23b
3
  size 167832240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2b45559552d06fe1c78ca0665c4d8888961c58b9cf95a2b6b5d1c38613465d3
3
  size 167832240
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61f579989cf726fb56ac8be5726f4f2a782044d114d09e570f66648d859e60f2
3
  size 85733206
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:448d48b2fd3f7e8ac497ceb48185550a3df0e7bbe24f8e58e2540b35dac37fa9
3
  size 85733206
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ebd8221843f67894d180101c2a23325b58062bbda224ad38cb35fd657d1b50d6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29cc7ab668b5b91e4d0d31bfeb0a9ac01024cb93ee27cea394110ef2ae77f6b5
3
  size 14244
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.112139052425007,
6
  "eval_steps": 179,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
@@ -10,203 +10,203 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "entropy": 1.696909672021866,
14
- "epoch": 0.005606952621250351,
15
- "grad_norm": 0.7663310170173645,
16
  "learning_rate": 0.00016,
17
- "loss": 2.1596,
18
- "mean_token_accuracy": 0.5812449663877487,
19
- "num_tokens": 8218.0,
20
  "step": 5
21
  },
22
  {
23
- "entropy": 1.875484037399292,
24
- "epoch": 0.011213905242500702,
25
- "grad_norm": 0.8613530397415161,
26
  "learning_rate": 0.00019909808342728297,
27
- "loss": 1.6298,
28
- "mean_token_accuracy": 0.6346893429756164,
29
- "num_tokens": 19584.0,
30
  "step": 10
31
  },
32
  {
33
- "entropy": 1.6492740571498872,
34
- "epoch": 0.01682085786375105,
35
- "grad_norm": 0.8438306450843811,
36
  "learning_rate": 0.0001979706877113867,
37
- "loss": 1.448,
38
- "mean_token_accuracy": 0.667498791217804,
39
- "num_tokens": 28392.0,
40
  "step": 15
41
  },
42
  {
43
- "entropy": 1.5424207150936127,
44
- "epoch": 0.022427810485001403,
45
- "grad_norm": 0.5755366086959839,
46
  "learning_rate": 0.00019684329199549043,
47
- "loss": 1.5714,
48
- "mean_token_accuracy": 0.6525402277708053,
49
- "num_tokens": 40558.0,
50
  "step": 20
51
  },
52
  {
53
- "entropy": 1.6563467800617218,
54
- "epoch": 0.02803476310625175,
55
- "grad_norm": 0.640796422958374,
56
  "learning_rate": 0.00019571589627959414,
57
- "loss": 1.4843,
58
- "mean_token_accuracy": 0.6737909287214279,
59
- "num_tokens": 50005.0,
60
  "step": 25
61
  },
62
  {
63
- "entropy": 1.477371919155121,
64
- "epoch": 0.0336417157275021,
65
- "grad_norm": 0.7493678331375122,
66
  "learning_rate": 0.00019458850056369787,
67
- "loss": 1.3474,
68
- "mean_token_accuracy": 0.6935703039169312,
69
- "num_tokens": 58738.0,
70
  "step": 30
71
  },
72
  {
73
- "entropy": 1.464887660741806,
74
- "epoch": 0.03924866834875245,
75
- "grad_norm": 0.6396933794021606,
76
  "learning_rate": 0.00019346110484780158,
77
- "loss": 1.3963,
78
- "mean_token_accuracy": 0.6689792603254319,
79
- "num_tokens": 68528.0,
80
  "step": 35
81
  },
82
  {
83
- "entropy": 1.4757700502872466,
84
- "epoch": 0.044855620970002806,
85
- "grad_norm": 0.5516763925552368,
86
  "learning_rate": 0.0001923337091319053,
87
- "loss": 1.4236,
88
- "mean_token_accuracy": 0.6675747632980347,
89
- "num_tokens": 77986.0,
90
  "step": 40
91
  },
92
  {
93
- "entropy": 1.4118095993995667,
94
- "epoch": 0.050462573591253154,
95
- "grad_norm": 0.6395580172538757,
96
  "learning_rate": 0.00019120631341600902,
97
- "loss": 1.2766,
98
- "mean_token_accuracy": 0.6935016334056854,
99
- "num_tokens": 86893.0,
100
  "step": 45
101
  },
102
  {
103
- "entropy": 1.4825987100601197,
104
- "epoch": 0.0560695262125035,
105
- "grad_norm": 0.7649742960929871,
106
  "learning_rate": 0.00019007891770011275,
107
- "loss": 1.4255,
108
- "mean_token_accuracy": 0.6750761657953263,
109
- "num_tokens": 95692.0,
110
  "step": 50
111
  },
112
  {
113
- "entropy": 1.3713403642177582,
114
- "epoch": 0.06167647883375386,
115
- "grad_norm": 0.6055657863616943,
116
  "learning_rate": 0.00018895152198421646,
117
- "loss": 1.2919,
118
- "mean_token_accuracy": 0.6923940628767014,
119
- "num_tokens": 104810.0,
120
  "step": 55
121
  },
122
  {
123
- "entropy": 1.3107981920242309,
124
- "epoch": 0.0672834314550042,
125
- "grad_norm": 0.932307243347168,
126
  "learning_rate": 0.0001878241262683202,
127
- "loss": 1.2072,
128
- "mean_token_accuracy": 0.7068106323480606,
129
- "num_tokens": 112767.0,
130
  "step": 60
131
  },
132
  {
133
- "entropy": 1.290432232618332,
134
- "epoch": 0.07289038407625456,
135
- "grad_norm": 0.657538115978241,
136
  "learning_rate": 0.00018669673055242392,
137
- "loss": 1.1683,
138
- "mean_token_accuracy": 0.714971786737442,
139
- "num_tokens": 122104.0,
140
  "step": 65
141
  },
142
  {
143
- "entropy": 1.3310194253921508,
144
- "epoch": 0.0784973366975049,
145
- "grad_norm": 0.5447025299072266,
146
  "learning_rate": 0.00018556933483652763,
147
- "loss": 1.3566,
148
- "mean_token_accuracy": 0.6949202805757523,
149
- "num_tokens": 132347.0,
150
  "step": 70
151
  },
152
  {
153
- "entropy": 1.3567017048597336,
154
- "epoch": 0.08410428931875526,
155
- "grad_norm": 0.6126067042350769,
156
  "learning_rate": 0.00018444193912063134,
157
- "loss": 1.2616,
158
- "mean_token_accuracy": 0.6886427521705627,
159
- "num_tokens": 140294.0,
160
  "step": 75
161
  },
162
  {
163
- "entropy": 1.3231679052114487,
164
- "epoch": 0.08971124194000561,
165
- "grad_norm": 0.5827459096908569,
166
  "learning_rate": 0.00018331454340473507,
167
- "loss": 1.2312,
168
- "mean_token_accuracy": 0.6998110383749008,
169
- "num_tokens": 149796.0,
170
  "step": 80
171
  },
172
  {
173
- "entropy": 1.3795920431613922,
174
- "epoch": 0.09531819456125595,
175
- "grad_norm": 0.6522558331489563,
176
  "learning_rate": 0.0001821871476888388,
177
- "loss": 1.3163,
178
- "mean_token_accuracy": 0.6797463029623032,
179
- "num_tokens": 160094.0,
180
  "step": 85
181
  },
182
  {
183
- "entropy": 1.4354715049266815,
184
- "epoch": 0.10092514718250631,
185
- "grad_norm": 0.5437538623809814,
186
  "learning_rate": 0.0001810597519729425,
187
- "loss": 1.4219,
188
- "mean_token_accuracy": 0.6741667121648789,
189
- "num_tokens": 167520.0,
190
  "step": 90
191
  },
192
  {
193
- "entropy": 1.3719047516584397,
194
- "epoch": 0.10653209980375666,
195
- "grad_norm": 0.6490810513496399,
196
  "learning_rate": 0.00017993235625704624,
197
- "loss": 1.3259,
198
- "mean_token_accuracy": 0.69256811439991,
199
- "num_tokens": 177631.0,
200
  "step": 95
201
  },
202
  {
203
- "entropy": 1.381699651479721,
204
- "epoch": 0.112139052425007,
205
- "grad_norm": 0.6738480925559998,
206
  "learning_rate": 0.00017880496054114995,
207
- "loss": 1.3181,
208
- "mean_token_accuracy": 0.696441325545311,
209
- "num_tokens": 186948.0,
210
  "step": 100
211
  }
212
  ],
@@ -227,7 +227,7 @@
227
  "attributes": {}
228
  }
229
  },
230
- "total_flos": 8465230950137856.0,
231
  "train_batch_size": 1,
232
  "trial_name": null,
233
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.11217049915872125,
6
  "eval_steps": 179,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "entropy": 1.5857114374637604,
14
+ "epoch": 0.005608524957936063,
15
+ "grad_norm": 0.6155414581298828,
16
  "learning_rate": 0.00016,
17
+ "loss": 1.9706,
18
+ "mean_token_accuracy": 0.6006834208965302,
19
+ "num_tokens": 8800.0,
20
  "step": 5
21
  },
22
  {
23
+ "entropy": 1.9297356605529785,
24
+ "epoch": 0.011217049915872126,
25
+ "grad_norm": 0.7395716905593872,
26
  "learning_rate": 0.00019909808342728297,
27
+ "loss": 1.7721,
28
+ "mean_token_accuracy": 0.6421857982873916,
29
+ "num_tokens": 17002.0,
30
  "step": 10
31
  },
32
  {
33
+ "entropy": 1.619718110561371,
34
+ "epoch": 0.01682557487380819,
35
+ "grad_norm": 0.6860098838806152,
36
  "learning_rate": 0.0001979706877113867,
37
+ "loss": 1.4967,
38
+ "mean_token_accuracy": 0.6729415714740753,
39
+ "num_tokens": 25283.0,
40
  "step": 15
41
  },
42
  {
43
+ "entropy": 1.5023219525814056,
44
+ "epoch": 0.022434099831744252,
45
+ "grad_norm": 0.5842112898826599,
46
  "learning_rate": 0.00019684329199549043,
47
+ "loss": 1.4969,
48
+ "mean_token_accuracy": 0.6710207283496856,
49
+ "num_tokens": 33505.0,
50
  "step": 20
51
  },
52
  {
53
+ "entropy": 1.6429471731185914,
54
+ "epoch": 0.028042624789680313,
55
+ "grad_norm": 0.5911830067634583,
56
  "learning_rate": 0.00019571589627959414,
57
+ "loss": 1.5174,
58
+ "mean_token_accuracy": 0.6597635358572006,
59
+ "num_tokens": 43397.0,
60
  "step": 25
61
  },
62
  {
63
+ "entropy": 1.6084780812263488,
64
+ "epoch": 0.03365114974761638,
65
+ "grad_norm": 0.7406187653541565,
66
  "learning_rate": 0.00019458850056369787,
67
+ "loss": 1.4757,
68
+ "mean_token_accuracy": 0.6691249191761017,
69
+ "num_tokens": 52556.0,
70
  "step": 30
71
  },
72
  {
73
+ "entropy": 1.4238544702529907,
74
+ "epoch": 0.03925967470555244,
75
+ "grad_norm": 0.611213743686676,
76
  "learning_rate": 0.00019346110484780158,
77
+ "loss": 1.4085,
78
+ "mean_token_accuracy": 0.692791685461998,
79
+ "num_tokens": 61579.0,
80
  "step": 35
81
  },
82
  {
83
+ "entropy": 1.3825733065605164,
84
+ "epoch": 0.044868199663488505,
85
+ "grad_norm": 0.6608020663261414,
86
  "learning_rate": 0.0001923337091319053,
87
+ "loss": 1.3955,
88
+ "mean_token_accuracy": 0.6937497437000275,
89
+ "num_tokens": 68479.0,
90
  "step": 40
91
  },
92
  {
93
+ "entropy": 1.4096888184547425,
94
+ "epoch": 0.050476724621424565,
95
+ "grad_norm": 0.5221259593963623,
96
  "learning_rate": 0.00019120631341600902,
97
+ "loss": 1.2979,
98
+ "mean_token_accuracy": 0.6925529271364212,
99
+ "num_tokens": 77911.0,
100
  "step": 45
101
  },
102
  {
103
+ "entropy": 1.3391252905130386,
104
+ "epoch": 0.056085249579360626,
105
+ "grad_norm": 0.6178808212280273,
106
  "learning_rate": 0.00019007891770011275,
107
+ "loss": 1.3092,
108
+ "mean_token_accuracy": 0.704131829738617,
109
+ "num_tokens": 86382.0,
110
  "step": 50
111
  },
112
  {
113
+ "entropy": 1.3084194093942643,
114
+ "epoch": 0.06169377453729669,
115
+ "grad_norm": 0.570563554763794,
116
  "learning_rate": 0.00018895152198421646,
117
+ "loss": 1.228,
118
+ "mean_token_accuracy": 0.6969290852546692,
119
+ "num_tokens": 94306.0,
120
  "step": 55
121
  },
122
  {
123
+ "entropy": 1.418030035495758,
124
+ "epoch": 0.06730229949523275,
125
+ "grad_norm": 0.6073914766311646,
126
  "learning_rate": 0.0001878241262683202,
127
+ "loss": 1.3252,
128
+ "mean_token_accuracy": 0.6798107504844666,
129
+ "num_tokens": 103567.0,
130
  "step": 60
131
  },
132
  {
133
+ "entropy": 1.5420262813568115,
134
+ "epoch": 0.07291082445316882,
135
+ "grad_norm": 0.4949992001056671,
136
  "learning_rate": 0.00018669673055242392,
137
+ "loss": 1.4262,
138
+ "mean_token_accuracy": 0.6678558409214019,
139
+ "num_tokens": 113923.0,
140
  "step": 65
141
  },
142
  {
143
+ "entropy": 1.3928685992956162,
144
+ "epoch": 0.07851934941110487,
145
+ "grad_norm": 0.5758721828460693,
146
  "learning_rate": 0.00018556933483652763,
147
+ "loss": 1.3512,
148
+ "mean_token_accuracy": 0.6777496755123138,
149
+ "num_tokens": 126199.0,
150
  "step": 70
151
  },
152
  {
153
+ "entropy": 1.336549162864685,
154
+ "epoch": 0.08412787436904094,
155
+ "grad_norm": 0.678063154220581,
156
  "learning_rate": 0.00018444193912063134,
157
+ "loss": 1.2387,
158
+ "mean_token_accuracy": 0.6992105931043625,
159
+ "num_tokens": 135737.0,
160
  "step": 75
161
  },
162
  {
163
+ "entropy": 1.5632237881422042,
164
+ "epoch": 0.08973639932697701,
165
+ "grad_norm": 0.5325204730033875,
166
  "learning_rate": 0.00018331454340473507,
167
+ "loss": 1.479,
168
+ "mean_token_accuracy": 0.6516371637582778,
169
+ "num_tokens": 146847.0,
170
  "step": 80
171
  },
172
  {
173
+ "entropy": 1.4212194442749024,
174
+ "epoch": 0.09534492428491306,
175
+ "grad_norm": 0.8020451664924622,
176
  "learning_rate": 0.0001821871476888388,
177
+ "loss": 1.3261,
178
+ "mean_token_accuracy": 0.6801748961210251,
179
+ "num_tokens": 154755.0,
180
  "step": 85
181
  },
182
  {
183
+ "entropy": 1.2850608110427857,
184
+ "epoch": 0.10095344924284913,
185
+ "grad_norm": 0.9955788254737854,
186
  "learning_rate": 0.0001810597519729425,
187
+ "loss": 1.1832,
188
+ "mean_token_accuracy": 0.7192482769489288,
189
+ "num_tokens": 162857.0,
190
  "step": 90
191
  },
192
  {
193
+ "entropy": 1.24569151699543,
194
+ "epoch": 0.1065619742007852,
195
+ "grad_norm": 0.6132731437683105,
196
  "learning_rate": 0.00017993235625704624,
197
+ "loss": 1.1905,
198
+ "mean_token_accuracy": 0.7139606773853302,
199
+ "num_tokens": 171381.0,
200
  "step": 95
201
  },
202
  {
203
+ "entropy": 1.3500551611185074,
204
+ "epoch": 0.11217049915872125,
205
+ "grad_norm": 0.604263186454773,
206
  "learning_rate": 0.00017880496054114995,
207
+ "loss": 1.3683,
208
+ "mean_token_accuracy": 0.6883647471666337,
209
+ "num_tokens": 179672.0,
210
  "step": 100
211
  }
212
  ],
 
227
  "attributes": {}
228
  }
229
  },
230
+ "total_flos": 8135764893302784.0,
231
  "train_batch_size": 1,
232
  "trial_name": null,
233
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79d41fa02013525705cb7a82d4f608a53737fdbc7baa1d76305c242ebd4e870e
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75cf331c13a33e1598e45e9013486c4013f3af1f377da7304e21a0d1c22d72cb
3
  size 5816