kuma-rtin committed on
Commit
c128bb2
·
verified ·
1 Parent(s): ff0b7be

Model save

Browse files
README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: apache-2.0
4
+ base_model: openai/whisper-base
5
+ tags:
6
+ - base_model:adapter:openai/whisper-base
7
+ - lora
8
+ - transformers
9
+ model-index:
10
+ - name: whisper_ft
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # whisper_ft
18
+
19
+ This model is a fine-tuned version of [openai/whisper-base](https://huggingface.co/openai/whisper-base) on an unknown dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 0.0002
39
+ - train_batch_size: 8
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
43
+ - lr_scheduler_type: cosine
44
+ - lr_scheduler_warmup_steps: 100
45
+ - num_epochs: 10
46
+ - mixed_precision_training: Native AMP
47
+
48
+ ### Framework versions
49
+
50
+ - PEFT 0.17.1
51
+ - Transformers 4.56.1
52
+ - PyTorch 2.8.0+cu128
53
+ - Datasets 3.6.0
54
+ - Tokenizers 0.22.0
adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "WhisperForConditionalGeneration",
5
+ "parent_library": "transformers.models.whisper.modeling_whisper"
6
+ },
7
+ "base_model_name_or_path": "openai/whisper-base",
8
+ "bias": "none",
9
+ "corda_config": null,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": "loftq",
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {
19
+ "loftq_bits": 8,
20
+ "loftq_iter": 1
21
+ },
22
+ "lora_alpha": 64,
23
+ "lora_bias": false,
24
+ "lora_dropout": 0.05,
25
+ "megatron_config": null,
26
+ "megatron_core": "megatron.core",
27
+ "modules_to_save": null,
28
+ "peft_type": "LORA",
29
+ "qalora_group_size": 16,
30
+ "r": 32,
31
+ "rank_pattern": {},
32
+ "revision": null,
33
+ "target_modules": [
34
+ "q_proj",
35
+ "v_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": null,
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4be761a32416e20ef24cdf5bfeb8f24db75d129c5c1078e76d3590ec5ddca8b3
3
+ size 4728680
preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 80,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "WhisperProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }
trainer_state.json ADDED
@@ -0,0 +1,1957 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 47000,
3
+ "best_metric": 0.41246128940763466,
4
+ "best_model_checkpoint": "/vol/whisper_ft/checkpoint-47000",
5
+ "epoch": 2.129264073104733,
6
+ "eval_steps": 500,
7
+ "global_step": 48000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011089917047420485,
14
+ "grad_norm": 1.1934534311294556,
15
+ "learning_rate": 0.00019999978422352354,
16
+ "loss": 1.6958,
17
+ "step": 250
18
+ },
19
+ {
20
+ "epoch": 0.02217983409484097,
21
+ "grad_norm": 1.2334566116333008,
22
+ "learning_rate": 0.00019999845269336546,
23
+ "loss": 1.3806,
24
+ "step": 500
25
+ },
26
+ {
27
+ "epoch": 0.02217983409484097,
28
+ "eval_bleu": 0.26383481798460395,
29
+ "step": 500,
30
+ "swisstext21_eval_bleu": 0.24588728491056722
31
+ },
32
+ {
33
+ "epoch": 0.03326975114226145,
34
+ "grad_norm": 1.3390406370162964,
35
+ "learning_rate": 0.00019999590627988633,
36
+ "loss": 1.3021,
37
+ "step": 750
38
+ },
39
+ {
40
+ "epoch": 0.04435966818968194,
41
+ "grad_norm": 1.4407659769058228,
42
+ "learning_rate": 0.00019999214501402262,
43
+ "loss": 1.2618,
44
+ "step": 1000
45
+ },
46
+ {
47
+ "epoch": 0.04435966818968194,
48
+ "eval_bleu": 0.29376917116497847,
49
+ "step": 1000,
50
+ "swisstext21_eval_bleu": 0.2559340674197411
51
+ },
52
+ {
53
+ "epoch": 0.055449585237102424,
54
+ "grad_norm": 1.732470154762268,
55
+ "learning_rate": 0.00019998716894147002,
56
+ "loss": 1.2498,
57
+ "step": 1250
58
+ },
59
+ {
60
+ "epoch": 0.0665395022845229,
61
+ "grad_norm": 1.6147369146347046,
62
+ "learning_rate": 0.00019998097812268297,
63
+ "loss": 1.2098,
64
+ "step": 1500
65
+ },
66
+ {
67
+ "epoch": 0.0665395022845229,
68
+ "eval_bleu": 0.30711942134345976,
69
+ "step": 1500,
70
+ "swisstext21_eval_bleu": 0.269003899480841
71
+ },
72
+ {
73
+ "epoch": 0.0776294193319434,
74
+ "grad_norm": 2.459263563156128,
75
+ "learning_rate": 0.00019997357263287384,
76
+ "loss": 1.2022,
77
+ "step": 1750
78
+ },
79
+ {
80
+ "epoch": 0.08871933637936388,
81
+ "grad_norm": 2.6925580501556396,
82
+ "learning_rate": 0.00019996495256201206,
83
+ "loss": 1.1799,
84
+ "step": 2000
85
+ },
86
+ {
87
+ "epoch": 0.08871933637936388,
88
+ "eval_bleu": 0.30569528721761224,
89
+ "step": 2000,
90
+ "swisstext21_eval_bleu": 0.27000582668120887
91
+ },
92
+ {
93
+ "epoch": 0.09980925342678437,
94
+ "grad_norm": 1.5046218633651733,
95
+ "learning_rate": 0.0001999551180148231,
96
+ "loss": 1.1461,
97
+ "step": 2250
98
+ },
99
+ {
100
+ "epoch": 0.11089917047420485,
101
+ "grad_norm": 1.556638479232788,
102
+ "learning_rate": 0.00019994406911078705,
103
+ "loss": 1.1438,
104
+ "step": 2500
105
+ },
106
+ {
107
+ "epoch": 0.11089917047420485,
108
+ "eval_bleu": 0.3315598406945841,
109
+ "step": 2500,
110
+ "swisstext21_eval_bleu": 0.2808034890747164
111
+ },
112
+ {
113
+ "epoch": 0.12198908752162534,
114
+ "grad_norm": 1.793188452720642,
115
+ "learning_rate": 0.00019993180598413727,
116
+ "loss": 1.1377,
117
+ "step": 2750
118
+ },
119
+ {
120
+ "epoch": 0.1330790045690458,
121
+ "grad_norm": 1.8901283740997314,
122
+ "learning_rate": 0.00019991832878385877,
123
+ "loss": 1.1102,
124
+ "step": 3000
125
+ },
126
+ {
127
+ "epoch": 0.1330790045690458,
128
+ "eval_bleu": 0.3286964300064891,
129
+ "step": 3000,
130
+ "swisstext21_eval_bleu": 0.2869587788387092
131
+ },
132
+ {
133
+ "epoch": 0.1441689216164663,
134
+ "grad_norm": 2.159233808517456,
135
+ "learning_rate": 0.00019990363767368634,
136
+ "loss": 1.0988,
137
+ "step": 3250
138
+ },
139
+ {
140
+ "epoch": 0.1552588386638868,
141
+ "grad_norm": 2.3724594116210938,
142
+ "learning_rate": 0.00019988773283210258,
143
+ "loss": 1.1046,
144
+ "step": 3500
145
+ },
146
+ {
147
+ "epoch": 0.1552588386638868,
148
+ "eval_bleu": 0.33459475803966965,
149
+ "step": 3500,
150
+ "swisstext21_eval_bleu": 0.29295814557090094
151
+ },
152
+ {
153
+ "epoch": 0.16634875571130728,
154
+ "grad_norm": 1.7678097486495972,
155
+ "learning_rate": 0.00019987061445233577,
156
+ "loss": 1.1092,
157
+ "step": 3750
158
+ },
159
+ {
160
+ "epoch": 0.17743867275872777,
161
+ "grad_norm": 2.1687381267547607,
162
+ "learning_rate": 0.00019985228274235742,
163
+ "loss": 1.0759,
164
+ "step": 4000
165
+ },
166
+ {
167
+ "epoch": 0.17743867275872777,
168
+ "eval_bleu": 0.34760286238068167,
169
+ "step": 4000,
170
+ "swisstext21_eval_bleu": 0.2908780491365115
171
+ },
172
+ {
173
+ "epoch": 0.18852858980614826,
174
+ "grad_norm": 1.9985744953155518,
175
+ "learning_rate": 0.00019983273792487989,
176
+ "loss": 1.0808,
177
+ "step": 4250
178
+ },
179
+ {
180
+ "epoch": 0.19961850685356874,
181
+ "grad_norm": 1.8689627647399902,
182
+ "learning_rate": 0.0001998119802373536,
183
+ "loss": 1.0705,
184
+ "step": 4500
185
+ },
186
+ {
187
+ "epoch": 0.19961850685356874,
188
+ "eval_bleu": 0.34319683823977476,
189
+ "step": 4500,
190
+ "swisstext21_eval_bleu": 0.28856541001986674
191
+ },
192
+ {
193
+ "epoch": 0.21070842390098923,
194
+ "grad_norm": 2.0760931968688965,
195
+ "learning_rate": 0.00019979000993196412,
196
+ "loss": 1.0473,
197
+ "step": 4750
198
+ },
199
+ {
200
+ "epoch": 0.2217983409484097,
201
+ "grad_norm": 1.9144140481948853,
202
+ "learning_rate": 0.00019976682727562914,
203
+ "loss": 1.0731,
204
+ "step": 5000
205
+ },
206
+ {
207
+ "epoch": 0.2217983409484097,
208
+ "eval_bleu": 0.3442873001843364,
209
+ "step": 5000,
210
+ "swisstext21_eval_bleu": 0.28945781960876255
211
+ },
212
+ {
213
+ "epoch": 0.23288825799583018,
214
+ "grad_norm": 2.0428848266601562,
215
+ "learning_rate": 0.00019974243254999524,
216
+ "loss": 1.0473,
217
+ "step": 5250
218
+ },
219
+ {
220
+ "epoch": 0.24397817504325067,
221
+ "grad_norm": 2.340768814086914,
222
+ "learning_rate": 0.0001997168260514345,
223
+ "loss": 1.0434,
224
+ "step": 5500
225
+ },
226
+ {
227
+ "epoch": 0.24397817504325067,
228
+ "eval_bleu": 0.3553012116107563,
229
+ "step": 5500,
230
+ "swisstext21_eval_bleu": 0.2917851023260039
231
+ },
232
+ {
233
+ "epoch": 0.25506809209067116,
234
+ "grad_norm": 2.007636785507202,
235
+ "learning_rate": 0.00019969000809104078,
236
+ "loss": 1.0606,
237
+ "step": 5750
238
+ },
239
+ {
240
+ "epoch": 0.2661580091380916,
241
+ "grad_norm": 1.9096291065216064,
242
+ "learning_rate": 0.00019966197899462607,
243
+ "loss": 1.0455,
244
+ "step": 6000
245
+ },
246
+ {
247
+ "epoch": 0.2661580091380916,
248
+ "eval_bleu": 0.35127993097271876,
249
+ "step": 6000,
250
+ "swisstext21_eval_bleu": 0.2929261661819239
251
+ },
252
+ {
253
+ "epoch": 0.27724792618551214,
254
+ "grad_norm": 1.8328126668930054,
255
+ "learning_rate": 0.00019963273910271644,
256
+ "loss": 1.0379,
257
+ "step": 6250
258
+ },
259
+ {
260
+ "epoch": 0.2883378432329326,
261
+ "grad_norm": 2.2776389122009277,
262
+ "learning_rate": 0.00019960228877054798,
263
+ "loss": 1.0634,
264
+ "step": 6500
265
+ },
266
+ {
267
+ "epoch": 0.2883378432329326,
268
+ "eval_bleu": 0.3593740208185027,
269
+ "step": 6500,
270
+ "swisstext21_eval_bleu": 0.2988206584421688
271
+ },
272
+ {
273
+ "epoch": 0.2994277602803531,
274
+ "grad_norm": 2.0759193897247314,
275
+ "learning_rate": 0.0001995706283680624,
276
+ "loss": 1.0503,
277
+ "step": 6750
278
+ },
279
+ {
280
+ "epoch": 0.3105176773277736,
281
+ "grad_norm": 2.1185827255249023,
282
+ "learning_rate": 0.0001995377582799026,
283
+ "loss": 1.034,
284
+ "step": 7000
285
+ },
286
+ {
287
+ "epoch": 0.3105176773277736,
288
+ "eval_bleu": 0.3624353947369105,
289
+ "step": 7000,
290
+ "swisstext21_eval_bleu": 0.3041434672016612
291
+ },
292
+ {
293
+ "epoch": 0.3216075943751941,
294
+ "grad_norm": 2.1475605964660645,
295
+ "learning_rate": 0.00019950367890540798,
296
+ "loss": 0.9958,
297
+ "step": 7250
298
+ },
299
+ {
300
+ "epoch": 0.33269751142261456,
301
+ "grad_norm": 2.6123814582824707,
302
+ "learning_rate": 0.00019946839065860956,
303
+ "loss": 0.999,
304
+ "step": 7500
305
+ },
306
+ {
307
+ "epoch": 0.33269751142261456,
308
+ "eval_bleu": 0.36432220231953555,
309
+ "step": 7500,
310
+ "swisstext21_eval_bleu": 0.30350836299630507
311
+ },
312
+ {
313
+ "epoch": 0.343787428470035,
314
+ "grad_norm": 1.8161005973815918,
315
+ "learning_rate": 0.00019943189396822507,
316
+ "loss": 0.9979,
317
+ "step": 7750
318
+ },
319
+ {
320
+ "epoch": 0.35487734551745553,
321
+ "grad_norm": 2.436647653579712,
322
+ "learning_rate": 0.00019939418927765348,
323
+ "loss": 1.0018,
324
+ "step": 8000
325
+ },
326
+ {
327
+ "epoch": 0.35487734551745553,
328
+ "eval_bleu": 0.35839847923496915,
329
+ "step": 8000,
330
+ "swisstext21_eval_bleu": 0.3043209488144041
331
+ },
332
+ {
333
+ "epoch": 0.365967262564876,
334
+ "grad_norm": 2.308076858520508,
335
+ "learning_rate": 0.00019935527704496994,
336
+ "loss": 0.9902,
337
+ "step": 8250
338
+ },
339
+ {
340
+ "epoch": 0.3770571796122965,
341
+ "grad_norm": 2.606699228286743,
342
+ "learning_rate": 0.00019931515774291998,
343
+ "loss": 0.9822,
344
+ "step": 8500
345
+ },
346
+ {
347
+ "epoch": 0.3770571796122965,
348
+ "eval_bleu": 0.36044795502424115,
349
+ "step": 8500,
350
+ "swisstext21_eval_bleu": 0.3064253970521744
351
+ },
352
+ {
353
+ "epoch": 0.388147096659717,
354
+ "grad_norm": 2.556300163269043,
355
+ "learning_rate": 0.0001992738318589138,
356
+ "loss": 1.0149,
357
+ "step": 8750
358
+ },
359
+ {
360
+ "epoch": 0.3992370137071375,
361
+ "grad_norm": 2.8726866245269775,
362
+ "learning_rate": 0.00019923129989502054,
363
+ "loss": 0.9838,
364
+ "step": 9000
365
+ },
366
+ {
367
+ "epoch": 0.3992370137071375,
368
+ "eval_bleu": 0.37244848523119756,
369
+ "step": 9000,
370
+ "swisstext21_eval_bleu": 0.3129379937656933
371
+ },
372
+ {
373
+ "epoch": 0.41032693075455795,
374
+ "grad_norm": 2.598942279815674,
375
+ "learning_rate": 0.00019918756236796185,
376
+ "loss": 0.9566,
377
+ "step": 9250
378
+ },
379
+ {
380
+ "epoch": 0.42141684780197847,
381
+ "grad_norm": 2.4860658645629883,
382
+ "learning_rate": 0.0001991426198091059,
383
+ "loss": 0.9935,
384
+ "step": 9500
385
+ },
386
+ {
387
+ "epoch": 0.42141684780197847,
388
+ "eval_bleu": 0.3737906725916547,
389
+ "step": 9500,
390
+ "swisstext21_eval_bleu": 0.3083854991004289
391
+ },
392
+ {
393
+ "epoch": 0.43250676484939893,
394
+ "grad_norm": 2.1095190048217773,
395
+ "learning_rate": 0.00019909647276446078,
396
+ "loss": 0.9678,
397
+ "step": 9750
398
+ },
399
+ {
400
+ "epoch": 0.4435966818968194,
401
+ "grad_norm": 2.4634251594543457,
402
+ "learning_rate": 0.00019904912179466794,
403
+ "loss": 0.9953,
404
+ "step": 10000
405
+ },
406
+ {
407
+ "epoch": 0.4435966818968194,
408
+ "eval_bleu": 0.3665470193277728,
409
+ "step": 10000,
410
+ "swisstext21_eval_bleu": 0.3026636658490007
411
+ },
412
+ {
413
+ "epoch": 0.4546865989442399,
414
+ "grad_norm": 2.3925271034240723,
415
+ "learning_rate": 0.00019900056747499528,
416
+ "loss": 0.9665,
417
+ "step": 10250
418
+ },
419
+ {
420
+ "epoch": 0.46577651599166037,
421
+ "grad_norm": 2.091660261154175,
422
+ "learning_rate": 0.00019895081039533028,
423
+ "loss": 0.9742,
424
+ "step": 10500
425
+ },
426
+ {
427
+ "epoch": 0.46577651599166037,
428
+ "eval_bleu": 0.36861395481685405,
429
+ "step": 10500,
430
+ "swisstext21_eval_bleu": 0.3054061636591619
431
+ },
432
+ {
433
+ "epoch": 0.4768664330390809,
434
+ "grad_norm": 2.4169013500213623,
435
+ "learning_rate": 0.00019889985116017275,
436
+ "loss": 0.9737,
437
+ "step": 10750
438
+ },
439
+ {
440
+ "epoch": 0.48795635008650134,
441
+ "grad_norm": 2.641219139099121,
442
+ "learning_rate": 0.00019884769038862752,
443
+ "loss": 0.9644,
444
+ "step": 11000
445
+ },
446
+ {
447
+ "epoch": 0.48795635008650134,
448
+ "eval_bleu": 0.36434503459432904,
449
+ "step": 11000,
450
+ "swisstext21_eval_bleu": 0.29839491482098374
451
+ },
452
+ {
453
+ "epoch": 0.49904626713392186,
454
+ "grad_norm": 2.1267800331115723,
455
+ "learning_rate": 0.0001987943287143969,
456
+ "loss": 0.9495,
457
+ "step": 11250
458
+ },
459
+ {
460
+ "epoch": 0.5101361841813423,
461
+ "grad_norm": 2.882136344909668,
462
+ "learning_rate": 0.00019873976678577303,
463
+ "loss": 0.9463,
464
+ "step": 11500
465
+ },
466
+ {
467
+ "epoch": 0.5101361841813423,
468
+ "eval_bleu": 0.3740910429878766,
469
+ "step": 11500,
470
+ "swisstext21_eval_bleu": 0.3090743441007289
471
+ },
472
+ {
473
+ "epoch": 0.5212261012287628,
474
+ "grad_norm": 3.2668380737304688,
475
+ "learning_rate": 0.00019868400526562986,
476
+ "loss": 0.9392,
477
+ "step": 11750
478
+ },
479
+ {
480
+ "epoch": 0.5323160182761832,
481
+ "grad_norm": 2.5321474075317383,
482
+ "learning_rate": 0.00019862704483141538,
483
+ "loss": 0.9619,
484
+ "step": 12000
485
+ },
486
+ {
487
+ "epoch": 0.5323160182761832,
488
+ "eval_bleu": 0.36756283810138785,
489
+ "step": 12000,
490
+ "swisstext21_eval_bleu": 0.3048907871619846
491
+ },
492
+ {
493
+ "epoch": 0.5434059353236038,
494
+ "grad_norm": 2.021146535873413,
495
+ "learning_rate": 0.00019856888617514305,
496
+ "loss": 0.971,
497
+ "step": 12250
498
+ },
499
+ {
500
+ "epoch": 0.5544958523710243,
501
+ "grad_norm": 2.465663194656372,
502
+ "learning_rate": 0.00019850953000338364,
503
+ "loss": 0.9584,
504
+ "step": 12500
505
+ },
506
+ {
507
+ "epoch": 0.5544958523710243,
508
+ "eval_bleu": 0.3669124274171023,
509
+ "step": 12500,
510
+ "swisstext21_eval_bleu": 0.3044579034347087
511
+ },
512
+ {
513
+ "epoch": 0.5655857694184447,
514
+ "grad_norm": 2.527566432952881,
515
+ "learning_rate": 0.00019844897703725657,
516
+ "loss": 0.9479,
517
+ "step": 12750
518
+ },
519
+ {
520
+ "epoch": 0.5766756864658652,
521
+ "grad_norm": 2.43293833732605,
522
+ "learning_rate": 0.00019838722801242102,
523
+ "loss": 0.9627,
524
+ "step": 13000
525
+ },
526
+ {
527
+ "epoch": 0.5766756864658652,
528
+ "eval_bleu": 0.3896419547996463,
529
+ "step": 13000,
530
+ "swisstext21_eval_bleu": 0.3107247914347159
531
+ },
532
+ {
533
+ "epoch": 0.5877656035132858,
534
+ "grad_norm": 2.9251341819763184,
535
+ "learning_rate": 0.00019832428367906726,
536
+ "loss": 0.9336,
537
+ "step": 13250
538
+ },
539
+ {
540
+ "epoch": 0.5988555205607062,
541
+ "grad_norm": 1.883135199546814,
542
+ "learning_rate": 0.00019826014480190735,
543
+ "loss": 0.9602,
544
+ "step": 13500
545
+ },
546
+ {
547
+ "epoch": 0.5988555205607062,
548
+ "eval_bleu": 0.38262014029410524,
549
+ "step": 13500,
550
+ "swisstext21_eval_bleu": 0.3081525753078253
551
+ },
552
+ {
553
+ "epoch": 0.6099454376081267,
554
+ "grad_norm": 2.635322332382202,
555
+ "learning_rate": 0.00019819481216016581,
556
+ "loss": 0.9449,
557
+ "step": 13750
558
+ },
559
+ {
560
+ "epoch": 0.6210353546555472,
561
+ "grad_norm": 2.1079013347625732,
562
+ "learning_rate": 0.00019812828654757036,
563
+ "loss": 0.9389,
564
+ "step": 14000
565
+ },
566
+ {
567
+ "epoch": 0.6210353546555472,
568
+ "eval_bleu": 0.3855927266497669,
569
+ "step": 14000,
570
+ "swisstext21_eval_bleu": 0.30831521418263236
571
+ },
572
+ {
573
+ "epoch": 0.6321252717029676,
574
+ "grad_norm": 2.3937461376190186,
575
+ "learning_rate": 0.00019806056877234203,
576
+ "loss": 0.9186,
577
+ "step": 14250
578
+ },
579
+ {
580
+ "epoch": 0.6432151887503882,
581
+ "grad_norm": 2.3645012378692627,
582
+ "learning_rate": 0.0001979916596571855,
583
+ "loss": 0.9262,
584
+ "step": 14500
585
+ },
586
+ {
587
+ "epoch": 0.6432151887503882,
588
+ "eval_bleu": 0.38679790640795114,
589
+ "step": 14500,
590
+ "swisstext21_eval_bleu": 0.30320803115793465
591
+ },
592
+ {
593
+ "epoch": 0.6543051057978087,
594
+ "grad_norm": 2.6536011695861816,
595
+ "learning_rate": 0.00019792156003927908,
596
+ "loss": 0.9367,
597
+ "step": 14750
598
+ },
599
+ {
600
+ "epoch": 0.6653950228452291,
601
+ "grad_norm": 2.554568290710449,
602
+ "learning_rate": 0.0001978502707702645,
603
+ "loss": 0.9376,
604
+ "step": 15000
605
+ },
606
+ {
607
+ "epoch": 0.6653950228452291,
608
+ "eval_bleu": 0.38779759574648714,
609
+ "step": 15000,
610
+ "swisstext21_eval_bleu": 0.31106129606447
611
+ },
612
+ {
613
+ "epoch": 0.6764849398926496,
614
+ "grad_norm": 2.6350646018981934,
615
+ "learning_rate": 0.00019777779271623667,
616
+ "loss": 0.9491,
617
+ "step": 15250
618
+ },
619
+ {
620
+ "epoch": 0.68757485694007,
621
+ "grad_norm": 2.5811619758605957,
622
+ "learning_rate": 0.0001977041267577329,
623
+ "loss": 0.9336,
624
+ "step": 15500
625
+ },
626
+ {
627
+ "epoch": 0.68757485694007,
628
+ "eval_bleu": 0.3875376232865486,
629
+ "step": 15500,
630
+ "swisstext21_eval_bleu": 0.31103764077844936
631
+ },
632
+ {
633
+ "epoch": 0.6986647739874906,
634
+ "grad_norm": 2.780806064605713,
635
+ "learning_rate": 0.00019762927378972258,
636
+ "loss": 0.8984,
637
+ "step": 15750
638
+ },
639
+ {
640
+ "epoch": 0.7097546910349111,
641
+ "grad_norm": 2.4621236324310303,
642
+ "learning_rate": 0.00019755323472159594,
643
+ "loss": 0.9099,
644
+ "step": 16000
645
+ },
646
+ {
647
+ "epoch": 0.7097546910349111,
648
+ "eval_bleu": 0.38610772811380073,
649
+ "step": 16000,
650
+ "swisstext21_eval_bleu": 0.3148091200691657
651
+ },
652
+ {
653
+ "epoch": 0.7208446080823315,
654
+ "grad_norm": 2.343982219696045,
655
+ "learning_rate": 0.00019747601047715325,
656
+ "loss": 0.9183,
657
+ "step": 16250
658
+ },
659
+ {
660
+ "epoch": 0.731934525129752,
661
+ "grad_norm": 2.3189568519592285,
662
+ "learning_rate": 0.00019739760199459348,
663
+ "loss": 0.9384,
664
+ "step": 16500
665
+ },
666
+ {
667
+ "epoch": 0.731934525129752,
668
+ "eval_bleu": 0.39072795790688153,
669
+ "step": 16500,
670
+ "swisstext21_eval_bleu": 0.3164304311232893
671
+ },
672
+ {
673
+ "epoch": 0.7430244421771726,
674
+ "grad_norm": 3.279153823852539,
675
+ "learning_rate": 0.00019731801022650295,
676
+ "loss": 0.9353,
677
+ "step": 16750
678
+ },
679
+ {
680
+ "epoch": 0.754114359224593,
681
+ "grad_norm": 2.482011079788208,
682
+ "learning_rate": 0.0001972372361398438,
683
+ "loss": 0.8947,
684
+ "step": 17000
685
+ },
686
+ {
687
+ "epoch": 0.754114359224593,
688
+ "eval_bleu": 0.39613006097078557,
689
+ "step": 17000,
690
+ "swisstext21_eval_bleu": 0.3207937159426819
691
+ },
692
+ {
693
+ "epoch": 0.7652042762720135,
694
+ "grad_norm": 2.11147403717041,
695
+ "learning_rate": 0.000197155280715942,
696
+ "loss": 0.9199,
697
+ "step": 17250
698
+ },
699
+ {
700
+ "epoch": 0.776294193319434,
701
+ "grad_norm": 2.464308977127075,
702
+ "learning_rate": 0.00019707214495047584,
703
+ "loss": 0.9243,
704
+ "step": 17500
705
+ },
706
+ {
707
+ "epoch": 0.776294193319434,
708
+ "eval_bleu": 0.38742231043625974,
709
+ "step": 17500,
710
+ "swisstext21_eval_bleu": 0.30187657356550057
711
+ },
712
+ {
713
+ "epoch": 0.7873841103668544,
714
+ "grad_norm": 2.7994627952575684,
715
+ "learning_rate": 0.00019698782985346343,
716
+ "loss": 0.9019,
717
+ "step": 17750
718
+ },
719
+ {
720
+ "epoch": 0.798474027414275,
721
+ "grad_norm": 2.315702438354492,
722
+ "learning_rate": 0.0001969023364492507,
723
+ "loss": 0.9211,
724
+ "step": 18000
725
+ },
726
+ {
727
+ "epoch": 0.798474027414275,
728
+ "eval_bleu": 0.385437022847082,
729
+ "step": 18000,
730
+ "swisstext21_eval_bleu": 0.30319223545431473
731
+ },
732
+ {
733
+ "epoch": 0.8095639444616954,
734
+ "grad_norm": 2.745156764984131,
735
+ "learning_rate": 0.00019681566577649882,
736
+ "loss": 0.9317,
737
+ "step": 18250
738
+ },
739
+ {
740
+ "epoch": 0.8206538615091159,
741
+ "grad_norm": 2.8551061153411865,
742
+ "learning_rate": 0.00019672781888817166,
743
+ "loss": 0.913,
744
+ "step": 18500
745
+ },
746
+ {
747
+ "epoch": 0.8206538615091159,
748
+ "eval_bleu": 0.3928160809300483,
749
+ "step": 18500,
750
+ "swisstext21_eval_bleu": 0.3170691369085918
751
+ },
752
+ {
753
+ "epoch": 0.8317437785565364,
754
+ "grad_norm": 2.358586072921753,
755
+ "learning_rate": 0.00019663879685152287,
756
+ "loss": 0.8985,
757
+ "step": 18750
758
+ },
759
+ {
760
+ "epoch": 0.8428336956039569,
761
+ "grad_norm": 2.516564130783081,
762
+ "learning_rate": 0.0001965486007480831,
763
+ "loss": 0.9167,
764
+ "step": 19000
765
+ },
766
+ {
767
+ "epoch": 0.8428336956039569,
768
+ "eval_bleu": 0.4001014194326336,
769
+ "step": 19000,
770
+ "swisstext21_eval_bleu": 0.3163929503602198
771
+ },
772
+ {
773
+ "epoch": 0.8539236126513774,
774
+ "grad_norm": 3.280510902404785,
775
+ "learning_rate": 0.0001964572316736467,
776
+ "loss": 0.887,
777
+ "step": 19250
778
+ },
779
+ {
780
+ "epoch": 0.8650135296987979,
781
+ "grad_norm": 2.5148253440856934,
782
+ "learning_rate": 0.00019636469073825853,
783
+ "loss": 0.8978,
784
+ "step": 19500
785
+ },
786
+ {
787
+ "epoch": 0.8650135296987979,
788
+ "eval_bleu": 0.3898774974713507,
789
+ "step": 19500,
790
+ "swisstext21_eval_bleu": 0.30876470114528226
791
+ },
792
+ {
793
+ "epoch": 0.8761034467462183,
794
+ "grad_norm": 2.336972951889038,
795
+ "learning_rate": 0.00019627097906620034,
796
+ "loss": 0.9011,
797
+ "step": 19750
798
+ },
799
+ {
800
+ "epoch": 0.8871933637936388,
801
+ "grad_norm": 2.353951930999756,
802
+ "learning_rate": 0.00019617609779597724,
803
+ "loss": 0.8937,
804
+ "step": 20000
805
+ },
806
+ {
807
+ "epoch": 0.8871933637936388,
808
+ "eval_bleu": 0.3855965532285254,
809
+ "step": 20000,
810
+ "swisstext21_eval_bleu": 0.3116831772809819
811
+ },
812
+ {
813
+ "epoch": 0.8982832808410594,
814
+ "grad_norm": 2.4216506481170654,
815
+ "learning_rate": 0.00019608004808030377,
816
+ "loss": 0.9127,
817
+ "step": 20250
818
+ },
819
+ {
820
+ "epoch": 0.9093731978884798,
821
+ "grad_norm": 2.7430005073547363,
822
+ "learning_rate": 0.00019598283108608996,
823
+ "loss": 0.8907,
824
+ "step": 20500
825
+ },
826
+ {
827
+ "epoch": 0.9093731978884798,
828
+ "eval_bleu": 0.38969433786646573,
829
+ "step": 20500,
830
+ "swisstext21_eval_bleu": 0.30565766059011035
831
+ },
832
+ {
833
+ "epoch": 0.9204631149359003,
834
+ "grad_norm": 2.8106558322906494,
835
+ "learning_rate": 0.00019588444799442718,
836
+ "loss": 0.8992,
837
+ "step": 20750
838
+ },
839
+ {
840
+ "epoch": 0.9315530319833207,
841
+ "grad_norm": 3.0954487323760986,
842
+ "learning_rate": 0.00019578490000057364,
843
+ "loss": 0.8886,
844
+ "step": 21000
845
+ },
846
+ {
847
+ "epoch": 0.9315530319833207,
848
+ "eval_bleu": 0.39285462338175686,
849
+ "step": 21000,
850
+ "swisstext21_eval_bleu": 0.30451958390372663
851
+ },
852
+ {
853
+ "epoch": 0.9426429490307412,
854
+ "grad_norm": 3.1268038749694824,
855
+ "learning_rate": 0.00019568418831394008,
856
+ "loss": 0.8599,
857
+ "step": 21250
858
+ },
859
+ {
860
+ "epoch": 0.9537328660781618,
861
+ "grad_norm": 3.3307135105133057,
862
+ "learning_rate": 0.00019558231415807485,
863
+ "loss": 0.8636,
864
+ "step": 21500
865
+ },
866
+ {
867
+ "epoch": 0.9537328660781618,
868
+ "eval_bleu": 0.3950705551480444,
869
+ "step": 21500,
870
+ "swisstext21_eval_bleu": 0.31237385318867816
871
+ },
872
+ {
873
+ "epoch": 0.9648227831255822,
874
+ "grad_norm": 2.284541606903076,
875
+ "learning_rate": 0.00019547927877064928,
876
+ "loss": 0.8719,
877
+ "step": 21750
878
+ },
879
+ {
880
+ "epoch": 0.9759127001730027,
881
+ "grad_norm": 2.83581805229187,
882
+ "learning_rate": 0.00019537508340344246,
883
+ "loss": 0.9002,
884
+ "step": 22000
885
+ },
886
+ {
887
+ "epoch": 0.9759127001730027,
888
+ "eval_bleu": 0.3921707237395287,
889
+ "step": 22000,
890
+ "swisstext21_eval_bleu": 0.3145926745569422
891
+ },
892
+ {
893
+ "epoch": 0.9870026172204232,
894
+ "grad_norm": 2.0217525959014893,
895
+ "learning_rate": 0.00019526972932232611,
896
+ "loss": 0.8814,
897
+ "step": 22250
898
+ },
899
+ {
900
+ "epoch": 0.9980925342678437,
901
+ "grad_norm": 2.3832855224609375,
902
+ "learning_rate": 0.0001951632178072492,
903
+ "loss": 0.889,
904
+ "step": 22500
905
+ },
906
+ {
907
+ "epoch": 0.9980925342678437,
908
+ "eval_bleu": 0.38714935228629954,
909
+ "step": 22500,
910
+ "swisstext21_eval_bleu": 0.30922215825464777
911
+ },
912
+ {
913
+ "epoch": 1.009182451315264,
914
+ "grad_norm": 2.6860873699188232,
915
+ "learning_rate": 0.00019505555015222238,
916
+ "loss": 0.8236,
917
+ "step": 22750
918
+ },
919
+ {
920
+ "epoch": 1.0202723683626846,
921
+ "grad_norm": 2.8335115909576416,
922
+ "learning_rate": 0.0001949467276653023,
923
+ "loss": 0.8179,
924
+ "step": 23000
925
+ },
926
+ {
927
+ "epoch": 1.0202723683626846,
928
+ "eval_bleu": 0.38838724982256007,
929
+ "step": 23000,
930
+ "swisstext21_eval_bleu": 0.31582029041010873
931
+ },
932
+ {
933
+ "epoch": 1.0313622854101052,
934
+ "grad_norm": 3.070591449737549,
935
+ "learning_rate": 0.0001948367516685756,
936
+ "loss": 0.859,
937
+ "step": 23250
938
+ },
939
+ {
940
+ "epoch": 1.0424522024575256,
941
+ "grad_norm": 2.3931617736816406,
942
+ "learning_rate": 0.00019472562349814304,
943
+ "loss": 0.8491,
944
+ "step": 23500
945
+ },
946
+ {
947
+ "epoch": 1.0424522024575256,
948
+ "eval_bleu": 0.38063312076817646,
949
+ "step": 23500,
950
+ "swisstext21_eval_bleu": 0.3102516351199856
951
+ },
952
+ {
953
+ "epoch": 1.0535421195049461,
954
+ "grad_norm": 2.414862632751465,
955
+ "learning_rate": 0.00019461334450410314,
956
+ "loss": 0.8583,
957
+ "step": 23750
958
+ },
959
+ {
960
+ "epoch": 1.0646320365523665,
961
+ "grad_norm": 2.6890299320220947,
962
+ "learning_rate": 0.0001944999160505357,
963
+ "loss": 0.8407,
964
+ "step": 24000
965
+ },
966
+ {
967
+ "epoch": 1.0646320365523665,
968
+ "eval_bleu": 0.3982753114306131,
969
+ "step": 24000,
970
+ "swisstext21_eval_bleu": 0.3128261642735386
971
+ },
972
+ {
973
+ "epoch": 1.075721953599787,
974
+ "grad_norm": 2.8066024780273438,
975
+ "learning_rate": 0.00019438533951548546,
976
+ "loss": 0.8521,
977
+ "step": 24250
978
+ },
979
+ {
980
+ "epoch": 1.0868118706472076,
981
+ "grad_norm": 2.857349157333374,
982
+ "learning_rate": 0.00019426961629094515,
983
+ "loss": 0.8455,
984
+ "step": 24500
985
+ },
986
+ {
987
+ "epoch": 1.0868118706472076,
988
+ "eval_bleu": 0.38830354801904543,
989
+ "step": 24500,
990
+ "swisstext21_eval_bleu": 0.3040715120580202
991
+ },
992
+ {
993
+ "epoch": 1.097901787694628,
994
+ "grad_norm": 2.4639742374420166,
995
+ "learning_rate": 0.00019415274778283875,
996
+ "loss": 0.834,
997
+ "step": 24750
998
+ },
999
+ {
1000
+ "epoch": 1.1089917047420486,
1001
+ "grad_norm": 2.9022200107574463,
1002
+ "learning_rate": 0.00019403473541100414,
1003
+ "loss": 0.8401,
1004
+ "step": 25000
1005
+ },
1006
+ {
1007
+ "epoch": 1.1089917047420486,
1008
+ "eval_bleu": 0.3936498808339177,
1009
+ "step": 25000,
1010
+ "swisstext21_eval_bleu": 0.30703476528382184
1011
+ },
1012
+ {
1013
+ "epoch": 1.1200816217894691,
1014
+ "grad_norm": 2.418813943862915,
1015
+ "learning_rate": 0.00019391558060917615,
1016
+ "loss": 0.8549,
1017
+ "step": 25250
1018
+ },
1019
+ {
1020
+ "epoch": 1.1311715388368895,
1021
+ "grad_norm": 3.5514161586761475,
1022
+ "learning_rate": 0.00019379528482496905,
1023
+ "loss": 0.8373,
1024
+ "step": 25500
1025
+ },
1026
+ {
1027
+ "epoch": 1.1311715388368895,
1028
+ "eval_bleu": 0.3949608777582217,
1029
+ "step": 25500,
1030
+ "swisstext21_eval_bleu": 0.2991127273200187
1031
+ },
1032
+ {
1033
+ "epoch": 1.14226145588431,
1034
+ "grad_norm": 2.8393402099609375,
1035
+ "learning_rate": 0.0001936738495198588,
1036
+ "loss": 0.8305,
1037
+ "step": 25750
1038
+ },
1039
+ {
1040
+ "epoch": 1.1533513729317304,
1041
+ "grad_norm": 2.3504981994628906,
1042
+ "learning_rate": 0.00019355127616916558,
1043
+ "loss": 0.8391,
1044
+ "step": 26000
1045
+ },
1046
+ {
1047
+ "epoch": 1.1533513729317304,
1048
+ "eval_bleu": 0.39885045498118465,
1049
+ "step": 26000,
1050
+ "swisstext21_eval_bleu": 0.3008743660411916
1051
+ },
1052
+ {
1053
+ "epoch": 1.164441289979151,
1054
+ "grad_norm": 3.1513872146606445,
1055
+ "learning_rate": 0.00019342756626203553,
1056
+ "loss": 0.8151,
1057
+ "step": 26250
1058
+ },
1059
+ {
1060
+ "epoch": 1.1755312070265713,
1061
+ "grad_norm": 3.1882948875427246,
1062
+ "learning_rate": 0.00019330272130142298,
1063
+ "loss": 0.8451,
1064
+ "step": 26500
1065
+ },
1066
+ {
1067
+ "epoch": 1.1755312070265713,
1068
+ "eval_bleu": 0.39897615108571005,
1069
+ "step": 26500,
1070
+ "swisstext21_eval_bleu": 0.30551010767982617
1071
+ },
1072
+ {
1073
+ "epoch": 1.186621124073992,
1074
+ "grad_norm": 2.0949442386627197,
1075
+ "learning_rate": 0.00019317674280407203,
1076
+ "loss": 0.8558,
1077
+ "step": 26750
1078
+ },
1079
+ {
1080
+ "epoch": 1.1977110411214125,
1081
+ "grad_norm": 2.0875446796417236,
1082
+ "learning_rate": 0.00019304963230049808,
1083
+ "loss": 0.8596,
1084
+ "step": 27000
1085
+ },
1086
+ {
1087
+ "epoch": 1.1977110411214125,
1088
+ "eval_bleu": 0.3997092911184819,
1089
+ "step": 27000,
1090
+ "swisstext21_eval_bleu": 0.31326727973416657
1091
+ },
1092
+ {
1093
+ "epoch": 1.2088009581688328,
1094
+ "grad_norm": 2.8534743785858154,
1095
+ "learning_rate": 0.00019292139133496932,
1096
+ "loss": 0.8365,
1097
+ "step": 27250
1098
+ },
1099
+ {
1100
+ "epoch": 1.2198908752162534,
1101
+ "grad_norm": 3.247786521911621,
1102
+ "learning_rate": 0.00019279202146548795,
1103
+ "loss": 0.848,
1104
+ "step": 27500
1105
+ },
1106
+ {
1107
+ "epoch": 1.2198908752162534,
1108
+ "eval_bleu": 0.3939229666106703,
1109
+ "step": 27500,
1110
+ "swisstext21_eval_bleu": 0.31058011025928695
1111
+ },
1112
+ {
1113
+ "epoch": 1.230980792263674,
1114
+ "grad_norm": 2.812893867492676,
1115
+ "learning_rate": 0.00019266152426377134,
1116
+ "loss": 0.8296,
1117
+ "step": 27750
1118
+ },
1119
+ {
1120
+ "epoch": 1.2420707093110943,
1121
+ "grad_norm": 3.173842430114746,
1122
+ "learning_rate": 0.00019252990131523264,
1123
+ "loss": 0.843,
1124
+ "step": 28000
1125
+ },
1126
+ {
1127
+ "epoch": 1.2420707093110943,
1128
+ "eval_bleu": 0.39727487714618787,
1129
+ "step": 28000,
1130
+ "swisstext21_eval_bleu": 0.30997666108081895
1131
+ },
1132
+ {
1133
+ "epoch": 1.2531606263585149,
1134
+ "grad_norm": 2.501375436782837,
1135
+ "learning_rate": 0.00019239715421896195,
1136
+ "loss": 0.8296,
1137
+ "step": 28250
1138
+ },
1139
+ {
1140
+ "epoch": 1.2642505434059355,
1141
+ "grad_norm": 2.394441843032837,
1142
+ "learning_rate": 0.00019226328458770648,
1143
+ "loss": 0.8259,
1144
+ "step": 28500
1145
+ },
1146
+ {
1147
+ "epoch": 1.2642505434059355,
1148
+ "eval_bleu": 0.3951440149567978,
1149
+ "step": 28500,
1150
+ "swisstext21_eval_bleu": 0.3082165747905684
1151
+ },
1152
+ {
1153
+ "epoch": 1.2753404604533558,
1154
+ "grad_norm": 2.3463571071624756,
1155
+ "learning_rate": 0.00019212829404785127,
1156
+ "loss": 0.8372,
1157
+ "step": 28750
1158
+ },
1159
+ {
1160
+ "epoch": 1.2864303775007762,
1161
+ "grad_norm": 2.619544744491577,
1162
+ "learning_rate": 0.0001919921842393992,
1163
+ "loss": 0.8499,
1164
+ "step": 29000
1165
+ },
1166
+ {
1167
+ "epoch": 1.2864303775007762,
1168
+ "eval_bleu": 0.3997716708387876,
1169
+ "step": 29000,
1170
+ "swisstext21_eval_bleu": 0.31001057617223277
1171
+ },
1172
+ {
1173
+ "epoch": 1.2975202945481967,
1174
+ "grad_norm": 2.7703518867492676,
1175
+ "learning_rate": 0.00019185495681595121,
1176
+ "loss": 0.8255,
1177
+ "step": 29250
1178
+ },
1179
+ {
1180
+ "epoch": 1.3086102115956173,
1181
+ "grad_norm": 2.2880280017852783,
1182
+ "learning_rate": 0.00019171661344468625,
1183
+ "loss": 0.843,
1184
+ "step": 29500
1185
+ },
1186
+ {
1187
+ "epoch": 1.3086102115956173,
1188
+ "eval_bleu": 0.3990674495806238,
1189
+ "step": 29500,
1190
+ "swisstext21_eval_bleu": 0.3045854425199765
1191
+ },
1192
+ {
1193
+ "epoch": 1.3197001286430377,
1194
+ "grad_norm": 2.427645206451416,
1195
+ "learning_rate": 0.00019157715580634082,
1196
+ "loss": 0.8331,
1197
+ "step": 29750
1198
+ },
1199
+ {
1200
+ "epoch": 1.3307900456904582,
1201
+ "grad_norm": 2.514282703399658,
1202
+ "learning_rate": 0.00019143658559518873,
1203
+ "loss": 0.8303,
1204
+ "step": 30000
1205
+ },
1206
+ {
1207
+ "epoch": 1.3307900456904582,
1208
+ "eval_bleu": 0.39857650020404584,
1209
+ "step": 30000,
1210
+ "swisstext21_eval_bleu": 0.30778486046472064
1211
+ },
1212
+ {
1213
+ "epoch": 1.3418799627378788,
1214
+ "grad_norm": 2.3313047885894775,
1215
+ "learning_rate": 0.00019129490451902042,
1216
+ "loss": 0.7886,
1217
+ "step": 30250
1218
+ },
1219
+ {
1220
+ "epoch": 1.3529698797852991,
1221
+ "grad_norm": 2.250128746032715,
1222
+ "learning_rate": 0.00019115211429912237,
1223
+ "loss": 0.802,
1224
+ "step": 30500
1225
+ },
1226
+ {
1227
+ "epoch": 1.3529698797852991,
1228
+ "eval_bleu": 0.40068405768800824,
1229
+ "step": 30500,
1230
+ "swisstext21_eval_bleu": 0.3109150438405035
1231
+ },
1232
+ {
1233
+ "epoch": 1.3640597968327197,
1234
+ "grad_norm": 4.564970016479492,
1235
+ "learning_rate": 0.0001910082166702559,
1236
+ "loss": 0.7921,
1237
+ "step": 30750
1238
+ },
1239
+ {
1240
+ "epoch": 1.3751497138801403,
1241
+ "grad_norm": 3.340827703475952,
1242
+ "learning_rate": 0.00019086321338063642,
1243
+ "loss": 0.8016,
1244
+ "step": 31000
1245
+ },
1246
+ {
1247
+ "epoch": 1.3751497138801403,
1248
+ "eval_bleu": 0.3907764612746887,
1249
+ "step": 31000,
1250
+ "swisstext21_eval_bleu": 0.307514371169132
1251
+ },
1252
+ {
1253
+ "epoch": 1.3862396309275606,
1254
+ "grad_norm": 3.442969560623169,
1255
+ "learning_rate": 0.00019071710619191192,
1256
+ "loss": 0.8281,
1257
+ "step": 31250
1258
+ },
1259
+ {
1260
+ "epoch": 1.3973295479749812,
1261
+ "grad_norm": 3.0045955181121826,
1262
+ "learning_rate": 0.00019056989687914178,
1263
+ "loss": 0.82,
1264
+ "step": 31500
1265
+ },
1266
+ {
1267
+ "epoch": 1.3973295479749812,
1268
+ "eval_bleu": 0.4070777777979427,
1269
+ "step": 31500,
1270
+ "swisstext21_eval_bleu": 0.3133119978708261
1271
+ },
1272
+ {
1273
+ "epoch": 1.4084194650224016,
1274
+ "grad_norm": 2.9076082706451416,
1275
+ "learning_rate": 0.000190421587230775,
1276
+ "loss": 0.7708,
1277
+ "step": 31750
1278
+ },
1279
+ {
1280
+ "epoch": 1.4195093820698221,
1281
+ "grad_norm": 2.446475028991699,
1282
+ "learning_rate": 0.0001902721790486287,
1283
+ "loss": 0.8103,
1284
+ "step": 32000
1285
+ },
1286
+ {
1287
+ "epoch": 1.4195093820698221,
1288
+ "eval_bleu": 0.40894432730315045,
1289
+ "step": 32000,
1290
+ "swisstext21_eval_bleu": 0.31024336675207975
1291
+ },
1292
+ {
1293
+ "epoch": 1.4305992991172425,
1294
+ "grad_norm": 2.476989507675171,
1295
+ "learning_rate": 0.00019012167414786602,
1296
+ "loss": 0.8046,
1297
+ "step": 32250
1298
+ },
1299
+ {
1300
+ "epoch": 1.441689216164663,
1301
+ "grad_norm": 2.7591421604156494,
1302
+ "learning_rate": 0.00018997007435697416,
1303
+ "loss": 0.8286,
1304
+ "step": 32500
1305
+ },
1306
+ {
1307
+ "epoch": 1.441689216164663,
1308
+ "eval_bleu": 0.4030474309836763,
1309
+ "step": 32500,
1310
+ "swisstext21_eval_bleu": 0.3089112511767638
1311
+ },
1312
+ {
1313
+ "epoch": 1.4527791332120836,
1314
+ "grad_norm": 3.6329638957977295,
1315
+ "learning_rate": 0.00018981738151774223,
1316
+ "loss": 0.8007,
1317
+ "step": 32750
1318
+ },
1319
+ {
1320
+ "epoch": 1.463869050259504,
1321
+ "grad_norm": 3.2174901962280273,
1322
+ "learning_rate": 0.00018966359748523882,
1323
+ "loss": 0.8132,
1324
+ "step": 33000
1325
+ },
1326
+ {
1327
+ "epoch": 1.463869050259504,
1328
+ "eval_bleu": 0.40291037790441286,
1329
+ "step": 33000,
1330
+ "swisstext21_eval_bleu": 0.30738722055101736
1331
+ },
1332
+ {
1333
+ "epoch": 1.4749589673069246,
1334
+ "grad_norm": 3.520242929458618,
1335
+ "learning_rate": 0.0001895087241277893,
1336
+ "loss": 0.8109,
1337
+ "step": 33250
1338
+ },
1339
+ {
1340
+ "epoch": 1.4860488843543451,
1341
+ "grad_norm": 2.2728867530822754,
1342
+ "learning_rate": 0.00018935276332695343,
1343
+ "loss": 0.7985,
1344
+ "step": 33500
1345
+ },
1346
+ {
1347
+ "epoch": 1.4860488843543451,
1348
+ "eval_bleu": 0.40080037575712874,
1349
+ "step": 33500,
1350
+ "swisstext21_eval_bleu": 0.30433362941606407
1351
+ },
1352
+ {
1353
+ "epoch": 1.4971388014017655,
1354
+ "grad_norm": 2.971421718597412,
1355
+ "learning_rate": 0.0001891957169775023,
1356
+ "loss": 0.7932,
1357
+ "step": 33750
1358
+ },
1359
+ {
1360
+ "epoch": 1.508228718449186,
1361
+ "grad_norm": 2.5034966468811035,
1362
+ "learning_rate": 0.00018903758698739527,
1363
+ "loss": 0.7919,
1364
+ "step": 34000
1365
+ },
1366
+ {
1367
+ "epoch": 1.508228718449186,
1368
+ "eval_bleu": 0.40767117955379845,
1369
+ "step": 34000,
1370
+ "swisstext21_eval_bleu": 0.30671494308688607
1371
+ },
1372
+ {
1373
+ "epoch": 1.5193186354966066,
1374
+ "grad_norm": 2.4627349376678467,
1375
+ "learning_rate": 0.000188878375277757,
1376
+ "loss": 0.783,
1377
+ "step": 34250
1378
+ },
1379
+ {
1380
+ "epoch": 1.530408552544027,
1381
+ "grad_norm": 2.71600604057312,
1382
+ "learning_rate": 0.00018871808378285388,
1383
+ "loss": 0.8108,
1384
+ "step": 34500
1385
+ },
1386
+ {
1387
+ "epoch": 1.530408552544027,
1388
+ "eval_bleu": 0.394797486057362,
1389
+ "step": 34500,
1390
+ "swisstext21_eval_bleu": 0.3087865162589988
1391
+ },
1392
+ {
1393
+ "epoch": 1.5414984695914473,
1394
+ "grad_norm": 3.0158140659332275,
1395
+ "learning_rate": 0.00018855671445007076,
1396
+ "loss": 0.8127,
1397
+ "step": 34750
1398
+ },
1399
+ {
1400
+ "epoch": 1.552588386638868,
1401
+ "grad_norm": 2.9670796394348145,
1402
+ "learning_rate": 0.00018839426923988696,
1403
+ "loss": 0.8098,
1404
+ "step": 35000
1405
+ },
1406
+ {
1407
+ "epoch": 1.552588386638868,
1408
+ "eval_bleu": 0.3992581934406838,
1409
+ "step": 35000,
1410
+ "swisstext21_eval_bleu": 0.3025072689523983
1411
+ },
1412
+ {
1413
+ "epoch": 1.5636783036862885,
1414
+ "grad_norm": 2.3080224990844727,
1415
+ "learning_rate": 0.0001882307501258529,
1416
+ "loss": 0.7835,
1417
+ "step": 35250
1418
+ },
1419
+ {
1420
+ "epoch": 1.5747682207337088,
1421
+ "grad_norm": 3.082775831222534,
1422
+ "learning_rate": 0.0001880661590945657,
1423
+ "loss": 0.8369,
1424
+ "step": 35500
1425
+ },
1426
+ {
1427
+ "epoch": 1.5747682207337088,
1428
+ "eval_bleu": 0.4048045366265125,
1429
+ "step": 35500,
1430
+ "swisstext21_eval_bleu": 0.3005292082411897
1431
+ },
1432
+ {
1433
+ "epoch": 1.5858581377811294,
1434
+ "grad_norm": 2.1708056926727295,
1435
+ "learning_rate": 0.00018790049814564527,
1436
+ "loss": 0.7855,
1437
+ "step": 35750
1438
+ },
1439
+ {
1440
+ "epoch": 1.59694805482855,
1441
+ "grad_norm": 3.3110837936401367,
1442
+ "learning_rate": 0.00018773376929171005,
1443
+ "loss": 0.8195,
1444
+ "step": 36000
1445
+ },
1446
+ {
1447
+ "epoch": 1.59694805482855,
1448
+ "eval_bleu": 0.40120545805677965,
1449
+ "step": 36000,
1450
+ "swisstext21_eval_bleu": 0.30819612905632277
1451
+ },
1452
+ {
1453
+ "epoch": 1.6080379718759703,
1454
+ "grad_norm": 4.1916632652282715,
1455
+ "learning_rate": 0.0001875659745583524,
1456
+ "loss": 0.8054,
1457
+ "step": 36250
1458
+ },
1459
+ {
1460
+ "epoch": 1.6191278889233909,
1461
+ "grad_norm": 3.0245726108551025,
1462
+ "learning_rate": 0.00018739711598411406,
1463
+ "loss": 0.794,
1464
+ "step": 36500
1465
+ },
1466
+ {
1467
+ "epoch": 1.6191278889233909,
1468
+ "eval_bleu": 0.3979401149463107,
1469
+ "step": 36500,
1470
+ "swisstext21_eval_bleu": 0.3101184244616529
1471
+ },
1472
+ {
1473
+ "epoch": 1.6302178059708115,
1474
+ "grad_norm": 2.3407862186431885,
1475
+ "learning_rate": 0.00018722719562046141,
1476
+ "loss": 0.7961,
1477
+ "step": 36750
1478
+ },
1479
+ {
1480
+ "epoch": 1.6413077230182318,
1481
+ "grad_norm": 2.649019956588745,
1482
+ "learning_rate": 0.00018705621553176066,
1483
+ "loss": 0.787,
1484
+ "step": 37000
1485
+ },
1486
+ {
1487
+ "epoch": 1.6413077230182318,
1488
+ "eval_bleu": 0.4092604945594542,
1489
+ "step": 37000,
1490
+ "swisstext21_eval_bleu": 0.3129861922910357
1491
+ },
1492
+ {
1493
+ "epoch": 1.6523976400656522,
1494
+ "grad_norm": 3.1469483375549316,
1495
+ "learning_rate": 0.0001868841777952524,
1496
+ "loss": 0.8151,
1497
+ "step": 37250
1498
+ },
1499
+ {
1500
+ "epoch": 1.663487557113073,
1501
+ "grad_norm": 4.49592924118042,
1502
+ "learning_rate": 0.00018671108450102678,
1503
+ "loss": 0.7961,
1504
+ "step": 37500
1505
+ },
1506
+ {
1507
+ "epoch": 1.663487557113073,
1508
+ "eval_bleu": 0.40106080170968306,
1509
+ "step": 37500,
1510
+ "swisstext21_eval_bleu": 0.31557875920893397
1511
+ },
1512
+ {
1513
+ "epoch": 1.6745774741604933,
1514
+ "grad_norm": 2.7781357765197754,
1515
+ "learning_rate": 0.00018653693775199797,
1516
+ "loss": 0.8111,
1517
+ "step": 37750
1518
+ },
1519
+ {
1520
+ "epoch": 1.6856673912079136,
1521
+ "grad_norm": 3.807218313217163,
1522
+ "learning_rate": 0.00018636173966387842,
1523
+ "loss": 0.8126,
1524
+ "step": 38000
1525
+ },
1526
+ {
1527
+ "epoch": 1.6856673912079136,
1528
+ "eval_bleu": 0.4042668150834047,
1529
+ "step": 38000,
1530
+ "swisstext21_eval_bleu": 0.3078849028439319
1531
+ },
1532
+ {
1533
+ "epoch": 1.6967573082553342,
1534
+ "grad_norm": 3.1599435806274414,
1535
+ "learning_rate": 0.00018618549236515346,
1536
+ "loss": 0.7728,
1537
+ "step": 38250
1538
+ },
1539
+ {
1540
+ "epoch": 1.7078472253027548,
1541
+ "grad_norm": 2.7357876300811768,
1542
+ "learning_rate": 0.00018600819799705523,
1543
+ "loss": 0.7825,
1544
+ "step": 38500
1545
+ },
1546
+ {
1547
+ "epoch": 1.7078472253027548,
1548
+ "eval_bleu": 0.40671098856376503,
1549
+ "step": 38500,
1550
+ "swisstext21_eval_bleu": 0.30567470776676137
1551
+ },
1552
+ {
1553
+ "epoch": 1.7189371423501751,
1554
+ "grad_norm": 2.384441375732422,
1555
+ "learning_rate": 0.00018582985871353684,
1556
+ "loss": 0.7965,
1557
+ "step": 38750
1558
+ },
1559
+ {
1560
+ "epoch": 1.7300270593975957,
1561
+ "grad_norm": 2.4744198322296143,
1562
+ "learning_rate": 0.00018565047668124592,
1563
+ "loss": 0.8038,
1564
+ "step": 39000
1565
+ },
1566
+ {
1567
+ "epoch": 1.7300270593975957,
1568
+ "eval_bleu": 0.4112997039832828,
1569
+ "step": 39000,
1570
+ "swisstext21_eval_bleu": 0.31252497596244544
1571
+ },
1572
+ {
1573
+ "epoch": 1.7411169764450163,
1574
+ "grad_norm": 3.300543785095215,
1575
+ "learning_rate": 0.00018547005407949866,
1576
+ "loss": 0.8184,
1577
+ "step": 39250
1578
+ },
1579
+ {
1580
+ "epoch": 1.7522068934924366,
1581
+ "grad_norm": 3.401453733444214,
1582
+ "learning_rate": 0.00018528859310025305,
1583
+ "loss": 0.7883,
1584
+ "step": 39500
1585
+ },
1586
+ {
1587
+ "epoch": 1.7522068934924366,
1588
+ "eval_bleu": 0.403520670403573,
1589
+ "step": 39500,
1590
+ "swisstext21_eval_bleu": 0.31147494554557736
1591
+ },
1592
+ {
1593
+ "epoch": 1.7632968105398572,
1594
+ "grad_norm": 2.518880844116211,
1595
+ "learning_rate": 0.00018510609594808234,
1596
+ "loss": 0.7991,
1597
+ "step": 39750
1598
+ },
1599
+ {
1600
+ "epoch": 1.7743867275872778,
1601
+ "grad_norm": 1.9717811346054077,
1602
+ "learning_rate": 0.00018492256484014833,
1603
+ "loss": 0.7988,
1604
+ "step": 40000
1605
+ },
1606
+ {
1607
+ "epoch": 1.7743867275872778,
1608
+ "eval_bleu": 0.4059971537992098,
1609
+ "step": 40000,
1610
+ "swisstext21_eval_bleu": 0.3025584886012487
1611
+ },
1612
+ {
1613
+ "epoch": 1.7854766446346981,
1614
+ "grad_norm": 2.906183958053589,
1615
+ "learning_rate": 0.00018473800200617438,
1616
+ "loss": 0.7776,
1617
+ "step": 40250
1618
+ },
1619
+ {
1620
+ "epoch": 1.7965665616821185,
1621
+ "grad_norm": 2.767038106918335,
1622
+ "learning_rate": 0.0001845524096884182,
1623
+ "loss": 0.7906,
1624
+ "step": 40500
1625
+ },
1626
+ {
1627
+ "epoch": 1.7965665616821185,
1628
+ "eval_bleu": 0.40377715553875726,
1629
+ "step": 40500,
1630
+ "swisstext21_eval_bleu": 0.3042143768393741
1631
+ },
1632
+ {
1633
+ "epoch": 1.807656478729539,
1634
+ "grad_norm": 2.921994686126709,
1635
+ "learning_rate": 0.00018436579014164482,
1636
+ "loss": 0.8264,
1637
+ "step": 40750
1638
+ },
1639
+ {
1640
+ "epoch": 1.8187463957769596,
1641
+ "grad_norm": 1.9619659185409546,
1642
+ "learning_rate": 0.00018417814563309904,
1643
+ "loss": 0.8021,
1644
+ "step": 41000
1645
+ },
1646
+ {
1647
+ "epoch": 1.8187463957769596,
1648
+ "eval_bleu": 0.4111345004981505,
1649
+ "step": 41000,
1650
+ "swisstext21_eval_bleu": 0.31566120485475907
1651
+ },
1652
+ {
1653
+ "epoch": 1.82983631282438,
1654
+ "grad_norm": 2.688190221786499,
1655
+ "learning_rate": 0.00018398947844247804,
1656
+ "loss": 0.783,
1657
+ "step": 41250
1658
+ },
1659
+ {
1660
+ "epoch": 1.8409262298718005,
1661
+ "grad_norm": 3.442605972290039,
1662
+ "learning_rate": 0.00018379979086190345,
1663
+ "loss": 0.792,
1664
+ "step": 41500
1665
+ },
1666
+ {
1667
+ "epoch": 1.8409262298718005,
1668
+ "eval_bleu": 0.4071142980360359,
1669
+ "step": 41500,
1670
+ "swisstext21_eval_bleu": 0.30145837573503925
1671
+ },
1672
+ {
1673
+ "epoch": 1.8520161469192211,
1674
+ "grad_norm": 2.305250883102417,
1675
+ "learning_rate": 0.00018360908519589374,
1676
+ "loss": 0.7867,
1677
+ "step": 41750
1678
+ },
1679
+ {
1680
+ "epoch": 1.8631060639666415,
1681
+ "grad_norm": 3.7068357467651367,
1682
+ "learning_rate": 0.00018341736376133606,
1683
+ "loss": 0.7836,
1684
+ "step": 42000
1685
+ },
1686
+ {
1687
+ "epoch": 1.8631060639666415,
1688
+ "eval_bleu": 0.4077189407900903,
1689
+ "step": 42000,
1690
+ "swisstext21_eval_bleu": 0.31436959073853826
1691
+ },
1692
+ {
1693
+ "epoch": 1.874195981014062,
1694
+ "grad_norm": 2.5153627395629883,
1695
+ "learning_rate": 0.00018322462888745826,
1696
+ "loss": 0.7971,
1697
+ "step": 42250
1698
+ },
1699
+ {
1700
+ "epoch": 1.8852858980614826,
1701
+ "grad_norm": 2.1864585876464844,
1702
+ "learning_rate": 0.0001830308829158003,
1703
+ "loss": 0.7813,
1704
+ "step": 42500
1705
+ },
1706
+ {
1707
+ "epoch": 1.8852858980614826,
1708
+ "eval_bleu": 0.41077499870623996,
1709
+ "step": 42500,
1710
+ "swisstext21_eval_bleu": 0.3073002204386824
1711
+ },
1712
+ {
1713
+ "epoch": 1.896375815108903,
1714
+ "grad_norm": 2.703538179397583,
1715
+ "learning_rate": 0.00018283612820018617,
1716
+ "loss": 0.8096,
1717
+ "step": 42750
1718
+ },
1719
+ {
1720
+ "epoch": 1.9074657321563233,
1721
+ "grad_norm": 2.311037302017212,
1722
+ "learning_rate": 0.000182640367106695,
1723
+ "loss": 0.7695,
1724
+ "step": 43000
1725
+ },
1726
+ {
1727
+ "epoch": 1.9074657321563233,
1728
+ "eval_bleu": 0.4070896841434834,
1729
+ "step": 43000,
1730
+ "swisstext21_eval_bleu": 0.3008197269713232
1731
+ },
1732
+ {
1733
+ "epoch": 1.918555649203744,
1734
+ "grad_norm": 4.235742568969727,
1735
+ "learning_rate": 0.00018244360201363248,
1736
+ "loss": 0.7986,
1737
+ "step": 43250
1738
+ },
1739
+ {
1740
+ "epoch": 1.9296455662511645,
1741
+ "grad_norm": 2.413109540939331,
1742
+ "learning_rate": 0.0001822458353115019,
1743
+ "loss": 0.7802,
1744
+ "step": 43500
1745
+ },
1746
+ {
1747
+ "epoch": 1.9296455662511645,
1748
+ "eval_bleu": 0.3980846213733118,
1749
+ "step": 43500,
1750
+ "swisstext21_eval_bleu": 0.3008543870053332
1751
+ },
1752
+ {
1753
+ "epoch": 1.9407354832985848,
1754
+ "grad_norm": 2.620948314666748,
1755
+ "learning_rate": 0.00018204706940297508,
1756
+ "loss": 0.7653,
1757
+ "step": 43750
1758
+ },
1759
+ {
1760
+ "epoch": 1.9518254003460054,
1761
+ "grad_norm": 2.820128917694092,
1762
+ "learning_rate": 0.0001818473067028633,
1763
+ "loss": 0.7574,
1764
+ "step": 44000
1765
+ },
1766
+ {
1767
+ "epoch": 1.9518254003460054,
1768
+ "eval_bleu": 0.39885615292674625,
1769
+ "step": 44000,
1770
+ "swisstext21_eval_bleu": 0.2930218556668886
1771
+ },
1772
+ {
1773
+ "epoch": 1.962915317393426,
1774
+ "grad_norm": 2.394529342651367,
1775
+ "learning_rate": 0.00018164654963808783,
1776
+ "loss": 0.7538,
1777
+ "step": 44250
1778
+ },
1779
+ {
1780
+ "epoch": 1.9740052344408463,
1781
+ "grad_norm": 2.639223098754883,
1782
+ "learning_rate": 0.0001814448006476505,
1783
+ "loss": 0.8031,
1784
+ "step": 44500
1785
+ },
1786
+ {
1787
+ "epoch": 1.9740052344408463,
1788
+ "eval_bleu": 0.3975877229366723,
1789
+ "step": 44500,
1790
+ "swisstext21_eval_bleu": 0.3035613872071041
1791
+ },
1792
+ {
1793
+ "epoch": 1.9850951514882669,
1794
+ "grad_norm": 2.605180501937866,
1795
+ "learning_rate": 0.00018124206218260403,
1796
+ "loss": 0.7791,
1797
+ "step": 44750
1798
+ },
1799
+ {
1800
+ "epoch": 1.9961850685356874,
1801
+ "grad_norm": 3.3844354152679443,
1802
+ "learning_rate": 0.0001810383367060224,
1803
+ "loss": 0.7888,
1804
+ "step": 45000
1805
+ },
1806
+ {
1807
+ "epoch": 1.9961850685356874,
1808
+ "eval_bleu": 0.39963110189053974,
1809
+ "step": 45000,
1810
+ "swisstext21_eval_bleu": 0.3032512226222924
1811
+ },
1812
+ {
1813
+ "epoch": 2.007274985583108,
1814
+ "grad_norm": 2.150728225708008,
1815
+ "learning_rate": 0.0001808336266929707,
1816
+ "loss": 0.7689,
1817
+ "step": 45250
1818
+ },
1819
+ {
1820
+ "epoch": 2.018364902630528,
1821
+ "grad_norm": 2.487170696258545,
1822
+ "learning_rate": 0.00018062793463047522,
1823
+ "loss": 0.7396,
1824
+ "step": 45500
1825
+ },
1826
+ {
1827
+ "epoch": 2.018364902630528,
1828
+ "eval_bleu": 0.40990702580569705,
1829
+ "step": 45500,
1830
+ "swisstext21_eval_bleu": 0.30612631850382593
1831
+ },
1832
+ {
1833
+ "epoch": 2.029454819677949,
1834
+ "grad_norm": 2.625516891479492,
1835
+ "learning_rate": 0.00018042126301749315,
1836
+ "loss": 0.7552,
1837
+ "step": 45750
1838
+ },
1839
+ {
1840
+ "epoch": 2.0405447367253693,
1841
+ "grad_norm": 2.603152275085449,
1842
+ "learning_rate": 0.0001802136143648824,
1843
+ "loss": 0.7663,
1844
+ "step": 46000
1845
+ },
1846
+ {
1847
+ "epoch": 2.0405447367253693,
1848
+ "eval_bleu": 0.40981514096912186,
1849
+ "step": 46000,
1850
+ "swisstext21_eval_bleu": 0.31343081754143665
1851
+ },
1852
+ {
1853
+ "epoch": 2.0516346537727896,
1854
+ "grad_norm": 3.043926239013672,
1855
+ "learning_rate": 0.00018000499119537073,
1856
+ "loss": 0.756,
1857
+ "step": 46250
1858
+ },
1859
+ {
1860
+ "epoch": 2.0627245708202104,
1861
+ "grad_norm": 3.889756202697754,
1862
+ "learning_rate": 0.00017979539604352554,
1863
+ "loss": 0.7379,
1864
+ "step": 46500
1865
+ },
1866
+ {
1867
+ "epoch": 2.0627245708202104,
1868
+ "eval_bleu": 0.4098169578968794,
1869
+ "step": 46500,
1870
+ "swisstext21_eval_bleu": 0.30999706188596005
1871
+ },
1872
+ {
1873
+ "epoch": 2.073814487867631,
1874
+ "grad_norm": 3.5386924743652344,
1875
+ "learning_rate": 0.0001795848314557227,
1876
+ "loss": 0.7438,
1877
+ "step": 46750
1878
+ },
1879
+ {
1880
+ "epoch": 2.084904404915051,
1881
+ "grad_norm": 2.6188015937805176,
1882
+ "learning_rate": 0.00017937329999011592,
1883
+ "loss": 0.7797,
1884
+ "step": 47000
1885
+ },
1886
+ {
1887
+ "epoch": 2.084904404915051,
1888
+ "eval_bleu": 0.41246128940763466,
1889
+ "step": 47000,
1890
+ "swisstext21_eval_bleu": 0.30702184204438654
1891
+ },
1892
+ {
1893
+ "epoch": 2.095994321962472,
1894
+ "grad_norm": 2.863593578338623,
1895
+ "learning_rate": 0.0001791608042166054,
1896
+ "loss": 0.7603,
1897
+ "step": 47250
1898
+ },
1899
+ {
1900
+ "epoch": 2.1070842390098923,
1901
+ "grad_norm": 2.4948580265045166,
1902
+ "learning_rate": 0.00017894734671680684,
1903
+ "loss": 0.7707,
1904
+ "step": 47500
1905
+ },
1906
+ {
1907
+ "epoch": 2.1070842390098923,
1908
+ "eval_bleu": 0.40171245802431144,
1909
+ "step": 47500,
1910
+ "swisstext21_eval_bleu": 0.3080346062801903
1911
+ },
1912
+ {
1913
+ "epoch": 2.1181741560573126,
1914
+ "grad_norm": 4.000399589538574,
1915
+ "learning_rate": 0.0001787329300840199,
1916
+ "loss": 0.7699,
1917
+ "step": 47750
1918
+ },
1919
+ {
1920
+ "epoch": 2.129264073104733,
1921
+ "grad_norm": 3.1941723823547363,
1922
+ "learning_rate": 0.0001785175569231968,
1923
+ "loss": 0.7605,
1924
+ "step": 48000
1925
+ }
1926
+ ],
1927
+ "logging_steps": 250,
1928
+ "max_steps": 225430,
1929
+ "num_input_tokens_seen": 0,
1930
+ "num_train_epochs": 10,
1931
+ "save_steps": 500,
1932
+ "stateful_callbacks": {
1933
+ "EpochAwareEarlyStoppingCallback": {
1934
+ "args": {
1935
+ "early_stopping_patience": 3,
1936
+ "early_stopping_threshold": 0.0
1937
+ },
1938
+ "attributes": {
1939
+ "early_stopping_patience_counter": 0
1940
+ }
1941
+ },
1942
+ "TrainerControl": {
1943
+ "args": {
1944
+ "should_epoch_stop": false,
1945
+ "should_evaluate": true,
1946
+ "should_log": false,
1947
+ "should_save": true,
1948
+ "should_training_stop": false
1949
+ },
1950
+ "attributes": {}
1951
+ }
1952
+ },
1953
+ "total_flos": 2.555800421400576e+19,
1954
+ "train_batch_size": 8,
1955
+ "trial_name": null,
1956
+ "trial_params": null
1957
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d40040bf5b7766803b5c0ad132eedacb64fb9db2f6ca25d053d4a786309f6e7e
3
+ size 5905