harryleafchen committed on
Commit 98d65cd · verified · 1 Parent(s): d98ebe2

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "architectures": [
+     "Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_qwen2.Qwen2Config",
+     "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
+   },
+   "bos_token_id": 151645,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 6144,
+   "max_position_embeddings": 131072,
+   "max_window_layers": 28,
+   "model_type": "qwen2",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 8,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "sliding_window": 131072,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.50.0",
+   "use_cache": true,
+   "use_sliding_window": false,
+   "vocab_size": 151936
+ }
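Because config.json declares an `auto_map` pointing at the bundled configuration_qwen2.py and modeling_qwen2.py, the checkpoint is meant to be loaded with `trust_remote_code=True`. A minimal loading sketch; the repository id is a placeholder, not taken from this commit:

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "harryleafchen/<this-repo>"  # hypothetical repo id, substitute the actual Hub path

# trust_remote_code=True lets transformers resolve the "auto_map" entries above to the
# configuration_qwen2.py / modeling_qwen2.py files uploaded in this commit.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
```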
configuration_qwen2.py ADDED
@@ -0,0 +1,201 @@
+ # coding=utf-8
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Qwen2 model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.modeling_rope_utils import rope_config_validation
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class Qwen2Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
+     Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of
+     Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 151936):
+             Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`Qwen2Model`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 22016):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_key_value_heads (`int`, *optional*, defaults to 32):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details checkout [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 32768):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+             and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+             accordingly.
+             Expected contents:
+                 `rope_type` (`str`):
+                     The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                     'llama3'], with 'default' being the original RoPE implementation.
+                 `factor` (`float`, *optional*):
+                     Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                     most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                     original maximum pre-trained length.
+                 `original_max_position_embeddings` (`int`, *optional*):
+                     Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                     pretraining.
+                 `attention_factor` (`float`, *optional*):
+                     Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                     computation. If unspecified, it defaults to value recommended by the implementation, using the
+                     `factor` field to infer the suggested value.
+                 `beta_fast` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 32.
+                 `beta_slow` (`float`, *optional*):
+                     Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                     ramp function. If unspecified, it defaults to 1.
+                 `short_factor` (`List[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2
+                 `long_factor` (`List[float]`, *optional*):
+                     Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                     `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                     size divided by the number of attention heads divided by 2
+                 `low_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                 `high_freq_factor` (`float`, *optional*):
+                     Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+         use_sliding_window (`bool`, *optional*, defaults to `False`):
+             Whether to use sliding window attention.
+         sliding_window (`int`, *optional*, defaults to 4096):
+             Sliding window attention (SWA) window size. If not specified, will default to `4096`.
+         max_window_layers (`int`, *optional*, defaults to 28):
+             The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from transformers import Qwen2Model, Qwen2Config
+
+     >>> # Initializing a Qwen2 style configuration
+     >>> configuration = Qwen2Config()
+
+     >>> # Initializing a model from the Qwen2-7B style configuration
+     >>> model = Qwen2Model(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "qwen2"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     # Default tensor parallel plan for base model `Qwen2`
+     base_model_tp_plan = {
+         "layers.*.self_attn.q_proj": "colwise",
+         "layers.*.self_attn.k_proj": "colwise",
+         "layers.*.self_attn.v_proj": "colwise",
+         "layers.*.self_attn.o_proj": "rowwise",
+         "layers.*.mlp.gate_proj": "colwise",
+         "layers.*.mlp.up_proj": "colwise",
+         "layers.*.mlp.down_proj": "rowwise",
+     }
+     base_model_pp_plan = {
+         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+         "norm": (["hidden_states"], ["hidden_states"]),
+     }
+
+     def __init__(
+         self,
+         vocab_size=151936,
+         hidden_size=4096,
+         intermediate_size=22016,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=32,
+         hidden_act="silu",
+         max_position_embeddings=32768,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         use_sliding_window=False,
+         sliding_window=4096,
+         max_window_layers=28,
+         attention_dropout=0.0,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.use_sliding_window = use_sliding_window
+         self.sliding_window = sliding_window  # we check `use_sliding_window` in the modeling code
+         self.max_window_layers = max_window_layers
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_dropout = attention_dropout
+         # Validate the correctness of rotary position embeddings parameters
+         # BC: if there is a 'type' field, move it to 'rope_type'.
+         if self.rope_scaling is not None and "type" in self.rope_scaling:
+             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+         rope_config_validation(self)
+
+         super().__init__(
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
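For reference, the class defaults above describe a 7B-scale model, while this checkpoint's config.json overrides them with a smaller geometry. A sketch of instantiating `Qwen2Config` directly with the shipped values (it assumes configuration_qwen2.py is importable from the working directory):

```python
# Sketch: build the config with the values this repo ships in config.json.
from configuration_qwen2 import Qwen2Config  # assumes the file above is on the import path

config = Qwen2Config(
    vocab_size=151936,
    hidden_size=2048,
    intermediate_size=6144,
    num_hidden_layers=28,
    num_attention_heads=16,
    num_key_value_heads=8,
    max_position_embeddings=131072,
    rope_theta=10000.0,
    use_sliding_window=False,
    tie_word_embeddings=False,
)

print(config.model_type)                                  # "qwen2"
print(config.hidden_size // config.num_attention_heads)   # per-head dimension: 128
```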
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "max_new_tokens": 2048,
+   "transformers_version": "4.50.0"
+ }
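These values become the model's default `GenerationConfig`, so `generate()` stops after at most 2048 new tokens unless overridden. Note that this file uses token id 151643 for bos/eos while config.json uses 151645, so passing the intended `eos_token_id` explicitly may be safer. A short sketch (placeholder repo id):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "harryleafchen/<this-repo>"  # placeholder repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Hello", return_tensors="pt")
# max_new_tokens=2048 would be picked up from generation_config.json by default;
# passing it (and the eos id) explicitly makes the behaviour unambiguous.
out = model.generate(**inputs, max_new_tokens=2048, eos_token_id=151643)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```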
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88426d1a19485b78a7a26f4a2df9deb455a8a616408ed366152c67c6ca7b9f19
+ size 4063751408
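What is committed here is a Git LFS pointer file, not the weights themselves; the actual model.safetensors blob (about 4.06 GB) is fetched by LFS. A small sketch for checking a downloaded copy against the pointer's oid and size:

```python
# Sketch: verify a downloaded model.safetensors against the LFS pointer above.
import hashlib
import os

expected_sha256 = "88426d1a19485b78a7a26f4a2df9deb455a8a616408ed366152c67c6ca7b9f19"
expected_size = 4063751408

path = "model.safetensors"
assert os.path.getsize(path) == expected_size, "size does not match the LFS pointer"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected_sha256, "sha256 does not match the LFS pointer"
print("checksum and size match the LFS pointer")
```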
modeling_qwen2.py ADDED
@@ -0,0 +1,1170 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # this file was adapted from transformers/models/qwen2/modeling_qwen2.py
3
+
4
+ from typing import Callable, List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ from torch import nn
8
+
9
+ from transformers.activations import ACT2FN
10
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
11
+ from transformers.generation import GenerationMixin
12
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
13
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
14
+ from transformers.modeling_outputs import (
15
+ BaseModelOutputWithPast,
16
+ CausalLMOutputWithPast,
17
+ QuestionAnsweringModelOutput,
18
+ SequenceClassifierOutputWithPast,
19
+ TokenClassifierOutput,
20
+ )
21
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
22
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
23
+ from transformers.processing_utils import Unpack
24
+ from transformers.utils import (
25
+ add_code_sample_docstrings,
26
+ add_start_docstrings,
27
+ add_start_docstrings_to_model_forward,
28
+ logging,
29
+ replace_return_docstrings,
30
+ )
31
+ from transformers.utils.deprecation import deprecate_kwarg
32
+ from .configuration_qwen2 import Qwen2Config
33
+
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+ _CHECKPOINT_FOR_DOC = "meta-qwen2/Qwen2-2-7b-hf"
38
+ _CONFIG_FOR_DOC = "Qwen2Config"
39
+
40
+
41
+ class Qwen2MLP(nn.Module):
42
+ def __init__(self, config):
43
+ super().__init__()
44
+ self.config = config
45
+ self.hidden_size = config.hidden_size
46
+ self.intermediate_size = config.intermediate_size
47
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
48
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
49
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
50
+ self.act_fn = ACT2FN[config.hidden_act]
51
+
52
+ def forward(self, x):
53
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
54
+ return down_proj
55
+
56
+
57
+ def rotate_half(x):
58
+ """Rotates half the hidden dims of the input."""
59
+ x1 = x[..., : x.shape[-1] // 2]
60
+ x2 = x[..., x.shape[-1] // 2 :]
61
+ return torch.cat((-x2, x1), dim=-1)
62
+
63
+
64
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
65
+ """Applies Rotary Position Embedding to the query and key tensors.
66
+
67
+ Args:
68
+ q (`torch.Tensor`): The query tensor.
69
+ k (`torch.Tensor`): The key tensor.
70
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
71
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
72
+ position_ids (`torch.Tensor`, *optional*):
73
+ Deprecated and unused.
74
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
75
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
76
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
77
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
78
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
79
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
80
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
81
+ Returns:
82
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
83
+ """
84
+ cos = cos.unsqueeze(unsqueeze_dim)
85
+ sin = sin.unsqueeze(unsqueeze_dim)
86
+ q_embed = (q * cos) + (rotate_half(q) * sin)
87
+ k_embed = (k * cos) + (rotate_half(k) * sin)
88
+ return q_embed, k_embed
89
+
90
+
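A small, self-contained sketch of what `rotate_half` and `apply_rotary_pos_emb` above do to dummy query/key tensors. The shapes follow the `[batch, heads, seq, head_dim]` convention used in this file, the head counts match this checkpoint (16 query heads, 8 KV heads, head_dim 128), and the cos/sin construction mirrors what `Qwen2RotaryEmbedding` computes for the "default" rope type with `rope_theta=10000.0`:

```python
# Sketch; assumes rotate_half / apply_rotary_pos_emb defined above are importable.
import torch

batch, seq, head_dim = 1, 8, 128
q = torch.randn(batch, 16, seq, head_dim)  # 16 query heads in this checkpoint
k = torch.randn(batch, 8, seq, head_dim)   # 8 key/value heads in this checkpoint

# Build cos/sin the way the rotary embedding does for the "default" rope type.
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
positions = torch.arange(seq).float()
freqs = torch.outer(positions, inv_freq)        # [seq, head_dim // 2]
emb = torch.cat((freqs, freqs), dim=-1)         # [seq, head_dim]
cos, sin = emb.cos()[None, :, :], emb.sin()[None, :, :]  # add a batch dimension

# unsqueeze_dim=1 broadcasts cos/sin over the heads dimension of q and k.
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)
assert q_rot.shape == q.shape and k_rot.shape == k.shape
```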
91
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
92
+ """
93
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
94
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
95
+ """
96
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
97
+ if n_rep == 1:
98
+ return hidden_states
99
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
100
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
101
+
102
+
103
+ def eager_attention_forward(
104
+ module: nn.Module,
105
+ query: torch.Tensor,
106
+ key: torch.Tensor,
107
+ value: torch.Tensor,
108
+ attention_mask: Optional[torch.Tensor],
109
+ scaling: float,
110
+ dropout: float = 0.0,
111
+ **kwargs,
112
+ ):
113
+ key_states = repeat_kv(key, module.num_key_value_groups)
114
+ value_states = repeat_kv(value, module.num_key_value_groups)
115
+
116
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
117
+ if attention_mask is not None:
118
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
119
+ attn_weights = attn_weights + causal_mask
120
+
121
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
122
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
123
+ attn_output = torch.matmul(attn_weights, value_states)
124
+ attn_output = attn_output.transpose(1, 2).contiguous()
125
+
126
+ return attn_output, attn_weights
127
+
128
+
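With this checkpoint's head layout (16 query heads, 8 key/value heads, head_dim 128), `repeat_kv` duplicates each KV head `n_rep = 2` times so the eager path can run a plain batched matmul. A toy sketch of the shape bookkeeping in `eager_attention_forward`:

```python
# Sketch; assumes the repeat_kv defined above is importable.
import torch

batch, seq, head_dim = 1, 8, 128
num_q_heads, num_kv_heads = 16, 8
n_rep = num_q_heads // num_kv_heads  # 2

query = torch.randn(batch, num_q_heads, seq, head_dim)
key = torch.randn(batch, num_kv_heads, seq, head_dim)
value = torch.randn(batch, num_kv_heads, seq, head_dim)

key_states = repeat_kv(key, n_rep)      # -> [1, 16, 8, 128]
value_states = repeat_kv(value, n_rep)  # -> [1, 16, 8, 128]

scaling = head_dim ** -0.5
attn = torch.softmax(torch.matmul(query, key_states.transpose(2, 3)) * scaling, dim=-1)
out = torch.matmul(attn, value_states)  # -> [1, 16, 8, 128]; the caller transposes and reshapes
print(key_states.shape, out.shape)
```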
129
+ class Qwen2Attention(nn.Module):
130
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
131
+
132
+ def __init__(self, config: Qwen2Config, layer_idx: int):
133
+ super().__init__()
134
+ self.config = config
135
+ self.layer_idx = layer_idx
136
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
137
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
138
+ self.scaling = self.head_dim**-0.5
139
+ self.attention_dropout = config.attention_dropout
140
+ self.is_causal = True
141
+ self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
142
+ self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
143
+ self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
144
+ self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
145
+ self.q_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
146
+ self.k_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
147
+
148
+ def forward(
149
+ self,
150
+ hidden_states: torch.Tensor,
151
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
152
+ attention_mask: Optional[torch.Tensor],
153
+ past_key_value: Optional[Cache] = None,
154
+ cache_position: Optional[torch.LongTensor] = None,
155
+ **kwargs: Unpack[FlashAttentionKwargs],
156
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
157
+ input_shape = hidden_states.shape[:-1]
158
+ hidden_shape = (*input_shape, -1, self.head_dim)
159
+
160
+ query_states = self.q_proj(hidden_states).view(hidden_shape)
161
+ query_states = self.q_norm(query_states).transpose(1, 2)
162
+ key_states = self.k_proj(hidden_states).view(hidden_shape)
163
+ key_states = self.k_norm(key_states).transpose(1, 2)
164
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
165
+
166
+ cos, sin = position_embeddings
167
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
168
+
169
+ if past_key_value is not None:
170
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
171
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
172
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
173
+
174
+ sliding_window = None
175
+ if (
176
+ self.config.use_sliding_window
177
+ and getattr(self.config, "sliding_window", None) is not None
178
+ and self.layer_idx >= self.config.max_window_layers
179
+ ):
180
+ sliding_window = self.config.sliding_window
181
+
182
+ attention_interface: Callable = eager_attention_forward
183
+ if self.config._attn_implementation != "eager":
184
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
185
+ logger.warning_once(
186
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
187
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
188
+ )
189
+ else:
190
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
191
+
192
+ attn_output, attn_weights = attention_interface(
193
+ self,
194
+ query_states,
195
+ key_states,
196
+ value_states,
197
+ attention_mask,
198
+ dropout=0.0 if not self.training else self.attention_dropout,
199
+ scaling=self.scaling,
200
+ sliding_window=sliding_window, # main diff with Llama
201
+ **kwargs,
202
+ )
203
+
204
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
205
+ attn_output = self.o_proj(attn_output)
206
+ return attn_output, attn_weights
207
+
208
+
209
+ class Qwen2RMSNorm(nn.Module):
210
+ def __init__(self, hidden_size, eps=1e-6):
211
+ """
212
+ Qwen2RMSNorm is equivalent to T5LayerNorm
213
+ """
214
+ super().__init__()
215
+ self.weight = nn.Parameter(torch.ones(hidden_size))
216
+ self.variance_epsilon = eps
217
+
218
+ def forward(self, hidden_states):
219
+ input_dtype = hidden_states.dtype
220
+ hidden_states = hidden_states.to(torch.float32)
221
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
222
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
223
+ return self.weight * hidden_states.to(input_dtype)
224
+
225
+ def extra_repr(self):
226
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
227
+
228
+
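The RMSNorm above normalizes each hidden vector by its root-mean-square (no mean subtraction, unlike LayerNorm) and then applies a learned per-channel scale. A quick numeric check of the same formula:

```python
# Sketch: the RMSNorm computation from the class above, written out directly.
import torch

hidden_size, eps = 2048, 1e-6
x = torch.randn(2, 5, hidden_size)
weight = torch.ones(hidden_size)  # freshly initialized weight, as in _init_weights

variance = x.pow(2).mean(-1, keepdim=True)   # mean of squares, not the true variance
out = weight * (x * torch.rsqrt(variance + eps))

# Each position now has (approximately) unit root-mean-square.
rms = out.pow(2).mean(-1).sqrt()
print(rms.min().item(), rms.max().item())    # both close to 1.0
```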
229
+ class Qwen2DecoderLayer(nn.Module):
230
+ def __init__(self, config: Qwen2Config, layer_idx: int):
231
+ super().__init__()
232
+ self.hidden_size = config.hidden_size
233
+ self.self_attn = Qwen2Attention(config=config, layer_idx=layer_idx)
234
+ self.mlp = Qwen2MLP(config)
235
+ self.attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
236
+ self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
237
+ self.ffn_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
238
+ self.post_ffn_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
239
+ if config.sliding_window and config._attn_implementation != "flash_attention_2":
240
+ logger.warning_once(
241
+ f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
242
+ "unexpected results may be encountered."
243
+ )
244
+
245
+ def forward(
246
+ self,
247
+ hidden_states: torch.Tensor,
248
+ attention_mask: Optional[torch.Tensor] = None,
249
+ position_ids: Optional[torch.LongTensor] = None,
250
+ past_key_value: Optional[Cache] = None,
251
+ output_attentions: Optional[bool] = False,
252
+ use_cache: Optional[bool] = False,
253
+ cache_position: Optional[torch.LongTensor] = None,
254
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
255
+ **kwargs: Unpack[FlashAttentionKwargs],
256
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
257
+ residual = hidden_states
258
+
259
+ hidden_states = self.attention_layernorm(hidden_states)
260
+
261
+ # Self Attention
262
+ hidden_states, self_attn_weights = self.self_attn(
263
+ hidden_states=hidden_states,
264
+ attention_mask=attention_mask,
265
+ position_ids=position_ids,
266
+ past_key_value=past_key_value,
267
+ output_attentions=output_attentions,
268
+ use_cache=use_cache,
269
+ cache_position=cache_position,
270
+ position_embeddings=position_embeddings,
271
+ **kwargs,
272
+ )
273
+ hidden_states = self.post_attention_layernorm(hidden_states)
274
+ hidden_states = residual + hidden_states
275
+
276
+ # Fully Connected
277
+ residual = hidden_states
278
+ hidden_states = self.ffn_layernorm(hidden_states)
279
+ hidden_states = self.mlp(hidden_states)
280
+ hidden_states = self.post_ffn_layernorm(hidden_states)
281
+ hidden_states = residual + hidden_states
282
+
283
+ outputs = (hidden_states,)
284
+ if output_attentions:
285
+ outputs += (self_attn_weights,)
286
+
287
+ return outputs
288
+
289
+
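Note that this decoder layer normalizes both before and after each sub-block ("sandwich"-style: `attention_layernorm` -> self-attention -> `post_attention_layernorm`, then `ffn_layernorm` -> MLP -> `post_ffn_layernorm`), which appears to differ from the stock transformers Qwen2 layer, where only a pre-norm is applied before each sub-block. A minimal sketch of the residual pattern used here:

```python
# Sketch of the residual/normalization pattern implemented by Qwen2DecoderLayer above.
def decoder_block(x, attn, mlp, attn_norm, post_attn_norm, ffn_norm, post_ffn_norm):
    # Attention sub-block: norm -> attention -> norm -> residual add.
    residual = x
    x = post_attn_norm(attn(attn_norm(x)))
    x = residual + x

    # MLP sub-block: norm -> MLP -> norm -> residual add.
    residual = x
    x = post_ffn_norm(mlp(ffn_norm(x)))
    x = residual + x
    return x
```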
290
+ class Qwen2RotaryEmbedding(nn.Module):
291
+ def __init__(self, config: Qwen2Config, device=None):
292
+ super().__init__()
293
+ # BC: "rope_type" was originally "type"
294
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
295
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
296
+ else:
297
+ self.rope_type = "default"
298
+ self.max_seq_len_cached = config.max_position_embeddings
299
+ self.original_max_seq_len = config.max_position_embeddings
300
+
301
+ self.config = config
302
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
303
+
304
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
305
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
306
+ self.original_inv_freq = self.inv_freq
307
+
308
+ def _dynamic_frequency_update(self, position_ids, device):
309
+ """
310
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
311
+ 1 - growing beyond the cached sequence length (allow scaling)
312
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
313
+ """
314
+ seq_len = torch.max(position_ids) + 1
315
+ if seq_len > self.max_seq_len_cached: # growth
316
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
317
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
318
+ self.max_seq_len_cached = seq_len
319
+
320
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
321
+ # This .to() is needed if the model has been moved to a device after being initialized (because
322
+ # the buffer is automatically moved, but not the original copy)
323
+ self.original_inv_freq = self.original_inv_freq.to(device)
324
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
325
+ self.max_seq_len_cached = self.original_max_seq_len
326
+
327
+ @torch.no_grad()
328
+ def forward(self, x, position_ids):
329
+ if "dynamic" in self.rope_type:
330
+ self._dynamic_frequency_update(position_ids, device=x.device)
331
+
332
+ # Core RoPE block
333
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
334
+ position_ids_expanded = position_ids[:, None, :].float()
335
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
336
+ device_type = x.device.type
337
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
338
+ with torch.autocast(device_type=device_type, enabled=False):
339
+ freqs = (inv_freq_expanded.float().to(x.device) @ position_ids_expanded.float()).transpose(1, 2)
340
+ emb = torch.cat((freqs, freqs), dim=-1)
341
+ cos = emb.cos()
342
+ sin = emb.sin()
343
+
344
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
345
+ cos = cos * self.attention_scaling
346
+ sin = sin * self.attention_scaling
347
+
348
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
349
+
350
+
351
+ QWEN2_START_DOCSTRING = r"""
352
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
353
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
354
+ etc.)
355
+
356
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
357
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
358
+ and behavior.
359
+
360
+ Parameters:
361
+ config ([`Qwen2Config`]):
362
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
363
+ load the weights associated with the model, only the configuration. Check out the
364
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
365
+ """
366
+
367
+
368
+ @add_start_docstrings(
369
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
370
+ QWEN2_START_DOCSTRING,
371
+ )
372
+ class Qwen2PreTrainedModel(PreTrainedModel):
373
+ config_class = Qwen2Config
374
+ base_model_prefix = "model"
375
+ supports_gradient_checkpointing = True
376
+ _no_split_modules = ["Qwen2DecoderLayer"]
377
+ _skip_keys_device_placement = ["past_key_values"]
378
+ _supports_flash_attn_2 = True
379
+ _supports_sdpa = True
380
+ _supports_flex_attn = True
381
+ _supports_cache_class = True
382
+ _supports_quantized_cache = True
383
+ _supports_static_cache = True
384
+ _supports_attention_backend = True
385
+
386
+ def _init_weights(self, module):
387
+ std = self.config.initializer_range
388
+ if isinstance(module, nn.Linear):
389
+ module.weight.data.normal_(mean=0.0, std=std)
390
+ if module.bias is not None:
391
+ module.bias.data.zero_()
392
+ elif isinstance(module, nn.Embedding):
393
+ module.weight.data.normal_(mean=0.0, std=std)
394
+ if module.padding_idx is not None:
395
+ module.weight.data[module.padding_idx].zero_()
396
+
397
+
398
+ QWEN2_INPUTS_DOCSTRING = r"""
399
+ Args:
400
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
401
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
402
+ it.
403
+
404
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
405
+ [`PreTrainedTokenizer.__call__`] for details.
406
+
407
+ [What are input IDs?](../glossary#input-ids)
408
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
409
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
410
+
411
+ - 1 for tokens that are **not masked**,
412
+ - 0 for tokens that are **masked**.
413
+
414
+ [What are attention masks?](../glossary#attention-mask)
415
+
416
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
417
+ [`PreTrainedTokenizer.__call__`] for details.
418
+
419
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
420
+ `past_key_values`).
421
+
422
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
423
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
424
+ information on the default strategy.
425
+
426
+ - 1 indicates the head is **not masked**,
427
+ - 0 indicates the head is **masked**.
428
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
429
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
430
+ config.n_positions - 1]`.
431
+
432
+ [What are position IDs?](../glossary#position-ids)
433
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
434
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
435
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
436
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
437
+
438
+ Two formats are allowed:
439
+ - a [`~cache_utils.Cache`] instance, see our
440
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
441
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
442
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
443
+ cache format.
444
+
445
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
446
+ legacy cache format will be returned.
447
+
448
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
449
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
450
+ of shape `(batch_size, sequence_length)`.
451
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
452
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
453
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
454
+ model's internal embedding lookup matrix.
455
+ use_cache (`bool`, *optional*):
456
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
457
+ `past_key_values`).
458
+ output_attentions (`bool`, *optional*):
459
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
460
+ tensors for more detail.
461
+ output_hidden_states (`bool`, *optional*):
462
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
463
+ more detail.
464
+ return_dict (`bool`, *optional*):
465
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
466
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
467
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
468
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
469
+ the complete sequence length.
470
+ """
471
+
472
+
473
+ @add_start_docstrings(
474
+ "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
475
+ QWEN2_START_DOCSTRING,
476
+ )
477
+ class Qwen2Model(Qwen2PreTrainedModel):
478
+ """
479
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
480
+
481
+ Args:
482
+ config: Qwen2Config
483
+ """
484
+
485
+ def __init__(self, config: Qwen2Config):
486
+ super().__init__(config)
487
+ self.padding_idx = config.pad_token_id
488
+ self.vocab_size = config.vocab_size
489
+
490
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
491
+ self.layers = nn.ModuleList(
492
+ [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
493
+ )
494
+ self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
495
+ self.rotary_emb = Qwen2RotaryEmbedding(config=config)
496
+ self.gradient_checkpointing = False
497
+
498
+ # Initialize weights and apply final processing
499
+ self.post_init()
500
+
501
+ def get_input_embeddings(self):
502
+ return self.embed_tokens
503
+
504
+ def set_input_embeddings(self, value):
505
+ self.embed_tokens = value
506
+
507
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
508
+ def forward(
509
+ self,
510
+ input_ids: torch.LongTensor = None,
511
+ attention_mask: Optional[torch.Tensor] = None,
512
+ position_ids: Optional[torch.LongTensor] = None,
513
+ past_key_values: Optional[Cache] = None,
514
+ inputs_embeds: Optional[torch.FloatTensor] = None,
515
+ use_cache: Optional[bool] = None,
516
+ output_attentions: Optional[bool] = None,
517
+ output_hidden_states: Optional[bool] = None,
518
+ return_dict: Optional[bool] = None,
519
+ cache_position: Optional[torch.LongTensor] = None,
520
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
521
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
522
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
523
+ output_hidden_states = (
524
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
525
+ )
526
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
527
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
528
+
529
+ if (input_ids is None) ^ (inputs_embeds is not None):
530
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
531
+
532
+ if self.gradient_checkpointing and self.training and use_cache:
533
+ logger.warning_once(
534
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
535
+ )
536
+ use_cache = False
537
+
538
+ if inputs_embeds is None:
539
+ inputs_embeds = self.embed_tokens(input_ids)
540
+
541
+ if use_cache and past_key_values is None:
542
+ past_key_values = DynamicCache()
543
+
544
+ if cache_position is None:
545
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
546
+ cache_position = torch.arange(
547
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
548
+ )
549
+
550
+ if position_ids is None:
551
+ position_ids = cache_position.unsqueeze(0)
552
+
553
+ causal_mask = self._update_causal_mask(
554
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
555
+ )
556
+
557
+ hidden_states = inputs_embeds
558
+
559
+ # create position embeddings to be shared across the decoder layers
560
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
561
+
562
+ # decoder layers
563
+ all_hidden_states = () if output_hidden_states else None
564
+ all_self_attns = () if output_attentions else None
565
+
566
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
567
+ if output_hidden_states:
568
+ all_hidden_states += (hidden_states,)
569
+
570
+ if self.gradient_checkpointing and self.training:
571
+ layer_outputs = self._gradient_checkpointing_func(
572
+ decoder_layer.__call__,
573
+ hidden_states,
574
+ causal_mask,
575
+ position_ids,
576
+ past_key_values,
577
+ output_attentions,
578
+ use_cache,
579
+ cache_position,
580
+ position_embeddings,
581
+ )
582
+ else:
583
+ layer_outputs = decoder_layer(
584
+ hidden_states,
585
+ attention_mask=causal_mask,
586
+ position_ids=position_ids,
587
+ past_key_value=past_key_values,
588
+ output_attentions=output_attentions,
589
+ use_cache=use_cache,
590
+ cache_position=cache_position,
591
+ position_embeddings=position_embeddings,
592
+ **flash_attn_kwargs,
593
+ )
594
+
595
+ hidden_states = layer_outputs[0]
596
+
597
+ if output_attentions:
598
+ all_self_attns += (layer_outputs[1],)
599
+
600
+ hidden_states = self.norm(hidden_states)
601
+
602
+ # add hidden states from the last decoder layer
603
+ if output_hidden_states:
604
+ all_hidden_states += (hidden_states,)
605
+
606
+ output = BaseModelOutputWithPast(
607
+ last_hidden_state=hidden_states,
608
+ past_key_values=past_key_values if use_cache else None,
609
+ hidden_states=all_hidden_states,
610
+ attentions=all_self_attns,
611
+ )
612
+ return output if return_dict else output.to_tuple()
613
+
614
+ def _update_causal_mask(
615
+ self,
616
+ attention_mask: torch.Tensor,
617
+ input_tensor: torch.Tensor,
618
+ cache_position: torch.Tensor,
619
+ past_key_values: Cache,
620
+ output_attentions: bool,
621
+ ):
622
+ if self.config._attn_implementation == "flash_attention_2":
623
+ if attention_mask is not None and past_key_values is not None:
624
+ is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
625
+ if is_padding_right:
626
+ raise ValueError(
627
+ "You are attempting to perform batched generation with padding_side='right'"
628
+ " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
629
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
630
+ )
631
+ if attention_mask is not None and 0.0 in attention_mask:
632
+ return attention_mask
633
+ return None
634
+
635
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
636
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
637
+ # to infer the attention mask.
638
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
639
+ using_static_cache = isinstance(past_key_values, StaticCache)
640
+ using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
641
+
642
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
643
+ if (
644
+ self.config._attn_implementation == "sdpa"
645
+ and not (using_static_cache or using_sliding_window_cache)
646
+ and not output_attentions
647
+ ):
648
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
649
+ attention_mask,
650
+ inputs_embeds=input_tensor,
651
+ past_key_values_length=past_seen_tokens,
652
+ sliding_window=self.config.sliding_window,
653
+ is_training=self.training,
654
+ ):
655
+ return None
656
+
657
+ dtype, device = input_tensor.dtype, input_tensor.device
658
+ min_dtype = torch.finfo(dtype).min
659
+ sequence_length = input_tensor.shape[1]
660
+ # SlidingWindowCache or StaticCache
661
+ if using_sliding_window_cache or using_static_cache:
662
+ target_length = past_key_values.get_max_cache_shape()
663
+ # DynamicCache or no cache
664
+ else:
665
+ target_length = (
666
+ attention_mask.shape[-1]
667
+ if isinstance(attention_mask, torch.Tensor)
668
+ else past_seen_tokens + sequence_length + 1
669
+ )
670
+
671
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
672
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
673
+ attention_mask,
674
+ sequence_length=sequence_length,
675
+ target_length=target_length,
676
+ dtype=dtype,
677
+ device=device,
678
+ cache_position=cache_position,
679
+ batch_size=input_tensor.shape[0],
680
+ config=self.config,
681
+ past_key_values=past_key_values,
682
+ )
683
+
684
+ if (
685
+ self.config._attn_implementation == "sdpa"
686
+ and attention_mask is not None
687
+ and attention_mask.device.type in ["cuda", "xpu"]
688
+ and not output_attentions
689
+ ):
690
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
691
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
692
+ # Details: https://github.com/pytorch/pytorch/issues/110213
693
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
694
+
695
+ return causal_mask
696
+
697
+ @staticmethod
698
+ def _prepare_4d_causal_attention_mask_with_cache_position(
699
+ attention_mask: torch.Tensor,
700
+ sequence_length: int,
701
+ target_length: int,
702
+ dtype: torch.dtype,
703
+ device: torch.device,
704
+ cache_position: torch.Tensor,
705
+ batch_size: int,
706
+ config: Qwen2Config,
707
+ past_key_values: Cache,
708
+ ):
709
+ """
710
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
711
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
712
+
713
+ Args:
714
+ attention_mask (`torch.Tensor`):
715
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
716
+ sequence_length (`int`):
717
+ The sequence length being processed.
718
+ target_length (`int`):
719
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
720
+ dtype (`torch.dtype`):
721
+ The dtype to use for the 4D attention mask.
722
+ device (`torch.device`):
723
+ The device to place the 4D attention mask on.
724
+ cache_position (`torch.Tensor`):
725
+ Indices depicting the position of the input sequence tokens in the sequence.
726
+ batch_size (`torch.Tensor`):
727
+ Batch size.
728
+ config (`Qwen2Config`):
729
+ The model's configuration class
730
+ past_key_values (`Cache`):
731
+ The cache class that is being used currently to generate
732
+ """
733
+ if attention_mask is not None and attention_mask.dim() == 4:
734
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
735
+ causal_mask = attention_mask
736
+ else:
737
+ min_dtype = torch.finfo(dtype).min
738
+ causal_mask = torch.full(
739
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
740
+ )
741
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
742
+ if config.sliding_window is not None:
743
+ # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
744
+ # the check is needed to verify is current checkpoint was trained with sliding window or not
745
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
746
+ sliding_attend_mask = torch.arange(target_length, device=device) <= (
747
+ cache_position.reshape(-1, 1) - config.sliding_window
748
+ )
749
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
750
+ causal_mask *= diagonal_attend_mask
751
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
752
+ if attention_mask is not None:
753
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
754
+ if attention_mask.shape[-1] > target_length:
755
+ attention_mask = attention_mask[:, :target_length]
756
+ mask_length = attention_mask.shape[-1]
757
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
758
+ causal_mask.device
759
+ )
760
+ padding_mask = padding_mask == 0
761
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
762
+ padding_mask, min_dtype
763
+ )
764
+ return causal_mask
765
+
766
+ class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin):
767
+ _tied_weights_keys = ["lm_head.weight"]
768
+ _tp_plan = {"lm_head": "colwise_rep"}
769
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
770
+
771
+ def __init__(self, config):
772
+ super().__init__(config)
773
+ self.model = Qwen2Model(config)
774
+ self.vocab_size = config.vocab_size
775
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
776
+
777
+ # Initialize weights and apply final processing
778
+ self.post_init()
779
+
780
+ def get_input_embeddings(self):
781
+ return self.model.embed_tokens
782
+
783
+ def set_input_embeddings(self, value):
784
+ self.model.embed_tokens = value
785
+
786
+ def get_output_embeddings(self):
787
+ return self.lm_head
788
+
789
+ def set_output_embeddings(self, new_embeddings):
790
+ self.lm_head = new_embeddings
791
+
792
+ def set_decoder(self, decoder):
793
+ self.model = decoder
794
+
795
+ def get_decoder(self):
796
+ return self.model
797
+
798
+ @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
799
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
800
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
801
+ def forward(
802
+ self,
803
+ input_ids: torch.LongTensor = None,
804
+ attention_mask: Optional[torch.Tensor] = None,
805
+ position_ids: Optional[torch.LongTensor] = None,
806
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
807
+ inputs_embeds: Optional[torch.FloatTensor] = None,
808
+ labels: Optional[torch.LongTensor] = None,
809
+ use_cache: Optional[bool] = None,
810
+ output_attentions: Optional[bool] = None,
811
+ output_hidden_states: Optional[bool] = None,
812
+ return_dict: Optional[bool] = None,
813
+ cache_position: Optional[torch.LongTensor] = None,
814
+ logits_to_keep: Union[int, torch.Tensor] = 0,
815
+ **kwargs,
816
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
817
+ r"""
818
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
819
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
820
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
821
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
822
+
823
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
824
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
825
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
826
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
827
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
828
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
829
+
830
+ Returns:
831
+
832
+ Example:
833
+
834
+ ```python
835
+ >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
836
+
837
+ >>> model = Qwen2ForCausalLM.from_pretrained("meta-qwen2/Qwen2-2-7b-hf")
838
+ >>> tokenizer = AutoTokenizer.from_pretrained("meta-qwen2/Qwen2-2-7b-hf")
839
+
840
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
841
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
842
+
843
+ >>> # Generate
844
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
845
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
846
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
847
+ ```"""
848
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
849
+ output_hidden_states = (
850
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
851
+ )
852
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
853
+
854
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
855
+ outputs = self.model(
856
+ input_ids=input_ids,
857
+ attention_mask=attention_mask,
858
+ position_ids=position_ids,
859
+ past_key_values=past_key_values,
860
+ inputs_embeds=inputs_embeds,
861
+ use_cache=use_cache,
862
+ output_attentions=output_attentions,
863
+ output_hidden_states=output_hidden_states,
864
+ return_dict=return_dict,
865
+ cache_position=cache_position,
866
+ **kwargs,
867
+ )
868
+
869
+ hidden_states = outputs[0]
870
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
871
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
872
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
873
+ logits = 30.0 * torch.tanh(logits / 30.0)
874
+
875
+ loss = None
876
+ if labels is not None:
877
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
878
+
879
+ if not return_dict:
880
+ output = (logits,) + outputs[1:]
881
+ return (loss,) + output if loss is not None else output
882
+
883
+ return CausalLMOutputWithPast(
884
+ loss=loss,
885
+ logits=logits,
886
+ past_key_values=outputs.past_key_values,
887
+ hidden_states=outputs.hidden_states,
888
+ attentions=outputs.attentions,
889
+ )
890
+
891
+
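One notable departure from the stock transformers Qwen2 head in the forward above: after the `lm_head` projection, logits are soft-capped with `30.0 * torch.tanh(logits / 30.0)`, which bounds them to (-30, 30) while remaining close to the identity for small values. A quick numeric illustration:

```python
# Sketch: effect of the tanh soft-cap applied to the lm_head logits above.
import torch

logits = torch.tensor([-100.0, -10.0, 0.0, 10.0, 100.0])
capped = 30.0 * torch.tanh(logits / 30.0)
print(capped)  # approximately [-29.9, -9.6, 0.0, 9.6, 29.9] -- bounded to (-30, 30)
```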
892
+ @add_start_docstrings(
893
+ """
894
+ The Qwen2 Model transformer with a sequence classification head on top (linear layer).
895
+
896
+ [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
897
+ (e.g. GPT-2) do.
898
+
899
+ Since it does classification on the last token, it requires to know the position of the last token. If a
900
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
901
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
902
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
903
+ each row of the batch).
904
+ """,
905
+ QWEN2_START_DOCSTRING,
906
+ )
907
+ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
908
+ def __init__(self, config):
909
+ super().__init__(config)
910
+ self.num_labels = config.num_labels
911
+ self.model = Qwen2Model(config)
912
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
913
+
914
+ # Initialize weights and apply final processing
915
+ self.post_init()
916
+
917
+ def get_input_embeddings(self):
918
+ return self.model.embed_tokens
919
+
920
+ def set_input_embeddings(self, value):
921
+ self.model.embed_tokens = value
922
+
923
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
924
+ def forward(
925
+ self,
926
+ input_ids: Optional[torch.LongTensor] = None,
927
+ attention_mask: Optional[torch.Tensor] = None,
928
+ position_ids: Optional[torch.LongTensor] = None,
929
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
930
+ inputs_embeds: Optional[torch.FloatTensor] = None,
931
+ labels: Optional[torch.LongTensor] = None,
932
+ use_cache: Optional[bool] = None,
933
+ output_attentions: Optional[bool] = None,
934
+ output_hidden_states: Optional[bool] = None,
935
+ return_dict: Optional[bool] = None,
936
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
937
+ r"""
938
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
939
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
940
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
941
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
942
+ """
943
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
944
+
945
+ transformer_outputs = self.model(
946
+ input_ids,
947
+ attention_mask=attention_mask,
948
+ position_ids=position_ids,
949
+ past_key_values=past_key_values,
950
+ inputs_embeds=inputs_embeds,
951
+ use_cache=use_cache,
952
+ output_attentions=output_attentions,
953
+ output_hidden_states=output_hidden_states,
954
+ return_dict=return_dict,
955
+ )
956
+ hidden_states = transformer_outputs[0]
957
+ logits = self.score(hidden_states)
958
+
959
+ if input_ids is not None:
960
+ batch_size = input_ids.shape[0]
961
+ else:
962
+ batch_size = inputs_embeds.shape[0]
963
+
964
+ if self.config.pad_token_id is None and batch_size != 1:
965
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
966
+ if self.config.pad_token_id is None:
967
+ last_non_pad_token = -1
968
+ elif input_ids is not None:
969
+ # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
970
+ non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
971
+ token_indices = torch.arange(input_ids.shape[-1], device=logits.device)
972
+ last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
973
+ else:
974
+ last_non_pad_token = -1
975
+ logger.warning_once(
976
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
977
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
978
+ )
979
+
980
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
981
+
982
+ loss = None
983
+ if labels is not None:
984
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
985
+
986
+ if not return_dict:
987
+ output = (pooled_logits,) + transformer_outputs[1:]
988
+ return ((loss,) + output) if loss is not None else output
989
+
990
+ return SequenceClassifierOutputWithPast(
991
+ loss=loss,
992
+ logits=pooled_logits,
993
+ past_key_values=transformer_outputs.past_key_values,
994
+ hidden_states=transformer_outputs.hidden_states,
995
+ attentions=transformer_outputs.attentions,
996
+ )
997
+
998
+
999
+ @add_start_docstrings(
+ """
+ The Qwen2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
+ """,
+ QWEN2_START_DOCSTRING,
+ )
+ class Qwen2ForTokenClassification(Qwen2PreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = Qwen2Model(config)
+ if getattr(config, "classifier_dropout", None) is not None:
+ classifier_dropout = config.classifier_dropout
+ elif getattr(config, "hidden_dropout", None) is not None:
+ classifier_dropout = config.hidden_dropout
+ else:
+ classifier_dropout = 0.1
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = outputs[0]
+ sequence_output = self.dropout(sequence_output)
+ logits = self.score(sequence_output)
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(logits, labels, self.config)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+ @add_start_docstrings(
+ """
+ The Qwen2 Model transformer with a span classification head on top for extractive question-answering tasks like
+ SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ QWEN2_START_DOCSTRING,
+ )
+ class Qwen2ForQuestionAnswering(Qwen2PreTrainedModel):
+ base_model_prefix = "transformer"
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.transformer = Qwen2Model(config)
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.transformer.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.transformer.embed_tokens = value
+
+ @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ start_positions: Optional[torch.LongTensor] = None,
+ end_positions: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.transformer(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ loss = None
+ if start_positions is not None and end_positions is not None:
+ loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
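
Note: because config.json maps AutoModelForCausalLM to the custom modeling_qwen2.Qwen2ForCausalLM above via auto_map, loading this checkpoint requires trust_remote_code=True. A minimal usage sketch, assuming the files in this commit live together in one Hub repository; the repository id below is a placeholder, not part of this commit:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = "namespace/repo-name"  # placeholder: substitute the actual Hub id of this repository
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(
        repo_id,
        torch_dtype=torch.bfloat16,  # matches "torch_dtype" in config.json
        trust_remote_code=True,      # lets auto_map resolve modeling_qwen2.Qwen2ForCausalLM
    )

    messages = [{"role": "user", "content": "Hello!"}]
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    output_ids = model.generate(input_ids, max_new_tokens=64)
    print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
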
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": ["<|im_start|>", "<|im_end|>"],
+ "bos_token": null,
+ "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "model_max_length": 32768,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
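
Note: the chat_template above renders conversations in ChatML and prepends a default system prompt whenever the first message is not a system message. A small sketch of what applying it would produce; the repository id is again a placeholder, and the expected string follows directly from the template:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("namespace/repo-name")  # placeholder repo id
    text = tok.apply_chat_template(
        [{"role": "user", "content": "Hi"}],
        tokenize=False,
        add_generation_prompt=True,
    )
    # Per the template above, `text` should be:
    # <|im_start|>system
    # You are a helpful assistant<|im_end|>
    # <|im_start|>user
    # Hi<|im_end|>
    # <|im_start|>assistant
    print(text)
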
vocab.json ADDED
The diff for this file is too large to render. See raw diff