Upload folder using huggingface_hub
gemma_model.py  CHANGED  (+31 -4)
@@ -54,7 +54,7 @@ from transformers.utils import (
 from .gemma_config import CostWiseGemmaConfig
 from transformers.models.gemma2.modeling_gemma2 import Gemma2RMSNorm, Gemma2RotaryEmbedding, rotate_half, apply_rotary_pos_emb
 from transformers.models.gemma2.modeling_gemma2 import Gemma2MLP, repeat_kv, Gemma2Attention, Gemma2FlashAttention2, Gemma2SdpaAttention, GEMMA2_ATTENTION_CLASSES, Gemma2DecoderLayer, GEMMA2_START_DOCSTRING
-from transformers.models.gemma2.modeling_gemma2 import
+from transformers.models.gemma2.modeling_gemma2 import GEMMA2_INPUTS_DOCSTRING
 
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
@@ -77,6 +77,33 @@ def _get_unpad_data(attention_mask):
         max_seqlen_in_batch,
     )
 
+@add_start_docstrings(
+    "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
+    GEMMA2_START_DOCSTRING,
+)
+class CostWiseGemma2PreTrainedModel(PreTrainedModel):
+    config_class = CostWiseGemmaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Gemma2DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = False
+    _supports_quantized_cache = False
+    _supports_static_cache = True
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
 
 GEMMA2_ATTENTION_CLASSES = {
     "eager": Gemma2Attention,
@@ -213,7 +240,7 @@ def token_compress(compress_ratio,
     "The bare Gemma2 Model outputting raw hidden-states without any specific head on top.",
     GEMMA2_START_DOCSTRING,
 )
-class CostWiseGemmaModel(
+class CostWiseGemmaModel(CostWiseGemma2PreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GemmaDecoderLayer`]
 
@@ -466,10 +493,10 @@ class CostWiseHead(nn.Module):
         return self.linear_head(**kwargs)
 
 
-class CostWiseGemmaForCausalLM(
+class CostWiseGemmaForCausalLM(CostWiseGemma2PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
-    def __init__(self, config):
+    def __init__(self, config: CostWiseGemmaConfig):
         super().__init__(config)
         self.model = CostWiseGemmaModel(config)
         self.vocab_size = config.vocab_size
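Taken together, the commit completes the truncated import (pulling in GEMMA2_INPUTS_DOCSTRING), adds a CostWiseGemma2PreTrainedModel base class with a _init_weights hook, and rebases CostWiseGemmaModel and CostWiseGemmaForCausalLM on that base. Below is a minimal, self-contained sketch, not part of the commit, of how a PreTrainedModel subclass with an _init_weights hook of this shape behaves; ToyConfig, ToyPreTrainedModel, and ToyModel are illustrative stand-ins, while post_init() is the standard transformers entry point that applies _init_weights to every submodule.

import torch.nn as nn
from transformers import PretrainedConfig, PreTrainedModel


class ToyConfig(PretrainedConfig):
    model_type = "toy"

    def __init__(self, hidden_size=8, initializer_range=0.02, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.initializer_range = initializer_range


class ToyPreTrainedModel(PreTrainedModel):
    config_class = ToyConfig
    base_model_prefix = "model"

    def _init_weights(self, module):
        # Same scheme as the diff: normal(0, initializer_range) for Linear and
        # Embedding weights, zero biases, zero the padding embedding row.
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class ToyModel(ToyPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.embed = nn.Embedding(16, config.hidden_size, padding_idx=0)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
        # post_init() walks every submodule and applies _init_weights, which is
        # how the rebased CostWise* classes would get their weights initialized.
        self.post_init()


model = ToyModel(ToyConfig())
print(model.embed.weight[0].abs().sum())  # padding row zeroed
print(model.proj.weight.std())            # roughly initializer_range

The class attributes set on the new base (_supports_flash_attn_2, _supports_sdpa, _no_split_modules, and so on) are consumed by the transformers loading and dispatch machinery rather than by the model code itself, so the sketch omits them.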