fix image size unchangeable
#32 by JDEdisonChen · opened
- README.md +1 -6
- README_en.md +1 -5
- config.json +1 -1
- generation_config.json +1 -1
- modeling_chatglm.py +231 -16
- tokenization_chatglm.py +0 -3
- visual.py +8 -17
README.md
CHANGED

@@ -19,8 +19,6 @@ inference: false
 
 Read this in [English](README_en.md)
 
-**2024/08/12, the code in this repository has been updated and now uses `transformers>=4.44.0`; please update your dependencies accordingly.**
-
 GLM-4V-9B is the open-source multimodal version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.
 **GLM-4V-9B** supports multi-round Chinese-English dialogue at a high resolution of 1120 * 1120. In multimodal evaluations covering comprehensive Chinese and English abilities, perceptual reasoning, text recognition, and chart understanding, GLM-4V-9B outperforms GPT-4-turbo-2024-04-09, Gemini
 1.0 Pro, Qwen-VL-Max, and Claude 3 Opus.

@@ -48,10 +46,7 @@ GLM-4V-9B is a multimodal language model with visual understanding capabilities, and its related
 
 ## Running the Model
 
-
-
-**Please install strictly according to the [dependencies](https://github.com/THUDM/GLM-4/blob/main/basic_demo/requirements.txt), otherwise the model will not run properly.**
-。
+For more inference code and dependency information, please visit our [github](https://github.com/THUDM/GLM-4).
 
 ```python
 import torch
README_en.md
CHANGED

@@ -1,7 +1,5 @@
 # GLM-4V-9B
 
-**2024/08/12, The repository code has been updated and now requires `transformers>=4.44.0`. Please update your dependencies accordingly.**
-
 GLM-4V-9B is an open source multimodal version of the latest generation of pre-trained models in the GLM-4 series launched by Zhipu AI.
 **GLM-4V-9B** has the ability to conduct multi-round conversations in Chinese and English at a high resolution of 1120 * 1120. In multimodal evaluations of comprehensive Chinese and English abilities, perceptual reasoning, text recognition, and chart understanding, GLM-4V-9B has shown superior performance over GPT-4-turbo-2024-04-09, Gemini
 1.0 Pro, Qwen-VL-Max, and Claude 3 Opus.

@@ -31,9 +29,7 @@ GLM-4V-9B is a multimodal language model with visual understanding capabilities.
 
 ## Quick Start
 
-
-
-**Please strictly follow the [dependencies](https://github.com/THUDM/GLM-4/blob/main/basic_demo/requirements.txt) to install, otherwise it will not run properly**
+For more inference code and requirements, please visit our [github page](https://github.com/THUDM/GLM-4).
 
 
 ```python
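The Quick Start code of both READMEs is truncated in this view right after the opening ```` ```python ```` fence. For orientation, a minimal inference sketch in the spirit of the upstream Quick Start is shown below; the model id `THUDM/glm-4v-9b`, the use of `AutoModelForCausalLM`/`AutoTokenizer` with `trust_remote_code=True`, and the prompt are assumptions for illustration, not text taken from this PR.

```python
# Minimal sketch (assumptions noted above): load GLM-4V-9B and run one image+text query.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4v-9b", torch_dtype=torch.bfloat16, trust_remote_code=True
).to(device).eval()

image = Image.open("example.jpg").convert("RGB")
# The tokenizer's chat template accepts an "image" field in a message,
# mirroring how the restored chat() helper builds its inputs (see modeling_chatglm.py below).
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "image": image, "content": "Describe this image."}],
    add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True,
).to(device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=256)
# Strip the prompt tokens and decode only the newly generated reply.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```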
config.json
CHANGED

@@ -50,7 +50,7 @@
   "seq_length": 8192,
   "use_cache": true,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.40.2",
   "tie_word_embeddings": false,
   "eos_token_id": [151329, 151336, 151338],
   "pad_token_id": 151329,
generation_config.json
CHANGED

@@ -9,5 +9,5 @@
   "temperature": 0.8,
   "max_length": 8192,
   "top_p": 0.8,
-  "transformers_version": "4.
+  "transformers_version": "4.40.2"
 }
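Both JSON configs roll `transformers_version` back to 4.40.2, which matches the removal of the `transformers>=4.44.0` notice from the READMEs. A small guard like the sketch below can catch a mismatched environment before loading the checkpoint; the 4.40.2 pin comes from this diff, while the guard itself is only an illustrative assumption, not part of the PR.

```python
# Sketch: fail fast when the installed transformers version does not match
# the version this revision of the custom modeling code was written against.
from packaging import version  # packaging ships as a dependency of transformers
import transformers

REQUIRED = version.parse("4.40.2")  # from config.json / generation_config.json in this PR
installed = version.parse(transformers.__version__)
if installed != REQUIRED:
    raise RuntimeError(
        f"This revision targets transformers=={REQUIRED}, found {installed}; "
        "install it with: pip install transformers==4.40.2"
    )
```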
modeling_chatglm.py
CHANGED

@@ -1,13 +1,18 @@
-""" PyTorch
+""" PyTorch ChatGLM model. """
+import json
 import math
+import copy
+import warnings
 import sys
+
 import torch
 import torch.utils.checkpoint
 import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
 from torch.nn.utils import skip_init
-from typing import Optional, Tuple, Union, List, Dict, Any
+from typing import Optional, Tuple, Union, List, Callable, Dict, Any
+from copy import deepcopy
 
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,

@@ -848,6 +853,11 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
         batch_size, seq_length = input_ids.shape
         position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
 
+    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
+        if not self.supports_gradient_checkpointing:
+            raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
+
+
 class Embedding(torch.nn.Module):
     """Language model embeddings."""
 

@@ -1082,22 +1092,21 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             outputs: ModelOutput,
             model_kwargs: Dict[str, Any],
             is_encoder_decoder: bool = False,
-
+            standardize_cache_format: bool = False,
     ) -> Dict[str, Any]:
-
-
-
-
-            else:
-                cache_name = possible_cache_name
-            model_kwargs[cache_name] = getattr(outputs, possible_cache_name)
-            break
+        # update past_key_values
+        model_kwargs["past_key_values"] = self._extract_past_from_model_output(
+            outputs, standardize_cache_format=standardize_cache_format
+        )
 
+        # update attention mask
         if "attention_mask" in model_kwargs:
             attention_mask = model_kwargs["attention_mask"]
             model_kwargs["attention_mask"] = torch.cat(
                 [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
             )
+
+        # update position ids
         if "position_ids" in model_kwargs:
             position_ids = model_kwargs["position_ids"]
             new_position_id = position_ids[..., -1:].clone()

@@ -1105,11 +1114,8 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             model_kwargs["position_ids"] = torch.cat(
                 [position_ids, new_position_id], dim=-1
             )
-        model_kwargs["is_first_forward"] = False
 
-
-        model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
-
+        model_kwargs["is_first_forward"] = False
         return model_kwargs
 
     def prepare_inputs_for_generation(

@@ -1198,6 +1204,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
 
         loss = None
         if labels is not None:
+            # https://github.com/THUDM/GLM-4/issues/264
            new_labels = []
            for i in range(len(input_ids)):
                input_id = input_ids[i].tolist()

@@ -1209,12 +1216,16 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                    (
                        labels[i, :boi_token_pos + 1],
                        torch.tensor([-100]).to(labels.device).to(labels.dtype).repeat(1600),
-                        labels[i, eoi_token_pos:])))
+                        labels[i, eoi_token_pos:])))  # inserted between the two tokens
 
            labels = torch.stack(new_labels, dim=0)
+
            lm_logits = lm_logits.to(torch.float32)
+
+            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
 

@@ -1252,6 +1263,210 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
            for layer_past in past
        )
 
+    def process_response(self, output, history):
+        content = ""
+        history = deepcopy(history)
+        for response in output.split("<|assistant|>"):
+            if "\n" in response:
+                metadata, content = response.split("\n", maxsplit=1)
+            else:
+                metadata, content = "", response
+            if not metadata.strip():
+                content = content.strip()
+                history.append({"role": "assistant", "metadata": metadata, "content": content})
+                content = content.replace("[[训练时间]]", "2023年")
+            else:
+                history.append({"role": "assistant", "metadata": metadata, "content": content})
+                if history[0]["role"] == "system" and "tools" in history[0]:
+                    parameters = json.loads(content)
+                    content = {"name": metadata.strip(), "parameters": parameters}
+                else:
+                    content = {"name": metadata.strip(), "content": content}
+        return content, history
+
+    @torch.inference_mode()
+    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", image=None,
+             max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
+             **kwargs):
+        if history is None:
+            history = []
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
+                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+        message = {"role": role, "content": query}
+        if image is not None:
+            message["image"] = image
+        history.append(message)
+        inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=True,
+                                               return_tensors="pt", return_dict=True)
+        inputs = inputs.to(self.device)
+        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"),
+                        tokenizer.convert_tokens_to_ids("<|observation|>")]
+        outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
+        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
+        response = tokenizer.decode(outputs)
+        response, history = self.process_response(response, history)
+        return response, history
+
+    @torch.inference_mode()
+    def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", image=None,
+                    past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
+                    logits_processor=None, return_past_key_values=False, **kwargs):
+        if history is None:
+            history = []
+        if logits_processor is None:
+            logits_processor = LogitsProcessorList()
+        logits_processor.append(InvalidScoreLogitsProcessor())
+        eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"),
+                        tokenizer.convert_tokens_to_ids("<|observation|>")]
+        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
+                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+        message = {"role": role, "content": "query"}
+        if image is not None:
+            message["image"] = image
+        if past_key_values is None:
+            inputs = tokenizer.apply_chat_template(history + [message],
+                                                   add_generation_prompt=True, tokenize=True, return_tensors="pt",
+                                                   return_dict=True)
+        else:
+            inputs = tokenizer.apply_chat_template([message], add_special_tokens=False,
+                                                   add_generation_prompt=True, tokenize=True, return_tensors="pt",
+                                                   return_dict=True)
+        inputs = inputs.to(self.device)
+        if past_key_values is not None:
+            past_length = past_key_values[0][0].shape[2]
+            if self.transformer.pre_seq_len is not None:
+                past_length -= self.transformer.pre_seq_len
+            inputs.position_ids += past_length
+            attention_mask = inputs.attention_mask
+            attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
+            inputs['attention_mask'] = attention_mask
+        history.append({"role": role, "content": query})
+        for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
+                                            eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
+                                            **gen_kwargs):
+            if return_past_key_values:
+                outputs, past_key_values = outputs
+            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
+            response = tokenizer.decode(outputs)
+            if response and response[-1] != "�":
+                response, new_history = self.process_response(response, history)
+                if return_past_key_values:
+                    yield response, new_history, past_key_values
+                else:
+                    yield response, new_history
+
+    @torch.inference_mode()
+    def stream_generate(
+            self,
+            input_ids,
+            generation_config: Optional[GenerationConfig] = None,
+            logits_processor: Optional[LogitsProcessorList] = None,
+            stopping_criteria: Optional[StoppingCriteriaList] = None,
+            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
+            return_past_key_values=False,
+            **kwargs,
+    ):
+        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
+
+        if generation_config is None:
+            generation_config = self.generation_config
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)
+        model_kwargs["use_cache"] = generation_config.use_cache
+        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
+
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
+
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        if has_default_max_length and generation_config.max_new_tokens is None:
+            warnings.warn(
+                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
+                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+                " recommend using `max_new_tokens` to control the maximum length of the generation.",
+                UserWarning,
+            )
+        elif generation_config.max_new_tokens is not None:
+            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
+            if not has_default_max_length:
+                logger.warn(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
+                    UserWarning,
+                )
+
+        if input_ids_seq_length >= generation_config.max_length:
+            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
+            logger.warning(
+                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
+                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
+                " increasing `max_new_tokens`."
+            )
+
+        # 2. Set generation parameters if not already defined
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_seq_length,
+            encoder_input_ids=input_ids,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            logits_processor=logits_processor,
+        )
+
+        stopping_criteria = self._get_stopping_criteria(
+            generation_config=generation_config, stopping_criteria=stopping_criteria
+        )
+        logits_warper = self._get_logits_warper(generation_config)
+
+        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+        scores = None
+        while True:
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=False,
+                output_hidden_states=False,
+            )
+
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # pre-process distribution
+            next_token_scores = logits_processor(input_ids, next_token_logits)
+            next_token_scores = logits_warper(input_ids, next_token_scores)
+
+            # sample
+            probs = nn.functional.softmax(next_token_scores, dim=-1)
+            if generation_config.do_sample:
+                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                next_tokens = torch.argmax(probs, dim=-1)
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+            )
+            unfinished_sequences = unfinished_sequences.mul(
+                next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+            )
+            if return_past_key_values:
+                yield input_ids, outputs.past_key_values
+            else:
+                yield input_ids
+            # stop when each sentence is finished, or if we exceed the maximum length
+            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+                break
+
+
 class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
     def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
         super().__init__(config)
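The bulk of this file change restores the high-level `process_response`, `chat`, `stream_chat`, and `stream_generate` helpers. A usage sketch for the restored interface is shown below; the loading code and image path are illustrative assumptions, while the method names, signatures, and return values follow the diff above.

```python
# Sketch: exercising the restored chat helpers (signatures taken from the diff above).
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4v-9b", torch_dtype=torch.bfloat16, trust_remote_code=True
).cuda().eval()

image = Image.open("demo.jpg").convert("RGB")

# Single-turn call: returns the decoded reply plus the updated conversation history.
response, history = model.chat(tokenizer, "What is in this picture?", history=[], image=image)
print(response)

# Streaming call: yields progressively longer responses as tokens are generated.
for partial, history in model.stream_chat(tokenizer, "Describe it in one sentence.", history=history):
    print(partial, end="\r")
print()
```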
tokenization_chatglm.py
CHANGED

@@ -54,8 +54,6 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
         super().__init__(
             padding_side=padding_side,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            encode_special_tokens=encode_special_tokens,
-            image_size=image_size,
             **kwargs
         )
 

@@ -305,7 +303,6 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
             max_length: Optional[int] = None,
             padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
             pad_to_multiple_of: Optional[int] = None,
-            padding_side: Optional[str] = None,
             return_attention_mask: Optional[bool] = None,
     ) -> dict:
         """
visual.py
CHANGED

@@ -6,7 +6,6 @@ from transformers.activations import ACT2FN
 import math
 from torch.nn import LayerNorm
 
-
 def standard_attention(query_layer, key_layer, value_layer, scaling_attention_score=True):
     if scaling_attention_score:
         query_layer = query_layer / math.sqrt(query_layer.shape[-1])

@@ -17,12 +16,11 @@ def standard_attention(query_layer, key_layer, value_layer, scaling_attention_score=True):
     context_layer = torch.matmul(attention_probs, value_layer)
     return context_layer
 
-
 def attention_fn_default(query_layer, key_layer, value_layer, scaling_attention_score=True):
     if int(torch.__version__.split('.')[0]) >= 2 and scaling_attention_score:
         # Pytorch 2.0 attention uses very much memory if attention_mask is float, and has NaN bug if attention_mask is None.
         attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_layer, key_layer, value_layer,
+            query_layer, key_layer, value_layer,
             attn_mask=None,
             dropout_p=0.,
             is_causal=False

@@ -33,12 +31,10 @@ def attention_fn_default(query_layer, key_layer, value_layer, scaling_attention_score=True):
         query_layer, key_layer, value_layer, scaling_attention_score=scaling_attention_score
     )
 
-
 class PatchEmbedding(nn.Module):
     def __init__(self, config):
         super().__init__()
-        self.proj = nn.Conv2d(config.in_channels, config.hidden_size, kernel_size=config.patch_size,
-                              stride=config.patch_size)
+        self.proj = nn.Conv2d(config.in_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size)
         self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size))
         self.position_embedding = nn.Embedding(config.num_positions, config.hidden_size)
 

@@ -66,7 +62,7 @@ class Attention(nn.Module):
         qkv = self.query_key_value(x)
         qkv = qkv.reshape(B, L, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)  # 3, B, H, L, D
         q, k, v = qkv[0], qkv[1], qkv[2]
-
+
         out = attention_fn_default(
             q, k, v
         )

@@ -109,9 +105,7 @@ class TransformerLayer(nn.Module):
         attention_output = self.input_layernorm(self.attention(attention_input))
         hidden_states = attention_input + attention_output
         mlp_input = hidden_states
-
-        # https://github.com/THUDM/GLM-4/issues/350
-        mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)).to(mlp_input.device)
+        mlp_output = self.post_attention_layernorm(self.mlp(mlp_input))
         output = mlp_input + mlp_output
         return output
 

@@ -153,8 +147,7 @@ class EVA2CLIPModel(nn.Module):
         self.patch_embedding = PatchEmbedding(vision_config)
         self.transformer = Transformer(vision_config)
         self.linear_proj = GLU(config, in_features=config.hidden_size)
-        self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, out_channels=config.hidden_size, kernel_size=2,
-                              stride=2)
+        self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, out_channels=config.hidden_size, kernel_size=2, stride=2)
         self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
         self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
         self.scaling_factor = vision_config.scaling_factor

@@ -165,16 +158,14 @@ class EVA2CLIPModel(nn.Module):
         x = x[:, 1:]
 
         b, s, h = x.shape
-        grid_size = int(s
+        grid_size = int(s**0.5)
         x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2)
         x = self.conv(x)
 
         x = x.flatten(2).transpose(1, 2)
         x = self.linear_proj(x)
-
-
-        boi = self.boi.expand(x.shape[0], -1, -1).to(x.device)
-        eoi = self.eoi.expand(x.shape[0], -1, -1).to(x.device)
+        boi = self.boi.expand(x.shape[0], -1, -1)
+        eoi = self.eoi.expand(x.shape[0], -1, -1)
         x = torch.cat((boi, x, eoi), dim=1)
         x = x / self.scaling_factor
         return x
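Deriving `grid_size` from the actual sequence length in `EVA2CLIPModel.forward` ties the post-ViT feature grid to the number of patches instead of a fixed value, which is what makes the image size changeable again. As a sanity check, the arithmetic below reproduces the 1600 image-token count that `modeling_chatglm.py` pads labels with via `repeat(1600)`; the 14-pixel patch size is an inference from 1120 / 80 and is not stated anywhere in this diff.

```python
# Sketch: how a 1120x1120 image ends up as 1600 visual tokens.
# patch_size=14 is an assumption inferred from the numbers, not taken from this PR.
image_size = 1120          # resolution quoted in the README
patch_size = 14            # assumed ViT patch size
conv_stride = 2            # the kernel_size=2, stride=2 nn.Conv2d in EVA2CLIPModel

patches_per_side = image_size // patch_size   # 80 patches along each side
s = patches_per_side ** 2                     # 6400 patch tokens after the ViT (minus cls, which is dropped)
grid_size = int(s ** 0.5)                     # 80, as computed in EVA2CLIPModel.forward
downsampled = grid_size // conv_stride        # 40 after the stride-2 downsampling conv
print(downsampled * downsampled)              # 1600 visual tokens fed to the language model
```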