initial upload

Browse files

Files changed (9) hide show

__init__.py +25 -0
config.json +40 -0
configuration_transformer_rnn.py +75 -0
generation_config.json +8 -0
model.safetensors +3 -0
modeling_transformer_rnn.py +628 -0
nirvana_1_3B.json +28 -0
task_aware_delta_net.py +754 -0
ttt_cross_layer.py +340 -0

__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# -*- coding: utf-8 -*-
+from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+from fla.models.transformer.configuration_transformer import TransformerConfig
+from fla.models.transformer.modeling_transformer import (
+    TransformerForCausalLM, TransformerModel)
+from .configuration_transformer_rnn import TransformerConfig_rnn
+from .modeling_transformer_rnn import TransformerForCausalLM_rnn, TransformerModel_rnn
+from .task_aware_delta_net import Task_Aware_Delta_Net
+from .ttt_cross_layer import TTT_Cross_Layer
+# Register the stock FLA transformer config/model classes with the Hugging Face Auto* factories.
+AutoConfig.register(TransformerConfig.model_type, TransformerConfig)
+AutoModel.register(TransformerConfig, TransformerModel)
+AutoModelForCausalLM.register(TransformerConfig, TransformerForCausalLM)
+# Register the "_rnn" variants defined in this package the same way, keyed by their own model_type.
+AutoConfig.register(TransformerConfig_rnn.model_type, TransformerConfig_rnn)
+AutoModel.register(TransformerConfig_rnn, TransformerModel_rnn)
+AutoModelForCausalLM.register(TransformerConfig_rnn, TransformerForCausalLM_rnn)
+# Names exported by `from <package> import *`.
+__all__ = ['TransformerConfig', 'TransformerForCausalLM', 'TransformerModel',
+           'TransformerConfig_rnn', 'TransformerForCausalLM_rnn', 'TransformerModel_rnn',
+           'Task_Aware_Delta_Net', 'TTT_Cross_Layer']

config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_name_or_path": "/cpfs02/user/jiangyuhua/flash-linear-attention/training/configs/nirvana_1.3B-t3/nirvana_1_3B.json",
+  "architectures": [
+    "TransformerForCausalLM_rnn"
+  ],
+  "attention_bias": false,
+  "auto_map": {
+    "AutoConfig": "configuration_transformer_rnn.TransformerConfig_rnn",
+    "AutoModel": "modeling_transformer_rnn.TransformerModel_rnn",
+    "AutoModelForCausalLM": "modeling_transformer_rnn.TransformerForCausalLM_rnn"
+  },
+  "bos_token_id": 1,
+  "concept_dim": 64,
+  "elementwise_affine": true,
+  "eos_token_id": 2,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 2048,
+  "initializer_range": 0.006,
+  "intermediate_size": null,
+  "logit_dim": 32,
+  "max_position_embeddings": 32768,
+  "model_type": "transformer_rnn",
+  "norm_eps": 1e-06,
+  "norm_first": false,
+  "num_heads": 16,
+  "num_hidden_layers": 16,
+  "num_kv_heads": null,
+  "pad_token_id": 2,
+  "recurrent_depth": 4,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.0",
+  "use_cache": false,
+  "vocab_size": 128512,
+  "window_size": 2048
+}

configuration_transformer_rnn.py ADDED Viewed

	@@ -0,0 +1,75 @@

+# -*- coding: utf-8 -*-
+from typing import Optional
+from transformers.configuration_utils import PretrainedConfig
+from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
+# AutoConfig
+class TransformerConfig_rnn(PretrainedConfig):
+    """Configuration object for the ``transformer_rnn`` model type.
+
+    Each named constructor argument is stored on the instance under the same
+    name, except ``pad_token_id``, ``bos_token_id``, ``eos_token_id`` and
+    ``tie_word_embeddings``, which are forwarded (together with any extra
+    ``**kwargs``) to ``PretrainedConfig.__init__``.
+
+    NOTE(review): ``fuse_norm`` is stored here but is not read anywhere in the
+    visible modeling code -- confirm whether a sibling module consumes it.
+    """
+    model_type = 'transformer_rnn'
+    keys_to_ignore_at_inference = ['past_key_values']
+    def __init__(
+        self,
+        vocab_size: int = 32000,
+        hidden_size: int = 2048,
+        num_hidden_layers: int = 24,
+        num_heads: int = 32,
+        num_kv_heads: Optional[int] = None,
+        window_size: Optional[int] = None,
+        rope_theta: Optional[float] = 10000.,
+        max_position_embeddings: int = 2048,
+        hidden_ratio: Optional[int] = 4,
+        intermediate_size: Optional[int] = None,
+        hidden_act: str = "swish",
+        initializer_range: float = 0.02,
+        elementwise_affine: Optional[bool] = True,
+        norm_first: bool = False,
+        norm_eps: float = 1e-6,
+        use_cache: bool = True,
+        pad_token_id: Optional[int] = None,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        tie_word_embeddings: bool = False,
+        attention_bias: bool = False,
+        fuse_norm: bool = True,
+        fuse_cross_entropy: bool = True,
+        recurrent_depth: int = 4,
+        concept_dim: int = 128,
+        **kwargs,
+    ):
+        # Model-specific hyper-parameters are set before delegating to the base class.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.window_size = window_size
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_ratio = hidden_ratio
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.elementwise_affine = elementwise_affine
+        self.norm_first = norm_first
+        self.norm_eps = norm_eps
+        self.use_cache = use_cache
+        self.attention_bias = attention_bias
+        self.fuse_cross_entropy = fuse_cross_entropy
+        self.fuse_norm = fuse_norm
+        self.recurrent_depth = recurrent_depth
+        self.concept_dim = concept_dim
+        # Special-token ids, weight tying and any unrecognised keys are handled by PretrainedConfig.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 2,
+  "transformers_version": "4.46.0",
+  "use_cache": false
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9c8b64a92f136b61f25043ccd79b6b14efd5ca5287a4f3ea185c5c19bd39bcf
+size 3226267140

modeling_transformer_rnn.py ADDED Viewed

	@@ -0,0 +1,628 @@

+# -*- coding: utf-8 -*-
+from __future__ import annotations
+import math
+import warnings
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union, Dict
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from transformers.activations import ACT2FN
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import (BaseModelOutputWithPast,
+                                           CausalLMOutputWithPast)
+from dataclasses import dataclass
+from transformers.utils import ModelOutput
+@dataclass
+class BaseModelOutputWithPast_with_two_caches(ModelOutput):
+    """Base-model output container with two separate cache fields.
+
+    NOTE(review): in the visible modeling code the only construction of this
+    class is commented out; the model currently returns ``BaseModelOutputWithPast``.
+    """
+    # Hidden states returned by the model.
+    last_hidden_state: torch.FloatTensor = None
+    # First cache object (presumably per-layer recurrent/KV state -- TODO confirm).
+    past_key_values1: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    # Second cache object, tracked alongside `past_key_values1`.
+    all_past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    # Optional tuple of intermediate hidden states.
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    # Optional tuple of attention outputs.
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+@dataclass
+class CausalLMOutputWithPast_with_two_caches(ModelOutput):
+    """Causal-LM output container with two separate cache fields.
+
+    NOTE(review): in the visible modeling code the only construction of this
+    class is commented out; the LM head model currently returns ``CausalLMOutputWithPast``.
+    """
+    # Language-model logits.
+    logits: torch.FloatTensor = None
+    # Loss value, when one was computed.
+    loss: Optional[torch.FloatTensor] = None
+    # First cache object (presumably per-layer recurrent/KV state -- TODO confirm).
+    past_key_values1: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    # Second cache object, tracked alongside `past_key_values1`.
+    all_past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    # Optional tuple of intermediate hidden states.
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    # Optional tuple of attention outputs.
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+# from fla.layers.attn import Attention
+from configuration_transformer_rnn import TransformerConfig_rnn
+import sys
+import os
+# # Add the grandparent directory of the current directory to the Python path
+# current_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# sys.path.append(current_dir)
+# sys.path.append("/cpfs02/user/jiangyuhua/flash-linear-attention/fla/layers")
+# from attn_rnn import Attention_rnn ###########################################################
+# from attn_svd import Attention_svd ###########################################################
+# from attn import Attention         ###########################################################
+# from gated_deltanet import GatedDeltaNet ###########################################################
+# from rwkv7 import RWKV7Attention ###########################################################
+# from attn_gated_delta import GatedDeltaNet_attention ###########################################################
+# from scattering_mixer2 import Scattering_Mixer ###########################################################
+from task_aware_delta_net import Task_Aware_Delta_Net ###########################################################
+# from moe_rnn import CustomGRUCell, CustomRNNCell
+from ttt_cross_layer import TTT_Cross_Layer
+# from fla.models.transformer.configuration_transformer import TransformerConfig
+from fla.models.utils import Cache
+from fla.modules import (FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss,
+                         RMSNorm)
+from fla.modules.activations import swiglu_linear
+from fla.modules.layernorm import rms_norm_linear
+if TYPE_CHECKING:
+    from transformers.processing_utils import Unpack
+logger = logging.get_logger(__name__)
+class TransformerMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        hidden_ratio: Optional[int] = None,
+        intermediate_size: Optional[int] = None,
+        hidden_act: str = 'swish',
+        norm_first: bool = True,
+        norm_eps: float = 1e-5
+    ) -> TransformerMLP:
+        super().__init__()
+        self.hidden_size = hidden_size
+        # the final number of params is `hidden_ratio * hidden_size^2`
+        # `intermediate_size` is chosen to be a multiple of 256 closest to `2/3 * hidden_size * hidden_ratio`
+        if hidden_ratio is None:
+            hidden_ratio = 4
+        if intermediate_size is None:
+            intermediate_size = int(hidden_size * hidden_ratio * 2 / 3)
+            intermediate_size = 256 * ((intermediate_size + 256 - 1) // 256)
+        self.hidden_ratio = hidden_ratio
+        self.intermediate_size = intermediate_size
+        self.norm_first = norm_first
+        if norm_first:
+            self.norm = RMSNorm(hidden_size=hidden_size, eps=norm_eps)
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+    def forward(
+        self,
+        x: torch.Tensor,
+        **kwargs: Unpack[Any]
+    ) -> torch.Tensor:
+        if self.norm_first:
+            x = rms_norm_linear(x, self.norm.weight, self.norm.bias, self.gate_proj.weight, self.gate_proj.bias)
+        else:
+            x = self.gate_proj(x)
+        gate, y = x.chunk(2, -1)
+        return swiglu_linear(gate, y, self.down_proj.weight, self.down_proj.bias)
+class TransformerMLP_svd(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        hidden_ratio: Optional[int] = None,
+        intermediate_size: Optional[int] = None,
+        hidden_act: str = 'swish',
+        norm_first: bool = True,
+        norm_eps: float = 1e-5
+    ) -> TransformerMLP_svd:
+        super().__init__()
+        self.hidden_size = hidden_size
+        # the final number of params is `hidden_ratio * hidden_size^2`
+        # `intermediate_size` is chosen to be a multiple of 256 closest to `2/3 * hidden_size * hidden_ratio`
+        if hidden_ratio is None:
+            hidden_ratio = 4
+        if intermediate_size is None:
+            intermediate_size = int(hidden_size * hidden_ratio * 2 / 3)
+            intermediate_size = 256 * ((intermediate_size + 256 - 1) // 256)
+        self.hidden_ratio = hidden_ratio
+        self.intermediate_size = intermediate_size
+        self.norm_first = norm_first
+        if norm_first:
+            self.norm = RMSNorm(hidden_size=hidden_size, eps=norm_eps)
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+        self.reflector_qkvo = nn.Linear(self.intermediate_size, self.hidden_size * 4)
+    def forward(
+        self,
+        x: torch.Tensor,
+        reflect: bool = False,
+        **kwargs: Unpack[Any]
+    ) -> torch.Tensor:
+        if self.norm_first:
+            x = rms_norm_linear(x, self.norm.weight, self.norm.bias, self.gate_proj.weight, self.gate_proj.bias)
+        else:
+            x = self.gate_proj(x)
+        gate, y = x.chunk(2, -1)
+        hidden_states = swiglu_linear(gate, y, self.down_proj.weight, self.down_proj.bias)
+        if reflect:
+            reflector_qkvo = swiglu_linear(gate, y, self.reflector_qkvo.weight, self.reflector_qkvo.bias)
+            reflector_qkvo = nn.Sigmoid()(reflector_qkvo)
+            return hidden_states, reflector_qkvo
+        else:
+            return hidden_states
+class TransformerBlock_rnn(nn.Module):
+    def __init__(self, config: TransformerConfig_rnn, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.layer_idx = layer_idx
+        if not config.norm_first:
+            self.attn_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps)
+        self.head_dim = config.hidden_size // config.num_heads
+        self.Task_Aware_Delta_Net = Task_Aware_Delta_Net(
+            hidden_size=config.hidden_size,
+            head_dim=self.head_dim,
+            num_heads=config.num_heads,
+            mode='chunk',
+            rope_theta=config.rope_theta,
+            max_position_embeddings=config.max_position_embeddings,
+            norm_first=config.norm_first,
+            norm_eps=config.norm_eps,
+            layer_idx=layer_idx,
+            concept_dim=config.concept_dim
+        )
+        # use_ttt = True
+        # if use_ttt:
+        #     self.rnn_router = TTT_Cross_Layer(config)
+        # else:
+        #     self.rnn_router = CustomGRUCell(config)
+        if not config.norm_first:
+            self.mlp_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps)
+        self.mlp = TransformerMLP(
+            hidden_size=config.hidden_size,
+            hidden_ratio=config.hidden_ratio,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            norm_first=config.norm_first,
+            norm_eps=config.norm_eps
+        )
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values1: Optional[Tuple[torch.Tensor]] = None,
+        all_past_key_values: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        h_old: Optional[torch.Tensor] = None,
+        rnn_router: Optional[nn.Module] = None,
+        params: Optional[Dict] = None,
+        **kwargs: Unpack[Any]
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+        if hasattr(self, 'attn_norm'):
+            hidden_states = self.attn_norm(hidden_states)
+        hidden_states, attentions, past_key_values1, all_past_key_values, h_new, params = self.Task_Aware_Delta_Net(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            past_key_values1=past_key_values1,
+            all_past_key_values=all_past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            rnn_router=rnn_router,
+            h_old=h_old,
+            params=params,
+            **kwargs
+        )
+            # if self.rnn_router is not None:
+            #     hidden_states = self.rnn_router(hidden_states, **kwargs)
+        if hasattr(self, 'mlp_norm'):
+            hidden_states, residual = self.mlp_norm(hidden_states, residual, True)
+        else:
+            hidden_states = residual + hidden_states
+            residual = hidden_states
+        hidden_states = self.mlp(hidden_states, **kwargs)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attentions,)
+        if use_cache:
+            outputs += (past_key_values1, all_past_key_values)
+        outputs += (h_new,)
+        outputs += (params,)
+        return outputs
+class TransformerPreTrainedModel_rnn(PreTrainedModel):
+    """Shared ``PreTrainedModel`` base for the ``transformer_rnn`` models.
+
+    Declares the config class, gradient-checkpointing support and the module
+    type that must not be split across devices, and provides weight initialisation.
+    """
+    config_class = TransformerConfig_rnn
+    supports_gradient_checkpointing = True
+    _no_split_modules = ['TransformerBlock_rnn']
+    def __init__(self, *inputs, **kwargs):
+        """Forward all arguments unchanged to ``PreTrainedModel.__init__``."""
+        super().__init__(*inputs, **kwargs)
+    def _init_weights(
+        self,
+        module: nn.Module,
+        rescale_prenorm_residual: bool = False,
+        num_residuals_per_layer: int = 2,
+    ):
+        """Initialise the parameters of a single ``module``.
+
+        ``nn.Linear`` / ``nn.Conv1d`` weights and ``nn.Embedding`` weights are drawn
+        from ``N(0, initializer_range)``; biases and the padding row are zeroed.
+        Any other module exposing ``reset_parameters`` has that method called.
+
+        Args:
+            module: The module to initialise.
+            rescale_prenorm_residual: When true, additionally divide parameters named
+                exactly ``o_proj.weight`` or ``down_proj.weight`` (relative to ``module``)
+                by ``sqrt(num_residuals_per_layer * num_hidden_layers)``.
+            num_residuals_per_layer: Number of residual branches per layer used in that scale.
+        """
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif hasattr(module, 'reset_parameters'):
+            module.reset_parameters()
+        if rescale_prenorm_residual:
+            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+            #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+            #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+            #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+            #
+            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+            for name, p in module.named_parameters():
+                # The match is on the name relative to `module`, so it only fires when
+                # `module` is the direct parent of an `o_proj` / `down_proj` layer.
+                if name in ["o_proj.weight", "down_proj.weight"]:
+                    # Scale by 1/sqrt(num_residuals_per_layer * n_layer).
+                    # NOTE(review): the parameter is divided in place and is NOT re-initialised
+                    # first, so running this branch more than once on the same module would
+                    # shrink the weights each time.
+                    with torch.no_grad():
+                        p /= math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers)
+class TransformerModel_rnn(TransformerPreTrainedModel_rnn):
+    def __init__(
+        self,
+        config: TransformerConfig_rnn
+    ) -> TransformerModel_rnn:
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.concept_dim = config.concept_dim
+        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([TransformerBlock_rnn(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
+        self.norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.gradient_checkpointing = False
+        self.post_init()
+        self.rnn_router = TTT_Cross_Layer(config)
+    def get_input_embeddings(self):
+        return self.embeddings
+    def set_input_embeddings(self, value):
+        self.embeddings = value
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values1: Optional[List[torch.FloatTensor]] = None,
+        all_past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs: Unpack[Any]
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        if output_attentions:
+            warnings.warn(
+                "`TransformerModel` does not support output attention weights now, so `output_attentions` is set to `False`."
+            )
+            output_attentions = False
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        if use_cache and not isinstance(past_key_values1, Cache):
+            past_key_values1 = Cache.from_legacy_cache(past_key_values1)
+        if use_cache and not isinstance(all_past_key_values, Cache):
+            all_past_key_values = Cache.from_legacy_cache(all_past_key_values)
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+        # embed positions
+        hidden_states = inputs_embeds
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        all_hidden_states = () if output_hidden_states else None
+        all_attns = () if output_attentions else None
+        next_cache1 = None
+        next_cache2 = None
+        h_old = None
+        params = None
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    past_key_values1,
+                    all_past_key_values,
+                    output_attentions,
+                    use_cache,
+                    h_old=h_old,
+                    params=params,
+                    rnn_router=self.rnn_router,
+                    **kwargs
+                )
+            else:
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    past_key_values1=past_key_values1,
+                    all_past_key_values=all_past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    h_old=h_old,
+                    params=params,
+                    rnn_router=self.rnn_router,
+                    **kwargs
+                )
+            hidden_states = layer_outputs[0]
+            h_old = layer_outputs[-2]
+            params = layer_outputs[-1]
+            if use_cache:
+                next_cache1 = layer_outputs[2 if output_attentions else 1]
+                next_cache2 = layer_outputs[3 if output_attentions else 2]
+            if output_attentions:
+                all_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache1, all_hidden_states, all_attns] if v is not None)
+        # return BaseModelOutputWithPast_with_two_caches(
+        #     last_hidden_state=hidden_states,
+        #     past_key_values1=next_cache1,
+        #     all_past_key_values=next_cache2,
+        #     hidden_states=all_hidden_states,
+        #     attentions=all_attns
+        # )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache1,
+            hidden_states=all_hidden_states,
+            attentions=all_attns
+        )
+class TransformerForCausalLM_rnn(TransformerPreTrainedModel_rnn, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = TransformerModel_rnn(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embeddings
+    def set_input_embeddings(self, value):
+        self.model.embeddings = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor = None,
+        past_key_values1: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        all_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: bool = True,
+        num_logits_to_keep: Optional[int] = None,
+        **kwargs
+    ):
+        # only last token for `inputs_ids` if the `past_key_values` is passed along.
+        if past_key_values1 is not None:
+            input_ids = input_ids[:, -1:]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values1 is None:
+            model_inputs = {'inputs_embeds': inputs_embeds}
+        else:
+            # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
+            # recompiles graphs as the stride of the inputs is a guard.
+            # Ref: https://github.com/huggingface/transformers/pull/29114
+            # TODO: use `next_tokens` directly instead.
+            model_inputs = {'input_ids': input_ids.contiguous()}
+        if num_logits_to_keep is not None:
+            model_inputs['num_logits_to_keep'] = num_logits_to_keep
+        # model_inputs.update({
+        #     'past_key_values1': past_key_values1,
+        #     'all_past_key_values': all_past_key_values,
+        #     'use_cache': use_cache,
+        #     'attention_mask': attention_mask,
+        #     'num_logits_to_keep': num_logits_to_keep,
+        # })
+        model_inputs.update({
+            'past_key_values1': past_key_values1,
+            'use_cache': use_cache,
+            'attention_mask': attention_mask,
+            'num_logits_to_keep': num_logits_to_keep,
+        })
+        return model_inputs
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values1: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        all_past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        num_logits_to_keep: Optional[int] = 0,
+        **kwargs: Unpack[Any]
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values1=past_key_values1,
+            all_past_key_values=all_past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            **kwargs
+        )
+        hidden_states = outputs[0]
+        fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training
+        logits = None if fuse_linear_and_cross_entropy else self.lm_head(hidden_states[:, -num_logits_to_keep:])
+        loss = None
+        if labels is not None:
+            if self.config.fuse_cross_entropy:
+                if fuse_linear_and_cross_entropy:
+                    loss_fct = FusedLinearCrossEntropyLoss()
+                else:
+                    loss_fct = FusedCrossEntropyLoss(inplace_backward=True)
+            else:
+                loss_fct = nn.CrossEntropyLoss()
+            # Enable model parallelism
+            # labels = labels.to(hidden_states.device)
+            # labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], loss_fct.ignore_index)), 1)
+            if fuse_linear_and_cross_entropy:
+                loss = loss_fct(hidden_states.view(-1, self.config.hidden_size),
+                                labels.view(-1),
+                                self.lm_head.weight,
+                                self.lm_head.bias)
+            else:
+                loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        # return CausalLMOutputWithPast_with_two_caches(
+        #     loss=loss,
+        #     logits=logits,
+        #     past_key_values1=outputs.past_key_values1,
+        #     all_past_key_values=outputs.all_past_key_values,
+        #     hidden_states=outputs.hidden_states,
+        #     attentions=outputs.attentions,
+        # )
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+if __name__ == '__main__':
+    config = TransformerConfig_rnn(
+        concept_dim=128,
+        attention_bias=False,
+        bos_token_id=1,
+        eos_token_id=2,
+        fuse_cross_entropy=True,
+        fuse_norm=True,
+        hidden_act="swish",
+        hidden_size=1024,
+        initializer_range=0.02,
+        max_position_embeddings=8192,
+        model_type="transformer_rnn",
+        num_heads=16,
+        num_hidden_layers=24,
+        norm_eps=1e-06,
+        tie_word_embeddings=True,
+        use_cache=True,
+        vocab_size=32000,
+    )
+    model = TransformerForCausalLM_rnn(config).cuda().to(torch.bfloat16)
+    input_ids = torch.randint(0, 100, (2, 70)).cuda()
+    attention_mask = torch.ones_like(input_ids).cuda()
+    output = model(input_ids, attention_mask=attention_mask)
+    print(output)
+    print(output.loss)
+    print(output.logits)
+    print(output.all_past_key_values)
+    print(output.hidden_states)
+    print(output.attentions)

nirvana_1_3B.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+    "attention_bias": false,
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "pad_token_id": 2,
+    "auto_map": {
+        "AutoConfig": "configuration_transformer_rnn.TransformerConfig_rnn",
+        "AutoModel": "modeling_transformer_rnn.TransformerModel_rnn",
+        "AutoModelForCausalLM": "modeling_transformer_rnn.TransformerForCausalLM_rnn"
+      },
+    "fuse_cross_entropy": true,
+    "fuse_norm": true,
+    "hidden_act": "swish",
+    "hidden_size": 2048,
+    "initializer_range": 6e-3,
+    "max_position_embeddings": 32768,
+    "rope_theta": 10000.0,
+    "model_type": "transformer_rnn",
+    "num_heads": 16,
+    "num_hidden_layers": 16,
+    "norm_eps": 1e-06,
+    "tie_word_embeddings": true,
+    "use_cache": false,
+    "vocab_size": 128512,
+    "concept_dim": 64,
+    "logit_dim": 32,
+    "window_size": 2048
+}

task_aware_delta_net.py ADDED Viewed

	@@ -0,0 +1,754 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Songlin Yang, Yu Zhang
+from __future__ import annotations
+import math
+from typing import TYPE_CHECKING, Dict, Optional, Tuple
+import torch
+import torch.nn as nn
+from einops import rearrange
+from torch.nn import functional as F
+from fla.modules import FusedRMSNormSwishGate, RMSNorm, ShortConvolution
+from fla.ops.gated_delta_rule import (chunk_gated_delta_rule,
+                                      fused_recurrent_gated_delta_rule)
+if TYPE_CHECKING:
+    from transformers.processing_utils import Unpack
+    from fla.models.utils import Cache
+def elu_p1(x):
+    return (F.elu(x, 1., False) + 1.).to(x)
+def sum_norm(x):
+    return (x / x.sum(-1, keepdim=True)).to(x)
+from fla.modules import RMSNorm, RotaryEmbedding
+if TYPE_CHECKING:
+    from fla.models.utils import Cache
+import warnings
+try:
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import (index_first_axis, pad_input,
+                                         unpad_input)
+except ImportError:
+    warnings.warn(
+        "Flash Attention is not installed. Please install it via `pip install flash-attn --no-build-isolation`",
+        category=ImportWarning
+    )
+    flash_attn_func = None
+# https://github.com/IDSIA/recurrent-fwp/blob/master/algorithmic/layers.py#L86C1-L146C1
+def lambda_init_fn(depth):
+    return 0.8 - 0.6 * math.exp(-0.3 * depth)
+# -*- coding: utf-8 -*-
+from typing import Optional, Tuple
+import torch
+from einops import rearrange
+from fla.ops.linear_attn.utils import normalize_output
+# def scattering_mixer(
+#     q: torch.Tensor,
+#     k: torch.Tensor,
+#     v: torch.Tensor,
+#     gamma: torch.Tensor,
+#     # chi: torch.Tensor,
+#     scale: Optional[float] = None,
+#     normalize: bool = False
+# ) -> Tuple[torch.Tensor, torch.Tensor]:
+#     if scale is None:
+#         scale = q.shape[-1] ** -0.5
+#     chunk_size = 64
+#     # split_size = 2
+#     q = rearrange(q, 'b (n c) h d -> b h n c d', c=chunk_size) * scale
+#     # k = rearrange(k, 'b h (n c) d -> b h n c d', c=chunk_size)
+#     # gamma (b , n*c, h) -> (b, h, n*c, 1)
+#     gamma = rearrange(gamma, 'b l h -> b h l').unsqueeze(-1)
+#     gamma_cumprod = torch.cumprod(gamma, dim=2)
+#     gamma_cumprod_chunk = rearrange(gamma_cumprod, 'b h (n c) d -> b h n c d', c=chunk_size)
+#     gamma_cumprod_chunk = gamma_cumprod_chunk[:, :, :, -1, :].unsqueeze(-2) # [b, h, n, 1, 1]
+#     gamma_cumprod = rearrange(gamma_cumprod, 'b h l d -> b l h d')
+#     k_cumprod = k / gamma_cumprod
+#     k = rearrange(k, 'b (n c) h d -> b h n c d', c=chunk_size)
+#     k_cumprod_chunk = rearrange(k_cumprod, 'b (n c) h d -> b h n c d', c=chunk_size)
+#     # gamma_cumprod_chunk = rearrange(gamma_cumprod, 'b h n c d -> b h (n c) d')
+#     v = rearrange(v, 'b (n c) h d -> b h n c d', c=chunk_size)
+#     gamma = rearrange(gamma, 'b h (n c) d -> b h n c d', c=chunk_size) # d = 1
+#     # gamma_cumprod_chunk_inter = torch.cumprod(gamma, dim=3)
+#     gamma_inter = torch.cumprod(gamma, dim=3) # [b, h, n, c, 1]
+#     kv = k_cumprod_chunk.transpose(-1, -2) @ v # [b, h, n, d, d]
+#     kv = kv.cumsum(2) # [b, h, n, d, d]   n << seq_len
+#     kv = kv * gamma_cumprod_chunk # [b, h, n, d, d]
+#     kv = torch.cat([torch.zeros_like(kv[:, :, :1]), kv[:, :, :-1]], dim=2) # [b, h, n, d, d]
+#     inter = (q @ kv) * gamma_inter # [b, h, n, c, d]
+#     intra = (
+#         ((q @ (k / gamma_inter).transpose(-1, -2)) ).masked_fill_(
+#         torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1),
+#         0
+#     )) @ v * gamma_inter  # [b, h, n, c, d]
+#     o = inter + intra # [b, h, n, c, d]
+#     if normalize:
+#         o = normalize_output(q * scale, k, o)
+#     return rearrange(o, 'b h n c d -> b (n c) h d') , None
+def scattering_mixer_recurrent(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    G0: torch.Tensor,
+    split_size: int,
+    past_kv: Optional[torch.Tensor] = None,
+    beta: Optional[torch.Tensor] = None,
+    # chi: torch.Tensor,
+    scale: Optional[float] = None,
+    normalize: bool = False,
+    order: int = 2,
+    perturb: Optional[torch.Tensor] = None
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if scale is None:
+        scale = q.shape[-1] ** -0.5
+    # chunk_size = 64
+    q = rearrange(q, 'b l h (f s) -> b h l s f', s=split_size) * scale
+    k = rearrange(k, 'b l h (f s) -> b h l s f', s=split_size)
+    v = rearrange(v, 'b l h (d s) -> b h l s d', s=split_size)
+    if order == 2:
+        G0 = rearrange(G0, 'b l h d f -> b h l d f')
+        # kv = k.transpose(-1, -2) @ v # [b, h, l, f, d]
+        second_term = torch.einsum('b h l s d, b h l d f -> b h l s f', v, G0) # [b, h, l, s, f]
+        G1 = second_term @ k.transpose(-1, -2) # [b, h, l, s, s]
+        kv2 = k.transpose(-1, -2) @ G1 + k.transpose(-1, -2) # [b, h, l, f ,s]
+    else:
+        kv2 = k.transpose(-1, -2) # [b, h, l, f ,s]
+    kv = kv2 @ v # [b, h, l, f, d]
+    # kv = kv + kv2
+    perturb = rearrange(perturb, 'b l h f k -> b h l f k') # [b, h, l, f, f]
+    M = q.transpose(-1, -2) @ q # [b, h, l, f, f]
+    M = perturb @ M # [b, h, l, f, f]
+    M = q @ M # [b, h, l, s, f]
+    q = q + M # [b, h, l, s, f]
+    if past_kv is None:
+        if beta is not None:
+            beta = rearrange(beta, 'b l h -> b h l')
+            beta_cumprod = torch.cumprod(beta, dim=2)
+            # print('the shape of beta_cumprod', beta_cumprod.shape)
+            beta_cumprod = torch.cat([torch.ones_like(beta_cumprod[:, :, :1]), beta_cumprod[:, :, :-1]], dim=2)
+            # kv = kv + kv2
+            beta_cumprod = rearrange(beta_cumprod, 'b h l -> b h l 1 1')
+            kv = kv / beta_cumprod # [b, h, l, f, d]
+            kv = kv.cumsum(2) # [b, h, l, f, d]
+            kv = kv * beta_cumprod # [b, h, l, f, d]
+        else:
+            kv = kv.cumsum(2) # [b, h, l, f, d]
+        o = q @ kv # [b, h, l, s, d]
+    else:
+        if beta is not None:
+            beta = rearrange(beta, 'b l h -> b h l')
+            kv = kv[:, :, -1, :, :] + past_kv * (beta[:, :, -2]).unsqueeze(-1).unsqueeze(-1)
+        else:
+            kv = kv[:, :, -1, :, :] + past_kv # [b, h, l, f, d]
+        o = q @ kv # [b, h, l, s, d]
+        # print('the shape of o', o.shape)
+    if normalize:
+        o = normalize_output(q * scale, k, o) # [b, h, l, s, d]
+    return rearrange(o, 'b h l s d -> b l h (s d)') , kv
+def safe_exp(x):
+    return torch.exp(x - torch.max(x,dim=-1,keepdim=True)[0])
+def random_proj(q, down_proj_matrix, up_proj_matrix, control_vec):
+    temp = q @ down_proj_matrix
+    temp = temp * control_vec
+    temp = temp @ up_proj_matrix
+    return torch.concat([torch.cos(temp), torch.sin(temp)], dim=-1)
+def lora_proj(x, down_proj_matrix, up_proj_matrix, control_vec):
+    temp = x @ down_proj_matrix
+    temp = temp * control_vec
+    temp = temp @ up_proj_matrix
+    return temp
+def gaussian_basis(x, basis_a, basis_c, basis_h):
+    # x.shape = [b, q_len, channel]
+    x = x.unsqueeze(-1) # [b, q_len, channel, 1]
+    # basis_a.shape = [b, q_len, 1, num_basis]
+    # basis_c.shape = [b, q_len, 1, num_basis]
+    # basis_h.shape = [b, q_len, 1, num_basis]
+    eps = 1e-6
+    temp = F.sigmoid(basis_a) * torch.exp(-(x - basis_c) ** 2 / (2 * basis_h ** 2 + eps)) # [b, q_len, channel, num_basis]
+    # temp = F.sigmoid(basis_a) * torch.exp(-(x - basis_c) ** 2 * (basis_h ** 2) ) # [b, q_len, channel, num_basis]
+    return temp.sum(dim=-1, keepdim=False) # [b, q_len, channel]
+def pad_time_cond(t, len):
+    t_sin = torch.cat([torch.sin(w * t) for w in range(1, len + 1)], dim=-1)
+    t_cos = torch.cat([torch.cos(w * t) for w in range(1, len + 1)], dim=-1)
+    t = torch.cat([t_sin, t_cos, t], dim=-1)
+    return t
+class condition_interpolation(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int = 2048,
+        concept_dim: int = 64,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.concept_dim = concept_dim
+        self.r = 8
+        # self.len = 15
+        self.lora = nn.Sequential(
+            nn.Linear(self.hidden_size * 2 + self.concept_dim * 2, self.hidden_size // self.r, bias=False),
+            nn.SiLU(),
+            nn.Linear(self.hidden_size // self.r, self.hidden_size, bias=False)
+        )
+        nn.init.xavier_uniform_(self.lora[0].weight)
+        nn.init.zeros_(self.lora[2].weight)
+    def forward(self, start, end, h_new):
+        # t = pad_time_cond(t, self.len)
+        x = torch.cat([start, end, h_new, h_new], dim=-1)
+        x = self.lora(x)
+        return x
+class Task_Aware_Delta_Net(nn.Module):
+    """
+    The layer implementation for [Gated Delta Networks: Improving Mamba2 with Delta Rule](https://arxiv.org/abs/2412.06464).  # noqa
+    Similar to Mamba2, each layer contains around 6*hidden_size*hidden_size parameters.
+    Parameter allocation when use_gate=True:
+        - 0.75 * hidden_size * hidden_size for the q_proj and k_proj each
+        - 1.5 * hidden_size * hidden_size for the v_proj, g_proj and o_proj each
+        - Others are ignorably small.
+        - In total = 0.75 * 2 + 1.5 * 3 = 6 * hidden_size * hidden_size
+    NOTE: num_heads * head_dim = 0.75 * hidden_size, please make sure to set the correct num_heads and head_dim.
+    Parameter allocation when use_gate=False:
+        - 1 * hidden_size * hidden_size for the q_proj and k_proj each
+        - 2 * hidden_size * hidden_size for the v_proj and o_proj each
+        - Others are ignorably small.
+        - In total = 1 * 2 + 2 * 2 = 6 * hidden_size * hidden_size
+    Args:
+        hidden_size (int, Optional):
+            The hidden size of the input. Default: 2048.
+        expand_v (float, Optional):
+            The expansion ratio for the value dim. Default: 1.
+        head_dim (int, Optional):
+            The dimension of each head. Default: 256.
+        num_heads (int, Optional):
+            The number of heads. Default: 6.
+        mode (str, Optional):
+            Which Gated DeltaNet kernel to use.
+            Currently available: `chunk` and `fused_recurrent`.
+            Default: `chunk`.
+        use_beta (bool, Optional):
+            Whether to use beta. Default: `True`.
+        use_gate (bool, Optional):
+            Whether to use output gate. Default: `True`.
+        use_short_conv (bool, Optional):
+            Whether to use short convolutions. Default: `True`.
+        conv_size (int, Optional):
+            The kernel size of the short convolution, only used when `use_short_conv` is `True`. Default: 4.
+        conv_bias (bool, Optional):
+            Whether to use bias in the short convolution, only used when `use_short_conv` is `True`. Default: `False`.
+        layer_idx (int, Optional):
+            The index of the layer. Default: None.
+        norm_eps (float, Optional):
+            The epsilon value for the normalization layer. Default: 1e-5.
+    """
+    def __init__(
+        self,
+        hidden_size: int = 2048,
+        expand_v: float = 1,
+        head_dim: int = 256,
+        num_heads: int = 6,
+        num_heads_delta: int = 6,
+        mode: str = 'chunk',
+        use_gate: bool = True,
+        use_short_conv: bool = True,
+        conv_size: int = 4,
+        conv_bias: bool = False,
+        layer_idx: int = None,
+        norm_eps: float = 1e-5,
+        rope_theta: float = 10000.,
+        max_position_embeddings: int = None,
+        window_size: int = None,
+        concept_dim: int = 128,
+        **kwargs: Unpack[Dict]
+    ) -> Task_Aware_Delta_Net:
+        super().__init__()
+        self.split_size = 64 # 64
+        self.mode = mode
+        self.hidden_size = hidden_size
+        self.expand_v = expand_v
+        self.use_gate = use_gate
+        self.use_short_conv = use_short_conv
+        # self.use_short_conv = False
+        self.conv_size = conv_size
+        self.conv_bias = conv_bias
+        self.head_dim = head_dim
+        self.strict_head = False
+        if self.strict_head:
+            head_dim_delta = int (0.75 * hidden_size / num_heads_delta)
+            head_dim = head_dim_delta
+            self.head_dim_delta = head_dim_delta
+            self.head_dim = head_dim_delta
+        self.num_heads = num_heads
+        self.key_dim = self.num_heads * self.head_dim
+        self.value_dim = self.key_dim * self.expand_v
+        self.head_qk_dim = head_dim
+        self.head_v_dim = head_dim * self.expand_v
+        self.layer_idx = layer_idx
+        self.silu = nn.SiLU()
+        assert mode in ['chunk', 'fused_recurrent'], f"Not suppoerted mode `{mode}`."
+        self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
+        self.k_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
+        self.v_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
+        self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
+        self.b_proj = nn.Linear(hidden_size, self.num_heads, bias=False)
+        self.a_proj = nn.Linear(hidden_size, self.num_heads, bias=False)
+        A = torch.empty(self.num_heads, dtype=torch.float32).uniform_(0, 16)
+        A_log = torch.log(A)
+        self.A_log = nn.Parameter(A_log)
+        self.A_log._no_weight_decay = True
+        # self.D = nn.Parameter(torch.ones(self.num_heads))
+        # self.D._no_weight_decay = True
+        # hard coded for now
+        dt_min = 0.001
+        dt_max = 0.1
+        dt_init_floor = 1e-4
+        dt = torch.exp(
+            torch.rand(self.num_heads) * (math.log(dt_max) - math.log(dt_min))
+            + math.log(dt_min)
+        )
+        dt = torch.clamp(dt, min=dt_init_floor)
+        # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+        inv_dt = dt + torch.log(-torch.expm1(-dt))
+        self.dt_bias = nn.Parameter(inv_dt)
+        # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
+        # name.endswith("bias") in param_grouping.py
+        self.dt_bias._no_weight_decay = True
+        if use_short_conv:
+            self.conv_size = conv_size
+            self.q_conv1d = ShortConvolution(
+                hidden_size=self.key_dim,
+                kernel_size=conv_size,
+                activation='silu'
+            )
+            self.k_conv1d = ShortConvolution(
+                hidden_size=self.key_dim,
+                kernel_size=conv_size,
+                activation='silu'
+            )
+            self.v_conv1d = ShortConvolution(
+                hidden_size=self.value_dim,
+                kernel_size=conv_size,
+                activation='silu'
+            )
+        else:
+            raise UserWarning(
+                "ShortConvolution is crucial to the performance. "
+                "Do not turn it off, i.e., setting `use_short_conv=False` unless you know what you are doing."
+            )
+        if use_gate:
+            self.g_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
+            self.o_norm = FusedRMSNormSwishGate(self.head_v_dim, eps=norm_eps)
+        else:
+            self.o_norm = RMSNorm(self.head_v_dim, eps=norm_eps)
+        self.num_prelude = 2
+        self.ttt = True
+        if self.ttt and self.layer_idx >= self.num_prelude: # use TTT as cross-layer concept learner
+            self.concept_dim = concept_dim  # hidden_size // 8
+            self.concept_proj = nn.Linear(hidden_size, self.concept_dim * 3, bias=False)
+            self.lr1_proj = nn.Linear(hidden_size, 1, bias=False)
+            self.lr2_proj = nn.Linear(hidden_size, 1, bias=False)
+            # self.router  = nn.Linear(hidden_size, self.num_heads * 2, bias=False) # , bias=False
+            self.router2 = nn.Linear(self.concept_dim, self.num_heads * 2, bias=False)
+            self.router3 = nn.Linear(self.concept_dim, 2, bias=False)
+            self.condition_interpolation = condition_interpolation(hidden_size, concept_dim)
+            self.t_proj = nn.Linear(concept_dim, 1, bias=False)
+            # self.num_basis = 2
+            # self.basis_proj = nn.Linear(self.concept_dim, self.num_basis * 3, bias=False)
+            self.special_mask = nn.Parameter(torch.zeros(self.hidden_size))
+            # self.special_mask_gated_delta = nn.Parameter(torch.zeros(self.hidden_size))
+            self.use_bias = True
+            if self.use_bias:
+                self.learnable_bias0 = nn.Parameter(torch.zeros(1))
+        self.apply(self._initialize_weights)
+        # Initialize LoRA matrices for q, k, v, and o projections using nn.Sequential
+        self.r = 4
+        self.q_lora = nn.Sequential(
+            nn.Linear(self.hidden_size, self.key_dim // self.r, bias=False),
+            nn.SiLU(),
+            nn.Linear(self.key_dim // self.r, self.key_dim, bias=False)
+        )
+        nn.init.xavier_uniform_(self.q_lora[0].weight)
+        nn.init.zeros_(self.q_lora[2].weight)
+        self.k_lora = nn.Sequential(
+            nn.Linear(self.hidden_size, self.key_dim // self.r, bias=False),
+            nn.SiLU(),
+            nn.Linear(self.key_dim // self.r, self.key_dim, bias=False)
+        )
+        nn.init.xavier_uniform_(self.k_lora[0].weight)
+        nn.init.zeros_(self.k_lora[2].weight)
+        self.v_lora = nn.Sequential(
+            nn.Linear(self.hidden_size, self.value_dim // self.r, bias=False),
+            nn.SiLU(),
+            nn.Linear(self.value_dim // self.r, self.value_dim, bias=False)
+        )
+        nn.init.xavier_uniform_(self.v_lora[0].weight)
+        nn.init.zeros_(self.v_lora[2].weight)
+        self.o_proj_attn = nn.Linear(self.value_dim, self.hidden_size, bias=False)
+        nn.init.xavier_uniform_(self.o_proj_attn.weight, gain=2 ** -2.5)
+        # self.o_proj_attention = nn.Linear(self.value_dim, self.hidden_size, bias=False)
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.rotary = RotaryEmbedding(dim=self.head_dim, base=self.rope_theta)
+        self.window_size = window_size
+    def _initialize_weights(self, module: nn.Module):
+        if getattr(module, "_is_hf_initialized", False):
+            return
+        if isinstance(module, nn.Linear):
+            nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        module._is_hf_initialized = True
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values1: Optional[Cache] = None,
+        all_past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        rnn_router: Optional[nn.Module] = None,
+        h_old: Optional[torch.Tensor] = None,
+        params: Optional[Dict] = None,
+        **kwargs: Unpack[Dict]
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache], Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """Run the attention branch and the gated-delta-rule branch, then mix them.
+
+        Both branches start from the same ``q_proj``/``k_proj``/``v_proj`` outputs.
+        The attention branch adds the LoRA projections, applies rotary embeddings
+        and calls flash-attention; the delta-rule branch uses the short-convolved
+        projections with ``chunk_gated_delta_rule`` or
+        ``fused_recurrent_gated_delta_rule``.
+
+        For layers with ``layer_idx < num_prelude`` the two branch outputs are
+        summed. For later layers they are scaled per head by a router mask,
+        interpolated with ``t``, and a learned correction from
+        ``condition_interpolation`` is added.
+
+        Args:
+            hidden_states: tensor of shape ``[batch_size, q_len, hidden_size]``.
+            attention_mask: optional 0-1 padding mask of shape ``[batch_size, seq_len]``.
+            past_key_values1: overwritten when ``all_past_key_values`` is given.
+            all_past_key_values: pair of caches (attention cache, recurrent cache).
+            use_cache: forwarded as ``output_final_state`` to the convolutions and kernels.
+            output_attentions: not read by this method.
+            rnn_router: object providing ``init_params_as_logits``, ``learn`` and ``predict``.
+            h_old: not read by this method.
+            params: router parameters, re-initialised in prelude layers and
+                updated by ``rnn_router.learn`` in later layers.
+            **kwargs: ``cu_seqlens`` and ``max_seqlen`` are read if present.
+
+        Returns:
+            The 6-tuple ``(o, None, None, all_past_key_values, h_new, params)``.
+            NOTE(review): the return annotation above lists only five elements.
+        """
+        # output: return o, None, past_key_values1, past_key_values2, h_new, params
+        if attention_mask is not None:
+            assert len(attention_mask.shape) == 2, (
+                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
+                "for padding purposes (0 indicating padding). "
+                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
+            )
+        # Sequences of at most 64 tokens always use the fused recurrent kernel.
+        mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
+        # # mode = self.mode
+        # mode = 'chunk'
+        if self.training:
+            assert mode == 'chunk', "Only chunk mode is supported in training."
+        last_state2 = None
+        if all_past_key_values is not None:
+            if all_past_key_values._seen_tokens > 0:
+                # Unpack into (attention cache, recurrent/conv cache).
+                past_key_values1, past_key_values2 = all_past_key_values
+            else:
+                from fla.models.utils import Cache
+                past_key_values1, past_key_values2 = Cache(), Cache()
+            if len(past_key_values2) > self.layer_idx:
+                last_state2 = past_key_values2[self.layer_idx]
+        batch_size, q_len, _ = hidden_states.size()
+        cu_seqlens = kwargs.get('cu_seqlens', None)
+        max_seqlen = kwargs.get('max_seqlen', q_len)
+        if self.ttt:
+            flag = True
+            if self.layer_idx < self.num_prelude:  # first 2 layers (0-1)
+                if flag == True:
+                    params = rnn_router.init_params_as_logits(batch_size, q_len)
+                    flag = False
+                # Prelude layers use an all-ones mask and produce no h_new.
+                mask = torch.ones(batch_size, q_len, self.num_heads, 2, device=hidden_states.device).to(hidden_states.dtype)
+                h_new = None
+                special_mask_attn = torch.zeros(batch_size, q_len, 1, device=hidden_states.device).to(hidden_states.dtype)
+            else:
+                concept_qkv = self.concept_proj(hidden_states)
+                concept_q, concept_k, concept_v = concept_qkv.chunk(3, dim=-1)
+                # Per-token learning rates, bounded to (0, 1e-2) by the sigmoid.
+                lr_linear = F.sigmoid(self.lr1_proj(hidden_states)) * 1e-2
+                lr_ln = F.sigmoid(self.lr2_proj(hidden_states)) * 1e-2
+                # lr_linear = 1e-2
+                # lr_ln = 1e-2
+                if rnn_router is not None:
+                    params = rnn_router.learn(concept_k, concept_v, params, lr_linear, lr_ln)
+                # NOTE(review): `predict` is called even when `rnn_router` is None.
+                h_new = rnn_router.predict(concept_q, params)
+                # t weights the attention branch, t_b = 1 - t the delta-rule branch.
+                t = F.sigmoid(self.t_proj(h_new))
+                t_b = 1 - t
+                input_router = self.router2(h_new)
+                # input_router = nn.Softmax(dim=-1)(input_router) # [batch_size, seq_len, head_dim, 2]
+                input_router = F.sigmoid(input_router) # [batch_size, seq_len, head_dim * 2]
+                special_mask = self.router3(h_new)
+                # Add a bias so that the first position is more likely to be selected
+                # (by increasing the logit of the first position).
+                bias = torch.zeros_like(special_mask)
+                bias[..., 0] = 2.0
+                if self.use_bias:
+                    bias[..., 0] = 2.0 + self.learnable_bias0  # add a positive bias to index 0 so that the first position is more likely to be chosen as 0
+                special_mask = F.gumbel_softmax(special_mask + bias, tau=0.1, hard=True)
+                special_mask_attn = special_mask[:, :, 1].unsqueeze(-1) # [batch_size, seq_len, 1]
+                mask = input_router
+                mask = mask.reshape(batch_size, q_len, self.num_heads, 2)
+        # if self.layer_idx >= self.num_prelude:
+        #     hidden_states = hidden_states + special_mask_gated_delta * self.special_mask_gated_delta.reshape(1, 1, -1)
+        if self.use_short_conv:
+            conv_state_q, conv_state_k, conv_state_v = None, None, None
+            if last_state2 is not None:
+                conv_state_q, conv_state_k, conv_state_v = last_state2['conv_state']
+            conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
+            # position_ids = kwargs.get('position_ids', None)
+            # The pre-convolution projections are kept for the attention branch below.
+            q_shared = self.q_proj(hidden_states)
+            k_shared = self.k_proj(hidden_states)
+            v_shared = self.v_proj(hidden_states)
+            q, conv_state_q = self.q_conv1d(x=q_shared,
+                                            mask=conv_mask,
+                                            cache=conv_state_q,
+                                            output_final_state=use_cache,
+                                            cu_seqlens = cu_seqlens
+                                            )
+            k, conv_state_k = self.k_conv1d(x=k_shared,
+                                            mask=conv_mask,
+                                            cache=conv_state_k,
+                                            output_final_state=use_cache,
+                                            cu_seqlens = cu_seqlens
+                                            )
+            v, conv_state_v = self.v_conv1d(x=v_shared,
+                                            mask=conv_mask,
+                                            cache=conv_state_v,
+                                            output_final_state=use_cache,
+                                            cu_seqlens = cu_seqlens
+                                            )
+        else:
+            # NOTE(review): this branch does not define q_shared/k_shared/v_shared,
+            # which are read unconditionally below.
+            q = self.silu(self.q_proj(hidden_states))
+            k = self.silu(self.k_proj(hidden_states))
+            v = self.silu(self.v_proj(hidden_states))
+        if self.layer_idx >= self.num_prelude:
+            # Tokens selected by the hard gumbel mask get the learned `special_mask` vector added.
+            hidden_states_attn = hidden_states + special_mask_attn * self.special_mask.reshape(1, 1, -1)
+        else:
+            hidden_states_attn = hidden_states
+        # Attention q/k/v = shared projection + LoRA correction.
+        q_attn = self.q_lora(hidden_states_attn) + q_shared
+        k_attn = self.k_lora(hidden_states_attn) + k_shared
+        v_attn = self.v_lora(hidden_states_attn) + v_shared
+        # q_attn = input_router[:, :, 1].unsqueeze(-1) * q_attn
+        q_attn, k_attn, v_attn = map(lambda x: rearrange(x, 'b t (h d) -> b t h d', h=self.num_heads), (q_attn, k_attn, v_attn))
+        # equivalent to cu_seqlens in `flash_attn`
+        seqlen_offset = 0
+        # seqlen_offset, max_seqlen = 0, q_len
+        if all_past_key_values is not None:
+            seqlen_offset = past_key_values1.get_seq_length(self.layer_idx)
+            max_seqlen = q_attn.shape[1] + seqlen_offset
+            if attention_mask is not None:
+                # to deliminate the offsets of padding tokens
+                seqlen_offset = (seqlen_offset + attention_mask.sum(-1) - attention_mask.shape[-1]).clamp(min=0)
+                max_seqlen = q_attn.shape[1] + max(seqlen_offset)
+        if self.max_position_embeddings is not None:
+            max_seqlen_rotary = max(max_seqlen, self.max_position_embeddings)
+        else:
+            max_seqlen_rotary = max_seqlen
+        q_attn, k_attn = self.rotary(q_attn, k_attn, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen_rotary, cu_seqlens=cu_seqlens)
+        if all_past_key_values is not None:
+            # Store the new keys/values (heads flattened) and read back the cached ones.
+            k_attn, v_attn = past_key_values1.update(
+                attn_state=(k_attn.flatten(-2, -1), v_attn.flatten(-2, -1)),
+                layer_idx=self.layer_idx,
+                offset=q_len,
+                cache_kwargs=dict(window_size=self.window_size)
+            )['attn_state']
+            k_attn = rearrange(k_attn, '... (h d) -> ... h d', h=self.num_heads)
+            v_attn = rearrange(v_attn, '... (h d) -> ... h d', h=self.num_heads)
+        if flash_attn_func is None:
+            raise ImportError("Please install Flash Attention via `pip install flash-attn --no-build-isolation` first")
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            q_attn, k_attn, v_attn, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(q_attn, k_attn, v_attn, attention_mask, q_len)
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_q, max_seqlen_k = max_seq_lens
+            o_attn = flash_attn_varlen_func(
+                q_attn, k_attn, v_attn,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_q,
+                max_seqlen_k=max_seqlen_k,
+                causal=True,
+                window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
+            )
+            o_attn = pad_input(o_attn, indices_q, batch_size, q_len)
+        elif cu_seqlens is not None:
+            o_attn = flash_attn_varlen_func(
+                q_attn.squeeze(0), k_attn.squeeze(0), v_attn.squeeze(0),
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_k=cu_seqlens,
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+                causal=True,
+                window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
+            ).unsqueeze(0)
+        else:
+            o_attn = flash_attn_func(
+                q_attn, k_attn, v_attn,
+                causal=True,
+                window_size=(-1, -1) if self.window_size is None else (self.window_size-1, 0)
+            ) # [total, num_heads, head_dim]   (total = batch_size * seq_len)
+        if batch_size > 1:
+            o_attn = o_attn.reshape(batch_size, q_len, self.num_heads, self.head_dim)
+        if self.layer_idx >= self.num_prelude:
+            # Channel 0 of the router mask scales the attention heads.
+            o_attn = torch.einsum("bnh,bnhd->bnhd", mask[:, :, :, 0], o_attn) # [batch_size, seq_len, num_heads, head_dim]
+        o_attn = o_attn.reshape(batch_size, q_len, self.value_dim)
+        # o_attn = self.o_proj_attention(o_attn)
+        o_attn = self.o_proj_attn(o_attn) # + self.o_proj(o_attn)
+        #################################################### end of attention ####################################################
+        k, v = map(lambda x: rearrange(x, 'b t (h d) -> b t h d', h=self.num_heads), (k, v))
+        beta = self.b_proj(hidden_states).sigmoid()
+        # Log-space gate: always non-positive because softplus and exp are non-negative.
+        g = -self.A_log.float().exp() * F.softplus(self.a_proj(hidden_states).float() + self.dt_bias)
+        # dealing with padding
+        if attention_mask is not None:
+            beta = beta.mul(attention_mask[:, -beta.shape[-2]:, None])
+            g = g.mul(attention_mask[:, -g.shape[-2]:, None])
+        recurrent_state = last_state2['recurrent_state'] if last_state2 is not None else None
+        # if self.layer_idx >= self.num_prelude:
+        #     # q_plus_feature = q.clone()
+        #     q_safe_exp = safe_exp(q)
+        #     q_plus_feature = q + q_safe_exp * if_feature_map
+        #     # q_random_feature = random_proj(q, self.down_proj_matrix, self.up_proj_matrix, control_vec)
+        #     # q_plus_feature = q_plus_feature + q_random_feature * if_feature_map2
+        #     q_lora = lora_proj(q, self.down_proj_matrix, self.up_proj_matrix, torch.ones_like(control_vec)) # F.sigmoid(control_vec))  # F.sigmoid(control_vec)
+        #     q_gaussian_feature = gaussian_basis(q_lora, basis_a, basis_c, basis_h)
+        #     q_plus_feature = q_plus_feature + q_gaussian_feature * if_feature_map3
+        #     q = q_plus_feature
+        q = rearrange(q, 'b t (h d) -> b t h d', h=self.num_heads)
+        if mode == 'chunk':
+            o, recurrent_state = chunk_gated_delta_rule(
+                q=q,
+                k=k,
+                v=v,
+                g=g,
+                beta=beta,
+                initial_state=recurrent_state,
+                output_final_state=use_cache,
+                cu_seqlens=cu_seqlens,
+                head_first=False,
+                use_qk_l2norm_in_kernel=True
+            )
+        elif mode == 'fused_recurrent':
+            o, recurrent_state = fused_recurrent_gated_delta_rule(
+                q=q,
+                k=k,
+                v=v,
+                g=g,
+                beta=beta,
+                initial_state=recurrent_state,
+                output_final_state=use_cache,
+                cu_seqlens=cu_seqlens,
+                # head_first=False,
+                use_qk_l2norm_in_kernel=True
+            )
+        if all_past_key_values is not None:
+            past_key_values2.update(
+                recurrent_state=recurrent_state,
+                conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
+                layer_idx=self.layer_idx,
+                offset=q.shape[1]
+            )
+        if self.use_gate:
+            g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', h=self.num_heads)
+            o = self.o_norm(o, g)
+        else:
+            o = self.o_norm(o)
+        if self.layer_idx >= self.num_prelude:
+            # Channel 1 of the router mask scales the delta-rule heads.
+            o = torch.einsum("bnh,bnhd->bnhd", mask[:, :, :, 1], o) # [batch_size, seq_len, num_heads, head_dim]
+        o_gated_delta = rearrange(o, 'b t h d -> b t (h d)')
+        o_gated_delta = self.o_proj(o_gated_delta)
+        #################################################### end of delta rule ####################################################
+        if self.layer_idx < self.num_prelude:
+            o = o_gated_delta + o_attn
+        else:
+            # Interpolate between the two branches; the learned correction is scaled
+            # by t * (1 - t), so it vanishes when t is 0 or 1.
+            o = t_b * o_gated_delta + t * o_attn
+            noise_std = t_b * t
+            noise = self.condition_interpolation(o_gated_delta, o_attn, h_new) * noise_std
+            o = o + noise
+        if all_past_key_values is not None:
+            all_past_key_values = (past_key_values1, past_key_values2)
+        return o, None, None, all_past_key_values, h_new, params
+    def _upad_input(self, q, k, v, attention_mask, q_len):
+        seqlens = attention_mask.sum(-1, dtype=torch.int32)
+        indices_k = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+        max_seqlen_k = seqlens.max().item()
+        cu_seqlens_k = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
+        batch_size, seq_len, num_key_value_heads, head_dim = k.shape
+        k = index_first_axis(k.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k)
+        v = index_first_axis(v.reshape(batch_size * seq_len, num_key_value_heads, head_dim), indices_k)
+        if q_len == seq_len:
+            q = index_first_axis(q.reshape(batch_size * seq_len, self.num_heads, head_dim), indices_k)
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_q = max_seqlen_k
+            indices_q = indices_k
+        elif q_len == 1:
+            max_seqlen_q = 1
+            # There is a memcpy here, that is very bad.
+            cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=q.device)
+            indices_q = cu_seqlens_q[:-1]
+            q = q.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -q_len:]
+            q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, attention_mask)
+        return q, k, v, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k)
+if __name__ == "__main__":
+    gated_delta_net_attention = Task_Aware_Delta_Net()
+    q = torch.randn(1, 10, 6, 256)
+    k = torch.randn(1, 10, 6, 256)
+    v = torch.randn(1, 10, 6, 256)
+    print(q.shape, k.shape, v.shape)
+    # 调用forward函数
+    o, _, _, _ = gated_delta_net_attention.forward(hidden_states=torch.randn(2, 70, 128))
+    print(o.shape)

ttt_cross_layer.py ADDED Viewed

	@@ -0,0 +1,340 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+def scan(f, init, xs, out, checkpoint_group=0):
+    """
+    模拟JAX中的lax.scan函数，用于序列化处理数据。
+    参数:
+        f: 处理函数，接收(carry, x)作为输入，返回(new_carry, y)
+        init: 初始状态值
+        xs: 输入序列，可以是字典或列表
+        out: 输出结果的存储张量
+        checkpoint_group: 梯度检查点分组数量，用于节省内存
+    返回:
+        carry: 最终的状态值
+        out: 填充好的输出张量
+    """
+    # 初始化状态值
+    carry = init
+    # 确定输入序列的长度
+    if isinstance(xs, dict):
+        # 如果输入是字典，取第一个键对应值的长度
+        num_items = len(next(iter(xs.values())))
+    else:
+        # 如果输入是列表，取第一个元素的长度
+        num_items = len(xs[0])
+    def scan_fn(carry, i_start, i_end):
+        """内部扫描函数，处理从i_start到i_end的元素"""
+        for i in range(i_start, i_end):
+            # 提取当前位置的输入
+            if isinstance(xs, dict):
+                # 字典情况：创建包含每个键在位置i处值的新字典
+                x = {key: tensor[i] for key, tensor in xs.items()}
+            else:
+                # 列表情况：创建包含每个列表在位置i处值的新列表
+                x = [x[i] for x in xs]
+            # 调用处理函数f，获取新的状态和输出
+            carry, y = f(carry, x)
+            # 将输出存储到结果张量中
+            out[i] = y
+        # 返回最终状态
+        return carry
+    # 根据checkpoint_group决定是否使用梯度检查点
+    if checkpoint_group > 0:
+        # 计算每个检查点组包含的元素数量
+        ckpt_every_n = num_items // checkpoint_group
+        # 按组处理数据
+        for k in range(0, num_items, ckpt_every_n):
+            # 使用torch.utils.checkpoint节省内存
+            carry = torch.utils.checkpoint.checkpoint(
+                scan_fn, carry, k, min(k + ckpt_every_n, num_items), use_reentrant=False
+            )
+    else:
+        # 不使用检查点，直接处理所有数据
+        carry = scan_fn(carry, 0, num_items)
+    # 返回最终状态和填充好的输出张量
+    return carry, out
+def ln_fwd(x, gamma, beta, eps=1e-6):
+    "Batch forward for LayerNorm."
+    # Mean and variance computation
+    mu = x.mean(dim=-1, keepdim=True)
+    var = x.var(dim=-1, keepdim=True, unbiased=False)
+    # Normalization
+    std = torch.sqrt(var + eps)
+    x_hat = (x - mu) / std
+    # Scale and shift
+    y = gamma * x_hat + beta
+    return y
+def ln_fused_l2_bwd(x, l2_target, gamma, beta, eps=1e-6):
+    """
+    层归一化(LayerNorm)与L2损失融合的反向传播函数。
+    这个函数执行两个操作：
+    1. 前向传播：对输入x进行层归一化，得到输出y
+    2. 反向传播：计算L2损失(y - l2_target)对输入x的梯度
+    参数:
+        x: 输入张量
+        l2_target: L2损失的目标值
+        gamma: 层归一化的缩放参数
+        beta: 层归一化的偏移参数
+        eps: 数值稳定性的小常数
+    返回:
+        z: 损失对输入x的梯度
+    """
+    D = x.shape[-1]  # 获取特征维度
+    # 计算均值和方差
+    mu = x.mean(dim=-1, keepdim=True)  # 沿特征维度计算均值
+    var = x.var(dim=-1, keepdim=True, unbiased=False)  # 计算方差
+    # 归一化处理
+    std = torch.sqrt(var + eps)  # 计算标准差
+    x_hat = (x - mu) / std  # 标准化输入
+    # 缩放和偏移
+    y = gamma * x_hat + beta  # 层归一化的输出
+    # 计算梯度
+    grad_output = y - l2_target  # L2损失的梯度
+    grad_x_hat = grad_output * gamma  # 对标准化输入的梯度
+    # 完整的反向传播公式，考虑了归一化操作的链式法则
+    z = (
+        (1.0 / D)
+        * (
+            D * grad_x_hat
+            - grad_x_hat.sum(dim=-1, keepdim=True)  # 均值项的梯度贡献
+            - x_hat * (grad_x_hat * x_hat).sum(dim=-1, keepdim=True)  # 方差项的梯度贡献
+        )
+        / std  # 除以标准差完成梯度计算
+    )
+    return z
+from torch.autograd import Function
+class MyLinearFunction(Function):
+    @staticmethod
+    def forward(ctx, input, weight, bias):
+        """
+        正向计算： y = x * W^T + b
+        参数：
+            ctx    ：上下文对象，用于保存反向传播时需要的信息。
+            input  ：输入 tensor, 尺寸为 (N, in_features)
+            weight ：权重 tensor, 尺寸为 (out_features, in_features)
+            bias   ：偏置 tensor, 尺寸为 (out_features)
+        返回：
+            输出 tensor, 尺寸为 (N, out_features)
+        """
+        # 保存必要的中间变量，供 backward 时使用
+        ctx.save_for_backward(input, weight, bias)
+        # 计算输出
+        output = input.matmul(weight.t()) + bias
+        return output
+    @staticmethod
+    def backward(ctx, grad_output):
+        """
+        反向传播：计算正向计算中各个输入的梯度。
+        参数：
+            grad_output：从上层传回来的梯度，形状与 forward 的输出相同 (N, out_features)
+        返回：
+            grad_input  ：关于 input 的梯度，形状 (N, in_features)
+            grad_weight ：关于 weight 的梯度，形状 (out_features, in_features)
+            grad_bias   ：关于 bias 的梯度，形状 (out_features)
+        """
+        # 从上下文中取出保存的变量
+        input, weight, bias = ctx.saved_tensors
+        # 链式法则：已知 output = input.matmul(weight.t()) + bias
+        # 关于 input 的梯度：
+        # ∂L/∂input = ∂L/∂output * ∂output/∂input = grad_output.matmul(weight)
+        grad_input = grad_output.matmul(weight)
+        # 关于 weight 的梯度：
+        # ∂L/∂weight = ∂L/∂output^T * ∂output/∂weight
+        # 注意到 output 对 weight 的导数为 input 的转置，此处：
+        # grad_weight 的计算通常为：grad_output^T.matmul(input)
+        grad_weight = grad_output.t().matmul(input)
+        # 关于 bias 的梯度：
+        # 因为 output = ... + bias，因此每个 bias 项对应所有样本的梯度和
+        grad_bias = grad_output.sum(dim=0)
+        # 注意：返回的梯度顺序必须与 forward 中参数的顺序一致
+        return grad_input, grad_weight, grad_bias
+class TTT_Cross_Layer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.input_size = config.concept_dim   # 128
+        self.concept_dim = config.concept_dim  # 128
+        # self.linear = nn.Linear(self.input_size, self.hidden_size)
+        # self.ln = nn.LayerNorm(self.hidden_size)
+        # self.logit_dim = 32
+        self.logit_dim = config.logit_dim
+        self.weight_linear = nn.Parameter(torch.empty(self.concept_dim, self.input_size, self.logit_dim))
+        self.weight_ln = nn.Parameter(torch.empty(self.concept_dim, self.logit_dim))
+        self.bias_linear = nn.Parameter(torch.empty(self.concept_dim, self.logit_dim))
+        self.bias_ln = nn.Parameter(torch.empty(self.concept_dim, self.logit_dim))
+        # self.weight_linear_tmp = torch.empty_like(self.weight_linear)
+        # self.weight_ln_tmp = torch.empty_like(self.weight_ln)
+        # self.bias_linear_tmp = torch.empty_like(self.bias_linear)
+        # self.bias_ln_tmp = torch.empty_like(self.bias_ln)
+        self.config = config
+        self.init_weights()
+    # def init_tmp_weights(self):
+    #     weight_linear_tmp = self.weight_linear.clone().to(self.weight_linear.device).to(self.weight_linear.dtype)
+    #     weight_ln_tmp = self.weight_ln.clone().to(self.weight_ln.device).to(self.weight_ln.dtype)
+    #     bias_linear_tmp = self.bias_linear.clone().to(self.bias_linear.device).to(self.bias_linear.dtype)
+    #     bias_ln_tmp = self.bias_ln.clone().to(self.bias_ln.device).to(self.bias_ln.dtype)
+    #     params = {
+    #         'weight_linear_tmp': weight_linear_tmp,
+    #         'weight_ln_tmp': weight_ln_tmp,
+    #         'bias_linear_tmp': bias_linear_tmp,
+    #         'bias_ln_tmp': bias_ln_tmp
+    #     }
+    #     return params
+    def init_params_as_logits(self, batch_size, sequence_length):
+        weight_linear_tmp = torch.ones(batch_size, sequence_length, self.logit_dim).to(self.weight_linear.device).to(self.weight_linear.dtype)
+        weight_ln_tmp = torch.ones(batch_size, sequence_length, self.logit_dim).to(self.weight_linear.device).to(self.weight_linear.dtype)
+        bias_linear_tmp = torch.ones(batch_size, sequence_length, self.logit_dim).to(self.weight_linear.device).to(self.weight_linear.dtype)
+        bias_ln_tmp = torch.ones(batch_size, sequence_length, self.logit_dim).to(self.weight_linear.device).to(self.weight_linear.dtype)
+        params = {
+            'weight_linear_tmp': weight_linear_tmp,
+            'weight_ln_tmp': weight_ln_tmp,
+            'bias_linear_tmp': bias_linear_tmp,
+            'bias_ln_tmp': bias_ln_tmp
+        }
+        return params
+    def init_weights(self):
+        # torch.manual_seed(42)  # 固定随机种子可能导致可预测性
+        nn.init.normal_(self.weight_linear, mean=0.0, std=self.config.initializer_range)
+        nn.init._no_grad_fill_(self.weight_ln, 1.0 / self.logit_dim)
+        # nn.init.zeros_(self.bias_linear)
+        # nn.init.zeros_(self.bias_ln)
+        nn.init.normal_(self.bias_linear, mean=0.0, std=self.config.initializer_range / self.logit_dim)
+        nn.init.normal_(self.bias_linear, mean=0.0, std=self.config.initializer_range / self.logit_dim)
+    def get_weight_per_token(self, params):
+        weight_linear_tmp = torch.einsum('iol,bsl->bsio', self.weight_linear, params['weight_linear_tmp'])
+        weight_ln_tmp = torch.einsum('ol,bsl->bso', self.weight_ln, params['weight_ln_tmp'])
+        bias_linear_tmp = torch.einsum('ol,bsl->bso', self.bias_linear, params['bias_linear_tmp'])
+        bias_ln_tmp = torch.einsum('ol,bsl->bso', self.bias_ln, params['bias_ln_tmp'])
+        return weight_linear_tmp, weight_ln_tmp, bias_linear_tmp, bias_ln_tmp
+    def learn(self, k, v, params, lr_linear=1, lr_ln=1):
+        # k和v形状: [batch_size, length, channel_dim]
+        # batch_size, seq_length, channel_dim = k.shape
+        # weight_linear_tmp = params['weight_linear_tmp']
+        # weight_ln_tmp = params['weight_ln_tmp']
+        # bias_linear_tmp = params['bias_linear_tmp']
+        # bias_ln_tmp = params['bias_ln_tmp']
+        weight_linear_tmp, weight_ln_tmp, bias_linear_tmp, bias_ln_tmp = self.get_weight_per_token(params)
+        # 1. 将输入重塑为二维以进行预测
+        # k_reshaped = k.reshape(-1, channel_dim)  # [batch_size*length, channel_dim]
+        # output_reshaped = self.predict(k_reshaped, params)  # [batch_size*length, channel_dim]
+        # z = F.linear(k_reshaped, params['weight_linear_tmp'], params['bias_linear_tmp'])
+        # mu = z.mean(dim=-1, keepdim=True)
+        # var = z.var(dim=-1, keepdim=True, unbiased=False)
+        z = torch.einsum('bsi,bsio->bso', k, weight_linear_tmp) + bias_linear_tmp
+        mu = z.mean(dim=-1, keepdim=True)
+        var = z.var(dim=-1, keepdim=True, unbiased=False)
+        # Normalization
+        eps = 1e-6
+        std = torch.sqrt(var + eps)
+        z_hat = (z - mu) / std
+        # output_reshaped = params['weight_ln_tmp'] * z_hat + params['bias_ln_tmp'] + k
+        output_reshaped = weight_ln_tmp * z_hat + bias_ln_tmp + k
+        # # 计算误差
+        # v_reshaped = v.reshape(-1, channel_dim)
+        # error_reshaped = output_reshaped - v_reshaped  # [batch_size*length, channel_dim]
+        error_reshaped = output_reshaped - v
+        # 计算层归一化梯度
+        # 层归一化参数更新
+        # ln_rate = learning_rate * 0.1  # 降低LN学习率
+        grad_weight_ln_temp = error_reshaped * z_hat
+        # grad_weight_ln = grad_weight_ln_temp.mean(dim=0) #
+        # weight_ln_tmp = weight_ln_tmp - ln_rate * grad_weight_ln # sequence length, channel_dim
+        grad_weight_ln = grad_weight_ln_temp
+        # batch_size, sequence length, logit_dim
+        params0 = params['weight_ln_tmp'] - lr_ln * torch.einsum('ol,bso->bsl', self.weight_ln, grad_weight_ln)
+        # bias_update = ln_rate * error_reshaped # .mean(dim=0)
+        # bias_ln_tmp = bias_ln_tmp - bias_update # batch_size, sequence length, concept_dim
+        grad_bias_ln = error_reshaped
+        params1 = params['bias_ln_tmp'] - lr_ln * torch.einsum('ol,bso->bsl', self.bias_ln, grad_bias_ln)
+        # 线性层权重梯度: [out_dim, in_dim]
+        # grad_linear_temp = error_reshaped - error_reshaped.mean(dim=-1, keepdim=True) - z_hat * grad_weight_ln_temp.mean(dim=-1, keepdim=True)
+        grad_linear = weight_ln_tmp * error_reshaped / std # batch_size, sequence length, concept_dim
+        # grad_weight_linear = grad_linear.t() @ k  # [channel_dim, channel_dim]
+        grad_weight_linear = torch.einsum('bsi,bso->bsio', k, grad_linear)
+        # 应用梯度 (避免使用原地操作 -=)
+        # weight_linear_tmp = weight_linear_tmp - learning_rate * grad_weight_linear.mean(dim=0)
+        params2 = params['weight_linear_tmp'] - lr_linear * torch.einsum('iol,bsio->bsl', self.weight_linear, grad_weight_linear)
+        # 更新偏置(如果存在) (避免使用原地操作 -=)
+        grad_b = grad_linear #.mean(dim=0)  # [channel_dim]
+        # bias_linear_tmp = bias_linear_tmp - learning_rate * grad_b
+        params3 = params['bias_linear_tmp'] - lr_linear * torch.einsum('ol,bso->bsl', self.bias_linear, grad_b)
+        params_new = {
+            'weight_linear_tmp': params2,
+            'weight_ln_tmp': params0,
+            'bias_linear_tmp': params3,
+            'bias_ln_tmp': params1
+        }
+        return params_new
+    def predict(self, q, params):
+        weight_linear_tmp, weight_ln_tmp, bias_linear_tmp, bias_ln_tmp = self.get_weight_per_token(params)
+        z = torch.einsum('bsi,bsio->bso', q, weight_linear_tmp) + bias_linear_tmp
+        mu = z.mean(dim=-1, keepdim=True)
+        var = z.var(dim=-1, keepdim=True, unbiased=False)
+        # Normalization
+        eps = 1e-6
+        std = torch.sqrt(var + eps)
+        z_hat = (z - mu) / std
+        # output_reshaped = params['weight_ln_tmp'] * z_hat + params['bias_ln_tmp'] + k
+        output = weight_ln_tmp * z_hat + bias_ln_tmp + q
+        return output