File size: 1,122 Bytes
9a7a74a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Remote code: configuration and modeling for NSA
from transformers import PretrainedConfig


class NSAConfig(PretrainedConfig):
    """Configuration object for models registered under ``model_type = "nsa"``.

    Each named constructor argument is stored unchanged as an instance
    attribute of the same name. Remaining keyword arguments are handed to
    ``PretrainedConfig.__init__``.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        hidden_size: Stored as ``self.hidden_size``.
        num_hidden_layers: Stored as ``self.num_hidden_layers``.
        num_attention_heads: Stored as ``self.num_attention_heads``.
        n_kv_groups: Stored as ``self.n_kv_groups``; also used as the
            ``"gqa_groups"`` entry of the default ``nsa`` dict.
        d_k: Stored as ``self.d_k``.
        d_v: Stored as ``self.d_v``.
        max_position_embeddings: Stored as ``self.max_position_embeddings``.
        rope_theta: Stored as ``self.rope_theta``.
        nsa: Optional dict of NSA-specific settings. Any falsy value
            (``None`` or an empty dict) is replaced by the built-in defaults.
            A non-empty dict is stored as-is: it is neither copied nor
            merged with the defaults.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "nsa"

    def __init__(
        self,
        vocab_size=50257,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        n_kv_groups=1,
        d_k=64,
        d_v=64,
        max_position_embeddings=2048,
        rope_theta=10000,
        nsa=None,
        **kwargs,
    ):
        # The base class consumes the generic keyword arguments before the
        # model-specific fields below are assigned.
        super().__init__(**kwargs)

        # Model-specific fields, stored exactly as received.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_kv_groups = n_kv_groups
        self.d_k = d_k
        self.d_v = d_v
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta

        # Fall back to the default NSA settings when none were supplied.
        # The check is on truthiness, so an empty dict also gets the defaults.
        if not nsa:
            nsa = {
                "branches": ["cmp", "sel", "win"],
                "window": 512,
                "gqa_groups": n_kv_groups,
                "block": 32,
                "stride": 16,
                "sel_block": 64,
                "sel_top_n": 16,
            }
        self.nsa = nsa