DiscoResearch/mixtral-7b-8expert

Text Generation

text-generation-inference

Model card | Files and versions

Update modeling_moe_mistral.py

#5

by bjoernp - opened Dec 10, 2023

base: refs/heads/main ← from: refs/pr/5

Discussion Files changed

Files changed (2) hide show

config.json +2 -2
modeling_moe_mistral.py +2 -3

config.json CHANGED Viewed

@@ -3,8 +3,8 @@
     "MixtralForCausalLM"
   ],
   "auto_map": {
-    "AutoConfig": "DiscoResearch/mixtral-7b-8expert--configuration_moe_mistral.MixtralConfig",
-    "AutoModelForCausalLM": "DiscoResearch/mixtral-7b-8expert--modeling_moe_mistral.MixtralForCausalLM"
   },
   "attention_dropout": 0.0,
   "bos_token_id": 1,

     "MixtralForCausalLM"
   ],
   "auto_map": {
+    "AutoConfig": "configuration_moe_mistral.MixtralConfig",
+    "AutoModelForCausalLM": "modeling_moe_mistral.MixtralForCausalLM"
   },
   "attention_dropout": 0.0,
   "bos_token_id": 1,

modeling_moe_mistral.py CHANGED Viewed

@@ -215,15 +215,14 @@ class MoE(nn.Module):
         orig_shape = x.shape
         x = x.view(-1, x.shape[-1])
-        scores = self.gate(x)
         expert_weights, expert_indices = torch.topk(scores, self.num_experts_per_token, dim=-1)
-        expert_weights = expert_weights.softmax(dim=-1)
         flat_expert_indices = expert_indices.view(-1)
         x = x.repeat_interleave(self.num_experts_per_token, dim=0)
         y = torch.empty_like(x)
         for i, expert in enumerate(self.experts):
-            y[flat_expert_indices == i] = expert(x[flat_expert_indices == i])
         y = (y.view(*expert_weights.shape, -1) * expert_weights.unsqueeze(-1)).sum(dim=1)
         return y.view(*orig_shape)

         orig_shape = x.shape
         x = x.view(-1, x.shape[-1])
+        scores = self.gate(x).softmax(dim=-1)
         expert_weights, expert_indices = torch.topk(scores, self.num_experts_per_token, dim=-1)
         flat_expert_indices = expert_indices.view(-1)
         x = x.repeat_interleave(self.num_experts_per_token, dim=0)
         y = torch.empty_like(x)
         for i, expert in enumerate(self.experts):
+            y[flat_expert_indices == i] = expert(y[flat_expert_indices == i])
         y = (y.view(*expert_weights.shape, -1) * expert_weights.unsqueeze(-1)).sum(dim=1)
         return y.view(*orig_shape)