suhara committed (verified) · Commit 0d1698d · Parent(s): f6aca92

Upload modeling_nemotron_h.py

Files changed (1): modeling_nemotron_h.py (+4, -0)
modeling_nemotron_h.py CHANGED
@@ -850,6 +850,10 @@ class NemotronHMOE(nn.Module):
                 expert_output = expert(expert_input)
                 weighted_output = expert_output * expert_weights.unsqueeze(-1)
                 final_hidden_states.index_add_(0, token_indices, weighted_output)
+            else:
+                # Local empty expert: no-op compute that still marks params as used.
+                dummy_out = expert(torch.zeros_like(hidden_states[0]).unsqueeze(0).to(final_hidden_states.dtype))
+                final_hidden_states = final_hidden_states + dummy_out
 
         # in original deepseek, the output of the experts are gathered once we leave this module
         # thus the moe module is itself an IsolatedParallel module
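
For context, below is a minimal, self-contained sketch of the pattern this diff adds, not the actual NemotronHMOE module: the ToyMoE class, its top-1 router, and the bias-free linear experts are illustrative assumptions, and only the index_add_ dispatch plus the dummy forward pass for empty experts mirror the change. Because the experts here are bias-free, expert(zeros) is exactly zero, so the dummy add leaves the output numerically unchanged while still placing the expert's parameters in the autograd graph.

import torch
import torch.nn as nn

class ToyMoE(nn.Module):
    """Illustrative top-1 MoE layer; names and shapes are assumptions."""

    def __init__(self, hidden_size: int = 8, num_experts: int = 4):
        super().__init__()
        self.router = nn.Linear(hidden_size, num_experts, bias=False)
        # Bias-free experts: expert(zeros) == 0, so the dummy add below
        # leaves the output numerically unchanged.
        self.experts = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size, bias=False)
            for _ in range(num_experts)
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: (num_tokens, hidden_size)
        routing_weights = self.router(hidden_states).softmax(dim=-1)
        expert_weights, selected = routing_weights.max(dim=-1)
        final_hidden_states = torch.zeros_like(hidden_states)
        for expert_idx, expert in enumerate(self.experts):
            token_indices = torch.where(selected == expert_idx)[0]
            if token_indices.numel() > 0:
                # Gather this expert's tokens, run them, and scatter-add the
                # weighted outputs back, as in the diff above.
                expert_output = expert(hidden_states[token_indices])
                weighted_output = expert_output * expert_weights[token_indices].unsqueeze(-1)
                final_hidden_states.index_add_(0, token_indices, weighted_output)
            else:
                # Empty expert: a dummy forward on one zero token keeps the
                # expert's parameters in the autograd graph (so, e.g., DDP does
                # not flag them as unused) while adding exact zeros.
                dummy_out = expert(torch.zeros_like(hidden_states[:1]))
                final_hidden_states = final_hidden_states + dummy_out
        return final_hidden_states

moe = ToyMoE()
out = moe(torch.randn(3, 8))   # 3 tokens, 4 experts: at least one expert is empty
print(out.shape)               # torch.Size([3, 8])

The alternative of simply skipping empty experts would require running torch.nn.parallel.DistributedDataParallel with find_unused_parameters=True, which adds graph-traversal overhead to every backward pass; the dummy forward avoids that at negligible cost.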