Upload modeling_nemotron_h.py
Browse files- modeling_nemotron_h.py +4 -0
modeling_nemotron_h.py
CHANGED
|
@@ -850,6 +850,10 @@ class NemotronHMOE(nn.Module):
|
|
| 850 |
expert_output = expert(expert_input)
|
| 851 |
weighted_output = expert_output * expert_weights.unsqueeze(-1)
|
| 852 |
final_hidden_states.index_add_(0, token_indices, weighted_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 853 |
|
| 854 |
# in the original DeepSeek implementation, the output of the experts is gathered once we leave this module
|
| 855 |
# thus the MoE module is itself an IsolatedParallel module
|
|
|
|
| 850 |
expert_output = expert(expert_input)
|
| 851 |
weighted_output = expert_output * expert_weights.unsqueeze(-1)
|
| 852 |
final_hidden_states.index_add_(0, token_indices, weighted_output)
|
| 853 |
+
else:
|
| 854 |
+
# Local empty expert: no-op compute that still marks params as used.
|
| 855 |
+
dummy_out = expert(torch.zeros_like(hidden_states[0]).unsqueeze(0).to(final_hidden_states.dtype))
|
| 856 |
+
final_hidden_states = final_hidden_states + dummy_out
|
| 857 |
|
| 858 |
# in the original DeepSeek implementation, the output of the experts is gathered once we leave this module
|
| 859 |
# thus the MoE module is itself an IsolatedParallel module
|