suhara committed (verified) · Commit 0d1698d · Parent(s): f6aca92

Upload modeling_nemotron_h.py

Files changed (1): modeling_nemotron_h.py (+4, -0)
modeling_nemotron_h.py CHANGED
@@ -850,6 +850,10 @@ class NemotronHMOE(nn.Module):
                 expert_output = expert(expert_input)
                 weighted_output = expert_output * expert_weights.unsqueeze(-1)
                 final_hidden_states.index_add_(0, token_indices, weighted_output)
+            else:
+                # Local empty expert: no-op compute that still marks params as used.
+                dummy_out = expert(torch.zeros_like(hidden_states[0]).unsqueeze(0).to(final_hidden_states.dtype))
+                final_hidden_states = final_hidden_states + dummy_out
 
         # in original deepseek, the output of the experts are gathered once we leave this module
         # thus the moe module is itself an IsolatedParallel module
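
For context, below is a minimal, self-contained sketch of the pattern this diff adds, not the actual NemotronHMOE module: the ToyMoE class, its top-1 router, and the bias-free linear experts are illustrative assumptions, and only the index_add_ dispatch plus the dummy forward pass for empty experts mirror the change. Because the experts here are bias-free, expert(zeros) is exactly zero, so the dummy add leaves the output numerically unchanged while still placing the expert's parameters in the autograd graph.

import torch
import torch.nn as nn

class ToyMoE(nn.Module):
    """Illustrative top-1 MoE layer; names and shapes are assumptions."""

    def __init__(self, hidden_size: int = 8, num_experts: int = 4):
        super().__init__()
        self.router = nn.Linear(hidden_size, num_experts, bias=False)
        # Bias-free experts: expert(zeros) == 0, so the dummy add below
        # leaves the output numerically unchanged.
        self.experts = nn.ModuleList(
            nn.Linear(hidden_size, hidden_size, bias=False)
            for _ in range(num_experts)
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: (num_tokens, hidden_size)
        routing_weights = self.router(hidden_states).softmax(dim=-1)
        expert_weights, selected = routing_weights.max(dim=-1)
        final_hidden_states = torch.zeros_like(hidden_states)
        for expert_idx, expert in enumerate(self.experts):
            token_indices = torch.where(selected == expert_idx)[0]
            if token_indices.numel() > 0:
                # Gather this expert's tokens, run them, and scatter-add the
                # weighted outputs back, as in the diff above.
                expert_output = expert(hidden_states[token_indices])
                weighted_output = expert_output * expert_weights[token_indices].unsqueeze(-1)
                final_hidden_states.index_add_(0, token_indices, weighted_output)
            else:
                # Empty expert: a dummy forward on one zero token keeps the
                # expert's parameters in the autograd graph (so, e.g., DDP does
                # not flag them as unused) while adding exact zeros.
                dummy_out = expert(torch.zeros_like(hidden_states[:1]))
                final_hidden_states = final_hidden_states + dummy_out
        return final_hidden_states

moe = ToyMoE()
out = moe(torch.randn(3, 8))   # 3 tokens, 4 experts: at least one expert is empty
print(out.shape)               # torch.Size([3, 8])

The alternative of simply skipping empty experts would require running torch.nn.parallel.DistributedDataParallel with find_unused_parameters=True, which adds graph-traversal overhead to every backward pass; the dummy forward avoids that at negligible cost.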