Text Generation
Transformers
Safetensors
PyTorch
nemotron_h
nvidia
conversational
custom_code
dmax123 committed on
Commit
6533e8d
·
1 Parent(s): a4fb579

Update modeling_nemotron_h.py (#36)

Browse files

- Update modeling_nemotron_h.py (5a256b5fb7a1c2cbc0a04546c98bc6eb66cec8ee)

Files changed (1) hide show
  1. modeling_nemotron_h.py +4 -0
modeling_nemotron_h.py CHANGED
@@ -1117,6 +1117,8 @@ class NemotronHPreTrainedModel(PreTrainedModel):
1117
  def _init_weights(self, module):
1118
  """Initialize the weights."""
1119
  if isinstance(module, NemotronHMamba2Mixer):
 
 
1120
  module.A_log._no_weight_decay = True
1121
  module.D._no_weight_decay = True
1122
 
@@ -1148,6 +1150,8 @@ class NemotronHPreTrainedModel(PreTrainedModel):
1148
  #
1149
  # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
1150
  for name, p in module.named_parameters():
 
 
1151
  if name in ["out_proj.weight"]:
1152
  # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
1153
  # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
 
1117
  def _init_weights(self, module):
1118
  """Initialize the weights."""
1119
  if isinstance(module, NemotronHMamba2Mixer):
1120
+ if getattr(module.dt_bias, "_is_hf_initialized", False):
1121
+ return
1122
  module.A_log._no_weight_decay = True
1123
  module.D._no_weight_decay = True
1124
 
 
1150
  #
1151
  # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
1152
  for name, p in module.named_parameters():
1153
+ if getattr(p, "_is_hf_initialized", False):
1154
+ continue
1155
  if name in ["out_proj.weight"]:
1156
  # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
1157
  # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)