Update modeling_nemotron_h.py (#36)
Browse files — Update modeling_nemotron_h.py (commit 5a256b5fb7a1c2cbc0a04546c98bc6eb66cec8ee)
- modeling_nemotron_h.py: +4 lines, -0 lines
modeling_nemotron_h.py
CHANGED
|
@@ -1117,6 +1117,8 @@ class NemotronHPreTrainedModel(PreTrainedModel):
 1117     def _init_weights(self, module):
 1118         """Initialize the weights."""
 1119         if isinstance(module, NemotronHMamba2Mixer):
 1120             module.A_log._no_weight_decay = True
 1121             module.D._no_weight_decay = True
|
| 1122 |
|
|
@@ -1148,6 +1150,8 @@ class NemotronHPreTrainedModel(PreTrainedModel):
 1148             #
 1149             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
 1150             for name, p in module.named_parameters():
 1151                 if name in ["out_proj.weight"]:
 1152                     # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
 1153                     # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
|
|
|
 1117     def _init_weights(self, module):
 1118         """Initialize the weights."""
 1119         if isinstance(module, NemotronHMamba2Mixer):
 1120 +           if getattr(module.dt_bias, "_is_hf_initialized", False):
 1121 +               return
 1122             module.A_log._no_weight_decay = True
 1123             module.D._no_weight_decay = True
|
| 1124 |
|
|
|
|
 1150             #
 1151             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
 1152             for name, p in module.named_parameters():
 1153 +               if getattr(p, "_is_hf_initialized", False):
 1154 +                   continue
 1155                 if name in ["out_proj.weight"]:
 1156                     # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
 1157                     # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|