Update modeling_nemotron_h.py (#36)
Browse files — Update modeling_nemotron_h.py (commit 5a256b5fb7a1c2cbc0a04546c98bc6eb66cec8ee)
- modeling_nemotron_h.py: +4 lines, -0 lines
modeling_nemotron_h.py
CHANGED
|
@@ -1117,6 +1117,8 @@ class NemotronHPreTrainedModel(PreTrainedModel):
 1117     def _init_weights(self, module):
 1118         """Initialize the weights."""
 1119         if isinstance(module, NemotronHMamba2Mixer):
 1120             module.A_log._no_weight_decay = True
 1121             module.D._no_weight_decay = True
|
| 1122 |
|
|
@@ -1148,6 +1150,8 @@ class NemotronHPreTrainedModel(PreTrainedModel):
 1148             #
 1149             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
 1150             for name, p in module.named_parameters():
 1151                 if name in ["out_proj.weight"]:
 1152                     # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
 1153                     # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
|
|
|
 1117     def _init_weights(self, module):
 1118         """Initialize the weights."""
 1119         if isinstance(module, NemotronHMamba2Mixer):
 1120 +           if getattr(module.dt_bias, "_is_hf_initialized", False):
 1121 +               return
 1122             module.A_log._no_weight_decay = True
 1123             module.D._no_weight_decay = True
|
| 1124 |
|
|
|
|
 1150             #
 1151             # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
 1152             for name, p in module.named_parameters():
 1153 +               if getattr(p, "_is_hf_initialized", False):
 1154 +                   continue
 1155                 if name in ["out_proj.weight"]:
 1156                     # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
 1157                     # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|