pstjohn committed on
Commit
0850cbd
·
verified ·
1 Parent(s): 84180f2

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. config.json +5 -2
  2. esm_nv.py +273 -107
  3. model.safetensors +2 -2
  4. tokenizer_config.json +0 -1
config.json CHANGED
@@ -1,5 +1,6 @@
1
  {
2
  "add_cross_attention": false,
 
3
  "architectures": [
4
  "NVEsmForMaskedLM"
5
  ],
@@ -26,6 +27,7 @@
26
  "is_decoder": false,
27
  "is_folding_model": false,
28
  "layer_norm_eps": 1e-05,
 
29
  "mask_token_id": 32,
30
  "max_position_embeddings": 1026,
31
  "max_seq_length": null,
@@ -34,13 +36,14 @@
34
  "num_attention_heads": 40,
35
  "num_hidden_layers": 36,
36
  "pad_token_id": 1,
37
- "padded_vocab_size": 64,
38
  "position_embedding_type": "rotary",
39
  "qkv_weight_interleaved": true,
40
  "tie_word_embeddings": true,
41
  "token_dropout": true,
42
- "transformers_version": "5.0.0",
43
  "use_cache": true,
 
44
  "vocab_list": null,
45
  "vocab_size": 33
46
  }
 
1
  {
2
  "add_cross_attention": false,
3
+ "add_pooling_layer": false,
4
  "architectures": [
5
  "NVEsmForMaskedLM"
6
  ],
 
27
  "is_decoder": false,
28
  "is_folding_model": false,
29
  "layer_norm_eps": 1e-05,
30
+ "layer_precision": null,
31
  "mask_token_id": 32,
32
  "max_position_embeddings": 1026,
33
  "max_seq_length": null,
 
36
  "num_attention_heads": 40,
37
  "num_hidden_layers": 36,
38
  "pad_token_id": 1,
39
+ "padded_vocab_size": 33,
40
  "position_embedding_type": "rotary",
41
  "qkv_weight_interleaved": true,
42
  "tie_word_embeddings": true,
43
  "token_dropout": true,
44
+ "transformers_version": "5.5.0",
45
  "use_cache": true,
46
+ "use_quantized_model_init": false,
47
  "vocab_list": null,
48
  "vocab_size": 33
49
  }
esm_nv.py CHANGED
@@ -22,11 +22,14 @@
22
  Adapted from `modeling_esm.py` in huggingface/transformers.
23
  """
24
 
25
- from typing import ClassVar, Literal, Optional, Unpack
 
 
26
 
27
  # TODO: put import guard around transformer_engine here, with an informative error message around
28
  # installation and the nvidia docker container.
29
  import torch
 
30
  import transformer_engine.pytorch
31
  from torch import nn
32
  from torch.nn import CrossEntropyLoss
@@ -70,6 +73,9 @@ class NVEsmConfig(EsmConfig):
70
  max_seq_length: Optional[int] = None,
71
  padded_vocab_size: Optional[int] = 64,
72
  attn_mask_type: str = "padding",
 
 
 
73
  **kwargs,
74
  ):
75
  """Initialize the NVEsmConfig with additional TE-related config options.
@@ -81,11 +87,10 @@ class NVEsmConfig(EsmConfig):
81
  `v` weights for each attention head are interleaved. This parameter is set to `False`
82
  when using :attr:`fuse_qkv_params=False`.
83
  encoder_activation: The activation function to use in the encoder.
84
- attn_input_format: The input format to use for the attention. This controls
85
- whether the dimensions of the intermediate hidden states is 'batch first'
86
- ('bshd') or 'sequence first' ('sbhd'). `s` stands for the sequence length,
87
- `b` batch size, `h` the number of heads, `d` head size. Note that these
88
- formats are very closely related to the `qkv_format` in the
89
  `MultiHeadAttention` and `DotProductAttention` modules.
90
  fuse_qkv_params: Whether to fuse the qkv parameters. If set to `True`,
91
  `TransformerLayer` module exposes a single fused parameter for query-key-value.
@@ -100,6 +105,13 @@ class NVEsmConfig(EsmConfig):
100
  padded_vocab_size: The padded vocabulary size to support FP8. If not provided, defaults
101
  to vocab_size. Must be greater than or equal to vocab_size.
102
  attn_mask_type: The type of attention mask to use.
 
 
 
 
 
 
 
103
  **kwargs: Additional config options to pass to EsmConfig.
104
  """
105
  super().__init__(**kwargs)
@@ -111,9 +123,12 @@ class NVEsmConfig(EsmConfig):
111
  self.micro_batch_size = micro_batch_size
112
  self.max_seq_length = max_seq_length
113
  self.attn_mask_type = attn_mask_type
 
 
 
114
 
115
  # Set padded_vocab_size with default fallback to vocab_size
116
- self.padded_vocab_size = padded_vocab_size if padded_vocab_size is not None else self.vocab_size
117
 
118
  # Ensure padded_vocab_size is at least as large as vocab_size
119
  if self.padded_vocab_size is not None and self.vocab_size is not None:
@@ -121,50 +136,84 @@ class NVEsmConfig(EsmConfig):
121
  f"padded_vocab_size ({self.padded_vocab_size}) must be greater than or equal to vocab_size ({self.vocab_size})"
122
  )
123
 
 
 
 
 
 
 
 
124
 
125
  class NVEsmEncoder(nn.Module):
126
  """NVEsmEncoder is a TransformerEngine-optimized ESM encoder."""
127
 
128
- def __init__(self, config: NVEsmConfig):
 
 
 
 
 
129
  """Initialize a NVEsmEncoder.
130
 
131
  Args:
132
  config (NVEsmConfig): The configuration of the model.
 
 
133
  """
134
  super().__init__()
135
  self.config = config
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  def _init_method(x):
138
  torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range)
139
 
140
- self.layers = nn.ModuleList(
141
- [
142
- transformer_engine.pytorch.TransformerLayer(
143
- hidden_size=config.hidden_size,
144
- ffn_hidden_size=config.intermediate_size,
145
- num_attention_heads=config.num_attention_heads,
146
- layernorm_epsilon=config.layer_norm_eps,
147
- hidden_dropout=config.hidden_dropout_prob,
148
- attention_dropout=config.attention_probs_dropout_prob,
149
- qkv_weight_interleaved=config.qkv_weight_interleaved,
150
- layer_number=i + 1,
151
- layer_type="encoder",
152
- self_attn_mask_type=config.attn_mask_type,
153
- activation=config.encoder_activation,
154
- attn_input_format=config.attn_input_format,
155
- seq_length=config.max_seq_length,
156
- micro_batch_size=config.micro_batch_size,
157
- num_gqa_groups=config.num_attention_heads,
158
- fuse_qkv_params=config.fuse_qkv_params,
159
- params_dtype=config.dtype,
160
- window_size=(-1, -1),
161
- device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
162
- init_method=_init_method,
163
- output_layer_init_method=_init_method,
164
- )
165
- for i in range(config.num_hidden_layers)
166
- ]
167
- )
 
 
 
168
  self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(
169
  config.hidden_size,
170
  eps=config.layer_norm_eps,
@@ -198,23 +247,27 @@ class NVEsmEncoder(nn.Module):
198
  with torch.autocast(device_type="cuda", enabled=False):
199
  te_rope_emb = self.rotary_embeddings(max_seq_len=self.config.max_position_embeddings)
200
  te_rope_emb = te_rope_emb.to(hidden_states.device, non_blocking=True)
201
-
202
- for layer_module in self.layers:
203
- if kwargs.get("output_hidden_states", False):
204
- all_hidden_states = (*all_hidden_states, hidden_states)
205
-
206
- hidden_states = layer_module(
207
- hidden_states,
208
- attention_mask,
209
- rotary_pos_emb=te_rope_emb,
210
- cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
211
- cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
212
- cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
213
- cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
214
- max_seqlen_q=kwargs.get("max_length_q", None),
215
- max_seqlen_kv=kwargs.get("max_length_k", None),
216
- pad_between_seqs=kwargs.get("pad_between_seqs", None),
217
- )
 
 
 
 
218
 
219
  hidden_states = self.emb_layer_norm_after(hidden_states)
220
 
@@ -223,15 +276,60 @@ class NVEsmEncoder(nn.Module):
223
 
224
  return BaseModelOutput(
225
  last_hidden_state=hidden_states,
226
- hidden_states=all_hidden_states if all_hidden_states else None,
227
  )
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  class NVEsmPreTrainedModel(EsmPreTrainedModel):
231
  """An abstract class to handle weights initialization and pretrained model loading."""
232
 
233
  config_class = NVEsmConfig
234
- base_model_prefix = "esm"
235
  supports_gradient_checkpointing = False
236
  accepts_loss_kwargs = False
237
  _no_split_modules = (
@@ -247,11 +345,11 @@ class NVEsmPreTrainedModel(EsmPreTrainedModel):
247
  if hasattr(module, "reset_parameters"):
248
  module.reset_parameters()
249
 
250
- # The esm.embeddings layer is the only non-TE layer in this model we need to deal with. We use
251
  # `model._init_weights` rather than `reset_parameters` to ensure we honor the original config standard
252
- # deviation.
253
- self.esm.embeddings.word_embeddings.to_empty(device="cuda")
254
- self.esm.embeddings.apply(self._init_weights)
255
 
256
  # Meta-device init seems to break weight tying, so we re-tie the weights here.
257
  self.tie_weights()
@@ -276,14 +374,16 @@ class NVEsmPreTrainedModel(EsmPreTrainedModel):
276
  super()._init_weights(module)
277
 
278
  def state_dict(self, *args, **kwargs):
279
- """Override state_dict to filter out TransformerEngine's _extra_state keys.
280
 
281
- TransformerEngine layers add _extra_state attributes that are not compatible with HuggingFace v5 model loading.
282
- These are filtered out to ensure checkpoints can be loaded with from_pretrained().
 
 
 
283
  """
284
  state_dict = super().state_dict(*args, **kwargs)
285
- # Filter out _extra_state keys which are TransformerEngine-specific and not loadable
286
- return {k: v for k, v in state_dict.items() if not k.endswith("_extra_state")}
287
 
288
 
289
  class NVEsmModel(NVEsmPreTrainedModel):
@@ -292,21 +392,33 @@ class NVEsmModel(NVEsmPreTrainedModel):
292
  This model uses NVDIA's TransformerEngine to optimize attention layer training and inference.
293
  """
294
 
295
- def __init__(self, config: NVEsmConfig, add_pooling_layer: bool = True):
 
 
 
 
 
 
296
  """Initialize a NVEsmModel.
297
 
298
  Args:
299
  config (NVEsmConfig): The configuration of the model.
300
- add_pooling_layer (bool): Whether to add a pooling layer.
 
 
 
301
  """
302
  super().__init__(config)
303
  self.config = config
304
 
 
 
 
305
  # Ensure pad_token_id is set properly, defaulting to 0 if not specified
306
  if not hasattr(config, "pad_token_id") or config.pad_token_id is None:
307
  config.pad_token_id = 0
308
  self.embeddings = NVEsmEmbeddings(config)
309
- self.encoder = NVEsmEncoder(config)
310
  self.pooler = EsmPooler(config) if add_pooling_layer else None
311
 
312
  # Initialize weights and apply final processing
@@ -375,7 +487,7 @@ class NVEsmModel(NVEsmPreTrainedModel):
375
  )
376
  encoder_outputs = self.encoder(
377
  embedding_output,
378
- attention_mask=extended_attention_mask,
379
  **kwargs,
380
  )
381
  sequence_output = encoder_outputs[0]
@@ -391,13 +503,23 @@ class NVEsmModel(NVEsmPreTrainedModel):
391
  class NVEsmForMaskedLM(NVEsmPreTrainedModel):
392
  """NVEsmForMaskedLM is a TransformerEngine-optimized ESM model for masked language modeling."""
393
 
394
- _tied_weights_keys: ClassVar[dict[str, str]] = {"lm_head.decoder.weight": "esm.embeddings.word_embeddings.weight"}
 
 
 
395
 
396
- def __init__(self, config: NVEsmConfig):
 
 
 
 
 
397
  """Initialize a NVEsmForMaskedLM.
398
 
399
  Args:
400
  config (NVEsmConfig): The configuration of the model.
 
 
401
  """
402
  super().__init__(config)
403
 
@@ -407,7 +529,7 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
407
  "bi-directional self-attention."
408
  )
409
 
410
- self.esm = NVEsmModel(config, add_pooling_layer=False)
411
  self.lm_head = NVEsmLMHead(config)
412
 
413
  self.post_init()
@@ -442,7 +564,7 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
442
  Returns:
443
  MaskedLMOutput: The output of the model.
444
  """
445
- outputs = self.esm(
446
  input_ids,
447
  attention_mask=attention_mask,
448
  position_ids=position_ids,
@@ -450,7 +572,8 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
450
  **kwargs,
451
  )
452
  sequence_output = outputs[0]
453
- prediction_scores = self.lm_head(sequence_output)
 
454
 
455
  # Truncate logits back to original vocab_size if padding was used
456
  if self.config.padded_vocab_size != self.config.vocab_size:
@@ -481,18 +604,18 @@ class NVEsmLMHead(nn.Module):
481
  config (NVEsmConfig): The configuration of the model.
482
  """
483
  super().__init__()
484
- self.dense = transformer_engine.pytorch.Linear(
485
- config.hidden_size,
486
- config.hidden_size,
487
- params_dtype=config.dtype,
488
- device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
489
- init_method=lambda x: torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range),
490
- )
 
491
 
492
- with transformer_engine.pytorch.fp8_model_init(enabled=False):
493
  self.decoder = transformer_engine.pytorch.LayerNormLinear(
494
  config.hidden_size,
495
- config.padded_vocab_size if config.padded_vocab_size is not None else config.vocab_size,
496
  bias=True,
497
  eps=config.layer_norm_eps,
498
  params_dtype=config.dtype,
@@ -509,7 +632,7 @@ class NVEsmLMHead(nn.Module):
509
  """
510
  # Keep the last layers of the network in higher precision to avoid numerical instability.
511
  # Please see recipes/fp8_analysis/README.md for more details.
512
- with transformer_engine.pytorch.fp8_autocast(enabled=False):
513
  x = self.dense(features)
514
  x = torch.nn.functional.gelu(x)
515
  x = self.decoder(x)
@@ -550,6 +673,55 @@ class NVEsmEmbeddings(nn.Module):
550
  self.token_dropout = config.token_dropout
551
  self.mask_token_id = config.mask_token_id
552
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
  def forward(
554
  self,
555
  input_ids=None,
@@ -585,27 +757,10 @@ class NVEsmEmbeddings(nn.Module):
585
  # actually dropping out values (or, equivalently, scale up their un-dropped outputs in training).
586
  if self.token_dropout and input_ids is not None:
587
  embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0)
588
- mask_ratio_train = 0.15 * 0.8 # Hardcoded as the ratio used in all ESM model training runs
589
-
590
- if not using_thd:
591
- # BSHD token dropout correction
592
- src_lengths = attention_mask.sum(-1) if attention_mask is not None else input_ids.shape[1]
593
- n_masked_per_seq = (input_ids == self.mask_token_id).sum(-1).float()
594
- mask_ratio_observed = n_masked_per_seq / src_lengths
595
- scale_factor = (1 - mask_ratio_train) / (1 - mask_ratio_observed)
596
- embeddings = (embeddings * scale_factor[:, None, None]).to(embeddings.dtype)
597
-
598
  else:
599
- src_lengths = torch.diff(kwargs["cu_seq_lens_q"])
600
- # We need to find the number of masked tokens in each sequence in the padded batch.
601
- is_masked = (input_ids == self.mask_token_id).squeeze(0)
602
- n_masked_per_seq = torch.nested.nested_tensor_from_jagged(
603
- is_masked, offsets=kwargs["cu_seq_lens_q"]
604
- ).sum(1)
605
- mask_ratio_observed = n_masked_per_seq.float() / src_lengths
606
- scale_factor = (1 - mask_ratio_train) / (1 - mask_ratio_observed)
607
- reshaped_scale_factor = torch.repeat_interleave(scale_factor, src_lengths, dim=0)
608
- embeddings = (embeddings * reshaped_scale_factor.unsqueeze(-1)).to(embeddings.dtype)
609
 
610
  if self.layer_norm is not None:
611
  embeddings = self.layer_norm(embeddings)
@@ -622,12 +777,23 @@ class NVEsmForTokenClassification(NVEsmPreTrainedModel):
622
  Adapted from EsmForTokenClassification in Hugging Face Transformers `modeling_esm.py`.
623
  """
624
 
625
- def __init__(self, config):
626
- """Initialize NVEsmForTokenClassification."""
 
 
 
 
 
 
 
 
 
 
 
627
  super().__init__(config)
628
  self.num_labels = config.num_labels
629
 
630
- self.esm = NVEsmModel(config, add_pooling_layer=False)
631
  self.dropout = nn.Dropout(config.hidden_dropout_prob)
632
  self.classifier = transformer_engine.pytorch.Linear(
633
  config.hidden_size,
@@ -653,7 +819,7 @@ class NVEsmForTokenClassification(NVEsmPreTrainedModel):
653
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
654
  Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
655
  """
656
- outputs = self.esm(
657
  input_ids,
658
  attention_mask=attention_mask,
659
  position_ids=position_ids,
 
22
  Adapted from `modeling_esm.py` in huggingface/transformers.
23
  """
24
 
25
+ import warnings
26
+ from contextlib import nullcontext
27
+ from typing import ClassVar, ContextManager, Literal, Optional, Unpack
28
 
29
  # TODO: put import guard around transformer_engine here, with an informative error message around
30
  # installation and the nvidia docker container.
31
  import torch
32
+ import transformer_engine.common.recipe
33
  import transformer_engine.pytorch
34
  from torch import nn
35
  from torch.nn import CrossEntropyLoss
 
73
  max_seq_length: Optional[int] = None,
74
  padded_vocab_size: Optional[int] = 64,
75
  attn_mask_type: str = "padding",
76
+ add_pooling_layer: bool = False,
77
+ layer_precision: list[str | None] | None = None,
78
+ use_quantized_model_init: bool = False,
79
  **kwargs,
80
  ):
81
  """Initialize the NVEsmConfig with additional TE-related config options.
 
87
  `v` weights for each attention head are interleaved. This parameter is set to `False`
88
  when using :attr:`fuse_qkv_params=False`.
89
  encoder_activation: The activation function to use in the encoder.
90
+ attn_input_format: The input format to use for the attention:
91
+ "bshd" = Batch, Sequence, Head, Dimension (standard padded format)
92
+ "thd" = Total tokens (packed/unpadded), Head, Dimension (sequence packing format)
93
+ Note that these formats are very closely related to the `qkv_format` in the
 
94
  `MultiHeadAttention` and `DotProductAttention` modules.
95
  fuse_qkv_params: Whether to fuse the qkv parameters. If set to `True`,
96
  `TransformerLayer` module exposes a single fused parameter for query-key-value.
 
105
  padded_vocab_size: The padded vocabulary size to support FP8. If not provided, defaults
106
  to vocab_size. Must be greater than or equal to vocab_size.
107
  attn_mask_type: The type of attention mask to use.
108
+ add_pooling_layer: Whether the base model should include a pooling layer.
109
+ Defaults to ``False`` because exported checkpoints do not contain pooler
110
+ weights. Set to ``True`` only if you have a checkpoint with pooler weights.
111
+ layer_precision: Per-layer quantization precision, a list of length ``num_hidden_layers``
112
+ where each element is ``"fp8"``, ``"fp4"``, or ``None`` (BF16 fallback). ``None``
113
+ (the default) means no quantization is configured.
114
+ use_quantized_model_init: Whether to use `quantized_model_init` for layer initialization.
115
  **kwargs: Additional config options to pass to EsmConfig.
116
  """
117
  super().__init__(**kwargs)
 
123
  self.micro_batch_size = micro_batch_size
124
  self.max_seq_length = max_seq_length
125
  self.attn_mask_type = attn_mask_type
126
+ self.add_pooling_layer = add_pooling_layer
127
+ self.layer_precision = layer_precision
128
+ self.use_quantized_model_init = use_quantized_model_init
129
 
130
  # Set padded_vocab_size with default fallback to vocab_size
131
+ self.padded_vocab_size = padded_vocab_size or self.vocab_size
132
 
133
  # Ensure padded_vocab_size is at least as large as vocab_size
134
  if self.padded_vocab_size is not None and self.vocab_size is not None:
 
136
  f"padded_vocab_size ({self.padded_vocab_size}) must be greater than or equal to vocab_size ({self.vocab_size})"
137
  )
138
 
139
+ if layer_precision is not None:
140
+ if len(layer_precision) != self.num_hidden_layers:
141
+ raise ValueError(f"layer_precision must be a list of length {self.num_hidden_layers}")
142
+ for precision in layer_precision:
143
+ if precision not in {"fp8", "fp4", None}:
144
+ raise ValueError(f'layer_precision element must be "fp8", "fp4", or None, got {precision!r}')
145
+
146
 
147
  class NVEsmEncoder(nn.Module):
148
  """NVEsmEncoder is a TransformerEngine-optimized ESM encoder."""
149
 
150
+ def __init__(
151
+ self,
152
+ config: NVEsmConfig,
153
+ fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
154
+ fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
155
+ ):
156
  """Initialize a NVEsmEncoder.
157
 
158
  Args:
159
  config (NVEsmConfig): The configuration of the model.
160
+ fp8_recipe: The FP8 recipe for the encoder.
161
+ fp4_recipe: The FP4 recipe for the encoder.
162
  """
163
  super().__init__()
164
  self.config = config
165
+ self._fp8_recipe: transformer_engine.common.recipe.Recipe | None = fp8_recipe
166
+ self._fp4_recipe: transformer_engine.common.recipe.Recipe | None = fp4_recipe
167
+
168
+ if self.config.layer_precision is None:
169
+ if fp8_recipe is not None and fp4_recipe is not None:
170
+ raise RuntimeError("Both FP8 and FP4 recipes provided, but no layer precision provided.")
171
+ if fp8_recipe is not None:
172
+ warnings.warn("No layer precision provided, using FP8 recipe for all layers.", UserWarning)
173
+ self.config.layer_precision = ["fp8"] * self.config.num_hidden_layers
174
+ elif fp4_recipe is not None:
175
+ raise RuntimeError(
176
+ "FP4 recipe provided but no layer_precision configured. "
177
+ "Set layer_precision explicitly when using FP4."
178
+ )
179
+
180
+ if self.config.layer_precision is not None and "fp4" in self.config.layer_precision and fp4_recipe is None:
181
+ raise RuntimeError("layer_precision contains 'fp4' entries but no fp4_recipe was provided.")
182
 
183
  def _init_method(x):
184
  torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range)
185
 
186
+ layers: list[transformer_engine.pytorch.TransformerLayer] = []
187
+ for i in range(config.num_hidden_layers):
188
+ with self.get_autocast_context(i, init=True):
189
+ layers += [
190
+ transformer_engine.pytorch.TransformerLayer(
191
+ hidden_size=config.hidden_size,
192
+ ffn_hidden_size=config.intermediate_size,
193
+ num_attention_heads=config.num_attention_heads,
194
+ layernorm_epsilon=config.layer_norm_eps,
195
+ hidden_dropout=config.hidden_dropout_prob,
196
+ attention_dropout=config.attention_probs_dropout_prob,
197
+ qkv_weight_interleaved=config.qkv_weight_interleaved,
198
+ layer_number=i + 1,
199
+ layer_type="encoder",
200
+ self_attn_mask_type=config.attn_mask_type,
201
+ activation=config.encoder_activation,
202
+ attn_input_format=config.attn_input_format,
203
+ seq_length=config.max_seq_length,
204
+ micro_batch_size=config.micro_batch_size,
205
+ num_gqa_groups=config.num_attention_heads,
206
+ fuse_qkv_params=config.fuse_qkv_params,
207
+ params_dtype=config.dtype,
208
+ window_size=(-1, -1),
209
+ device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
210
+ init_method=_init_method,
211
+ output_layer_init_method=_init_method,
212
+ )
213
+ ]
214
+
215
+ self.layers = nn.ModuleList(layers)
216
+
217
  self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(
218
  config.hidden_size,
219
  eps=config.layer_norm_eps,
 
247
  with torch.autocast(device_type="cuda", enabled=False):
248
  te_rope_emb = self.rotary_embeddings(max_seq_len=self.config.max_position_embeddings)
249
  te_rope_emb = te_rope_emb.to(hidden_states.device, non_blocking=True)
250
+ if te_rope_emb.dtype != torch.float32:
251
+ warnings.warn("Rotary embeddings should be in float32 for optimal performance.", UserWarning)
252
+
253
+ with self.get_autocast_context(None, outer=True):
254
+ for layer_idx, layer_module in enumerate(self.layers):
255
+ if kwargs.get("output_hidden_states", False):
256
+ all_hidden_states = (*all_hidden_states, hidden_states)
257
+
258
+ with self.get_autocast_context(layer_idx):
259
+ hidden_states = layer_module(
260
+ hidden_states,
261
+ attention_mask,
262
+ rotary_pos_emb=te_rope_emb,
263
+ cu_seqlens_q=kwargs.get("cu_seq_lens_q", None),
264
+ cu_seqlens_kv=kwargs.get("cu_seq_lens_k", None),
265
+ cu_seqlens_q_padded=kwargs.get("cu_seq_lens_q_padded", None),
266
+ cu_seqlens_kv_padded=kwargs.get("cu_seq_lens_k_padded", None),
267
+ max_seqlen_q=kwargs.get("max_length_q", None),
268
+ max_seqlen_kv=kwargs.get("max_length_k", None),
269
+ pad_between_seqs=kwargs.get("pad_between_seqs", None),
270
+ )
271
 
272
  hidden_states = self.emb_layer_norm_after(hidden_states)
273
 
 
276
 
277
  return BaseModelOutput(
278
  last_hidden_state=hidden_states,
279
+ hidden_states=all_hidden_states or None,
280
  )
281
 
282
+ def get_autocast_context(
283
+ self, layer_number: int | None, init: bool = False, outer: bool = False
284
+ ) -> ContextManager:
285
+ """Return the appropriate TE autocast context manager for a given layer.
286
+
287
+ This function handles both the quantized_model_init during layer creation and the te.autocast() during layer
288
+ forward pass.
289
+
290
+ Args:
291
+ layer_number: The 0-indexed layer number.
292
+ init: Whether to return a `quantized_model_init` context for layer initialization.
293
+ outer: Whether to return a global te.autocast() context to wrap the entire encoder stack.
294
+ """
295
+ if self.config.layer_precision is None:
296
+ return nullcontext()
297
+
298
+ if outer:
299
+ # This is especially important for something like DelayedScaling, where we want to ensure recipe
300
+ # post-processing happens only once per forward pass.
301
+ if "fp8" not in self.config.layer_precision:
302
+ return nullcontext()
303
+ if self._fp8_recipe is None:
304
+ warnings.warn("No FP8 recipe provided, using default recipe.", UserWarning)
305
+ return transformer_engine.pytorch.autocast(enabled=True, recipe=self._fp8_recipe)
306
+
307
+ precision = self.config.layer_precision[layer_number]
308
+ recipe = {"fp8": self._fp8_recipe, "fp4": self._fp4_recipe}.get(precision)
309
+
310
+ if init and self.config.use_quantized_model_init:
311
+ if precision == "fp4" and recipe is None:
312
+ raise RuntimeError("No FP4 recipe provided, but layer precision is set to FP4.")
313
+ if precision in ("fp8", "fp4"):
314
+ return transformer_engine.pytorch.quantized_model_init(recipe=recipe)
315
+ return nullcontext()
316
+
317
+ if precision == "fp8":
318
+ if recipe is None:
319
+ warnings.warn("No FP8 recipe provided, using default recipe.", UserWarning)
320
+ return transformer_engine.pytorch.autocast(enabled=True, recipe=recipe)
321
+ if precision == "fp4":
322
+ if recipe is None:
323
+ raise RuntimeError("No FP4 recipe provided, but layer precision is set to FP4.")
324
+ return transformer_engine.pytorch.autocast(enabled=True, recipe=recipe)
325
+ return transformer_engine.pytorch.autocast(enabled=False)
326
+
327
 
328
  class NVEsmPreTrainedModel(EsmPreTrainedModel):
329
  """An abstract class to handle weights initialization and pretrained model loading."""
330
 
331
  config_class = NVEsmConfig
332
+ base_model_prefix = "model"
333
  supports_gradient_checkpointing = False
334
  accepts_loss_kwargs = False
335
  _no_split_modules = (
 
345
  if hasattr(module, "reset_parameters"):
346
  module.reset_parameters()
347
 
348
+ # The embeddings layer is the only non-TE layer in this model we need to deal with. We use
349
  # `model._init_weights` rather than `reset_parameters` to ensure we honor the original config standard
350
+ # deviation. self.base_model resolves to self.model for wrapper classes or self for NVEsmModel.
351
+ self.base_model.embeddings.word_embeddings.to_empty(device="cuda")
352
+ self.base_model.embeddings.apply(self._init_weights)
353
 
354
  # Meta-device init seems to break weight tying, so we re-tie the weights here.
355
  self.tie_weights()
 
374
  super()._init_weights(module)
375
 
376
  def state_dict(self, *args, **kwargs):
377
+ """Override state_dict to filter out non-loadable keys.
378
 
379
+ Filters out:
380
+ - ``_extra_state`` keys: TransformerEngine-specific, not loadable by HuggingFace v5.
381
+ - ``.inv_freq`` buffers: Computed at init time by RotaryPositionEmbedding, not needed
382
+ in the checkpoint and not loadable by vLLM's AutoWeightsLoader (which only iterates
383
+ over ``named_parameters``, not ``named_buffers``).
384
  """
385
  state_dict = super().state_dict(*args, **kwargs)
386
+ return {k: v for k, v in state_dict.items() if not k.endswith("_extra_state") and not k.endswith(".inv_freq")}
 
387
 
388
 
389
  class NVEsmModel(NVEsmPreTrainedModel):
 
392
  This model uses NVIDIA's TransformerEngine to optimize attention layer training and inference.
393
  """
394
 
395
+ def __init__(
396
+ self,
397
+ config: NVEsmConfig,
398
+ add_pooling_layer: Optional[bool] = None,
399
+ fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
400
+ fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
401
+ ):
402
  """Initialize a NVEsmModel.
403
 
404
  Args:
405
  config (NVEsmConfig): The configuration of the model.
406
+ add_pooling_layer (bool): Whether to add a pooling layer. If ``None``,
407
+ reads ``config.add_pooling_layer`` (defaults to ``False``).
408
+ fp8_recipe: The FP8 recipe for the encoder.
409
+ fp4_recipe: The FP4 recipe for the encoder.
410
  """
411
  super().__init__(config)
412
  self.config = config
413
 
414
+ if add_pooling_layer is None:
415
+ add_pooling_layer = getattr(config, "add_pooling_layer", False)
416
+
417
  # Ensure pad_token_id is set properly, defaulting to 0 if not specified
418
  if not hasattr(config, "pad_token_id") or config.pad_token_id is None:
419
  config.pad_token_id = 0
420
  self.embeddings = NVEsmEmbeddings(config)
421
+ self.encoder = NVEsmEncoder(config, fp8_recipe, fp4_recipe)
422
  self.pooler = EsmPooler(config) if add_pooling_layer else None
423
 
424
  # Initialize weights and apply final processing
 
487
  )
488
  encoder_outputs = self.encoder(
489
  embedding_output,
490
+ attention_mask=None if self.config.attn_input_format == "thd" else extended_attention_mask,
491
  **kwargs,
492
  )
493
  sequence_output = encoder_outputs[0]
 
503
  class NVEsmForMaskedLM(NVEsmPreTrainedModel):
504
  """NVEsmForMaskedLM is a TransformerEngine-optimized ESM model for masked language modeling."""
505
 
506
+ _tied_weights_keys: ClassVar[dict[str, str]] = {
507
+ "lm_head.decoder.weight": "model.embeddings.word_embeddings.weight"
508
+ }
509
+ _do_not_quantize = ("lm_head.dense", "lm_head.decoder") # Flag for testing that these layers are not quantized.
510
 
511
+ def __init__(
512
+ self,
513
+ config: NVEsmConfig,
514
+ fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
515
+ fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
516
+ ):
517
  """Initialize a NVEsmForMaskedLM.
518
 
519
  Args:
520
  config (NVEsmConfig): The configuration of the model.
521
+ fp8_recipe: The FP8 recipe for the encoder.
522
+ fp4_recipe: The FP4 recipe for the encoder.
523
  """
524
  super().__init__(config)
525
 
 
529
  "bi-directional self-attention."
530
  )
531
 
532
+ self.model = NVEsmModel(config, add_pooling_layer=False, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe)
533
  self.lm_head = NVEsmLMHead(config)
534
 
535
  self.post_init()
 
564
  Returns:
565
  MaskedLMOutput: The output of the model.
566
  """
567
+ outputs = self.model(
568
  input_ids,
569
  attention_mask=attention_mask,
570
  position_ids=position_ids,
 
572
  **kwargs,
573
  )
574
  sequence_output = outputs[0]
575
+ with transformer_engine.pytorch.autocast(enabled=False):
576
+ prediction_scores = self.lm_head(sequence_output)
577
 
578
  # Truncate logits back to original vocab_size if padding was used
579
  if self.config.padded_vocab_size != self.config.vocab_size:
 
604
  config (NVEsmConfig): The configuration of the model.
605
  """
606
  super().__init__()
607
+ with transformer_engine.pytorch.quantized_model_init(enabled=False):
608
+ self.dense = transformer_engine.pytorch.Linear(
609
+ config.hidden_size,
610
+ config.hidden_size,
611
+ params_dtype=config.dtype,
612
+ device="meta" if torch.get_default_device() == torch.device("meta") else "cuda",
613
+ init_method=lambda x: torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range),
614
+ )
615
 
 
616
  self.decoder = transformer_engine.pytorch.LayerNormLinear(
617
  config.hidden_size,
618
+ config.padded_vocab_size or config.vocab_size,
619
  bias=True,
620
  eps=config.layer_norm_eps,
621
  params_dtype=config.dtype,
 
632
  """
633
  # Keep the last layers of the network in higher precision to avoid numerical instability.
634
  # Please see recipes/fp8_analysis/README.md for more details.
635
+ with transformer_engine.pytorch.autocast(enabled=False):
636
  x = self.dense(features)
637
  x = torch.nn.functional.gelu(x)
638
  x = self.decoder(x)
 
673
  self.token_dropout = config.token_dropout
674
  self.mask_token_id = config.mask_token_id
675
 
676
+ def _apply_token_dropout_bshd(self, embeddings, input_ids, attention_mask):
677
+ """Apply token dropout scaling for BSHD-format inputs.
678
+
679
+ Compensates for masked tokens by scaling unmasked embeddings based on the
680
+ observed mask ratio per sequence.
681
+
682
+ Args:
683
+ embeddings: Token embeddings with masked positions already zeroed out.
684
+ input_ids: Original input token IDs.
685
+ attention_mask: Attention mask indicating valid tokens.
686
+
687
+ Returns:
688
+ Scaled embeddings tensor.
689
+ """
690
+ mask_ratio_train = 0.15 * 0.8 # Hardcoded as the ratio used in all ESM model training runs
691
+ src_lengths = attention_mask.sum(-1) if attention_mask is not None else input_ids.shape[1]
692
+ n_masked_per_seq = (input_ids == self.mask_token_id).sum(-1).float()
693
+ mask_ratio_observed = n_masked_per_seq / src_lengths
694
+ scale_factor = (1 - mask_ratio_train) / (1 - mask_ratio_observed)
695
+ return (embeddings * scale_factor[:, None, None]).to(embeddings.dtype)
696
+
697
+ def _apply_token_dropout_thd(self, embeddings, input_ids, kwargs):
698
+ """Apply token dropout scaling for THD-format (packed sequence) inputs.
699
+
700
+ Uses cumulative sequence lengths to compute per-sequence mask ratios and
701
+ scales embeddings accordingly using repeat_interleave.
702
+
703
+ Args:
704
+ embeddings: Token embeddings with masked positions already zeroed out.
705
+ input_ids: Original input token IDs.
706
+ kwargs: Additional keyword arguments containing cu_seq_lens_q and optionally cu_seq_lens_q_padded.
707
+
708
+ Returns:
709
+ Scaled embeddings tensor.
710
+ """
711
+ mask_ratio_train = 0.15 * 0.8 # Hardcoded as the ratio used in all ESM model training runs
712
+ src_lengths = torch.diff(kwargs["cu_seq_lens_q"])
713
+ if "cu_seq_lens_q_padded" in kwargs:
714
+ src_lengths_padded = torch.diff(kwargs["cu_seq_lens_q_padded"])
715
+ else:
716
+ src_lengths_padded = src_lengths
717
+ # We need to find the number of masked tokens in each sequence in the padded batch.
718
+ is_masked = (input_ids == self.mask_token_id).squeeze(0)
719
+ n_masked_per_seq = torch.nested.nested_tensor_from_jagged(is_masked, offsets=kwargs["cu_seq_lens_q"]).sum(1)
720
+ mask_ratio_observed = n_masked_per_seq.float() / src_lengths
721
+ scale_factor = (1 - mask_ratio_train) / (1 - mask_ratio_observed)
722
+ reshaped_scale_factor = torch.repeat_interleave(scale_factor, src_lengths_padded, dim=0)
723
+ return (embeddings * reshaped_scale_factor.unsqueeze(-1)).to(embeddings.dtype)
724
+
725
  def forward(
726
  self,
727
  input_ids=None,
 
757
  # actually dropping out values (or, equivalently, scale up their un-dropped outputs in training).
758
  if self.token_dropout and input_ids is not None:
759
  embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0)
760
+ if using_thd:
761
+ embeddings = self._apply_token_dropout_thd(embeddings, input_ids, kwargs)
 
 
 
 
 
 
 
 
762
  else:
763
+ embeddings = self._apply_token_dropout_bshd(embeddings, input_ids, attention_mask)
 
 
 
 
 
 
 
 
 
764
 
765
  if self.layer_norm is not None:
766
  embeddings = self.layer_norm(embeddings)
 
777
  Adapted from EsmForTokenClassification in Hugging Face Transformers `modeling_esm.py`.
778
  """
779
 
780
+ def __init__(
781
+ self,
782
+ config,
783
+ fp8_recipe: transformer_engine.common.recipe.Recipe | None = None,
784
+ fp4_recipe: transformer_engine.common.recipe.Recipe | None = None,
785
+ ):
786
+ """Initialize NVEsmForTokenClassification.
787
+
788
+ Args:
789
+ config: The configuration of the model.
790
+ fp8_recipe: The FP8 recipe for the encoder.
791
+ fp4_recipe: The FP4 recipe for the encoder.
792
+ """
793
  super().__init__(config)
794
  self.num_labels = config.num_labels
795
 
796
+ self.model = NVEsmModel(config, add_pooling_layer=False, fp8_recipe=fp8_recipe, fp4_recipe=fp4_recipe)
797
  self.dropout = nn.Dropout(config.hidden_dropout_prob)
798
  self.classifier = transformer_engine.pytorch.Linear(
799
  config.hidden_size,
 
819
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
820
  Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
821
  """
822
+ outputs = self.model(
823
  input_ids,
824
  attention_mask=attention_mask,
825
  position_ids=position_ids,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10372d170244961a66c93dcf8fa00fa65e9fc614ac5b879e1c06eb1df4532d95
3
- size 11356390152
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc8257e1f816a628921060e555af43750029fde945a6a44afb0df1812915d097
3
+ size 11356073172
tokenizer_config.json CHANGED
@@ -11,7 +11,6 @@
11
  "attention_mask"
12
  ],
13
  "model_max_length": 1000000000000000019884624838656,
14
- "model_specific_special_tokens": {},
15
  "pad_token": "<pad>",
16
  "tokenizer_class": "TokenizersBackend",
17
  "unk_token": "<unk>"
 
11
  "attention_mask"
12
  ],
13
  "model_max_length": 1000000000000000019884624838656,
 
14
  "pad_token": "<pad>",
15
  "tokenizer_class": "TokenizersBackend",
16
  "unk_token": "<unk>"