farzadab commited on
Commit
494d2fb
·
verified ·
1 Parent(s): 009e6a9

Update ultravox_model.py

Browse files
Files changed (1) hide show
  1. ultravox_model.py +333 -113
ultravox_model.py CHANGED
@@ -1,5 +1,6 @@
1
  import logging
2
- from typing import Any, Dict, Optional, Set, Tuple, Union
 
3
 
4
  import peft
5
  import torch
@@ -9,6 +10,7 @@ import transformers
9
  import transformers.activations
10
  import transformers.modeling_outputs
11
  import transformers.models
 
12
  from transformers.models.whisper import modeling_whisper as whisper
13
 
14
  # We must use relative import in this directory to allow uploading to HF Hub
@@ -18,7 +20,7 @@ from .ultravox_config import LossFunction
18
  from .ultravox_config import UltravoxConfig
19
 
20
 
21
- class UltravoxModel(transformers.LlamaPreTrainedModel):
22
  """
23
  The Ultravox model which consists of an audio encoder and a language model.
24
 
@@ -34,29 +36,72 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
34
 
35
  config_class = UltravoxConfig
36
  config: UltravoxConfig # for type hinting
37
- _no_split_modules = ["Wav2Vec2Model", "WhisperEncoder", "LlamaDecoderLayer"]
38
- # We minimize the weights in state_dict in order to reduce the size of the checkpoint
39
- # The issue is that load_pretrained() uses state_dict() keys to know what keys are expected
40
- # As such we have to tell is to ignore some keys that are not always in the model
41
- _keys_to_ignore_on_load_unexpected = ["audio_tower.*", "language_model.*"]
42
- # Usually we load encoder weights from a pretrained model, so we don't want to load the decoder weights
43
- # Technically we never hit this issue because these keys are already removed from state_dict() however,
44
- # but there's no harm in keeping it here for when we change that behavior.
45
- _keys_to_ignore_on_load_missing = ["audio_tower.*"]
46
 
47
  def __init__(self, config: UltravoxConfig):
48
  super().__init__(config)
 
49
 
50
  self.keep_params: Set[str] = set()
51
  self.vocab_size = config.vocab_size
52
 
53
  self.audio_tower = self._create_audio_tower(config)
54
- self.multi_modal_projector = UltravoxProjector(config)
 
 
 
55
  self.language_model = self._create_language_model(config)
56
 
 
 
 
 
 
 
 
 
 
 
57
  self.loss_config = LossConfig()
58
  self.post_init()
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  def get_input_embeddings(self):
61
  return self.language_model.get_input_embeddings()
62
 
@@ -103,6 +148,30 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
103
  self.vocab_size = model_embeds.num_embeddings
104
  return model_embeds
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  def _compute_kl_loss(
107
  self,
108
  lm_output: transformers.modeling_outputs.CausalLMOutputWithPast,
@@ -127,11 +196,12 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
127
  # compute the KL divergence loss between the two models
128
  kl_loss = F.kl_div(
129
  F.log_softmax(
130
- lm_output.logits[labels != -100] / self.loss_config.kl_temperature,
 
131
  dim=-1,
132
  ),
133
  F.softmax(
134
- alt_lm_output.logits[alt_labels != -100]
135
  / self.loss_config.kl_temperature,
136
  dim=-1,
137
  ),
@@ -139,6 +209,24 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
139
  )
140
  return {"loss": kl_loss}
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def forward(
143
  self,
144
  input_ids: torch.Tensor,
@@ -147,7 +235,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
147
  labels: Optional[torch.Tensor] = None,
148
  attention_mask: Optional[torch.Tensor] = None,
149
  audio_token_start_idx: Optional[torch.Tensor] = None,
 
150
  audio_token_len: Optional[torch.Tensor] = None,
 
151
  past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
152
  # the alt_* fields are needed for KL divergence loss
153
  alt_input_ids: Optional[torch.Tensor] = None,
@@ -178,28 +268,37 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
178
  # B x T -> B x T x D
179
  inputs_embeds = self.get_input_embeddings().forward(input_ids)
180
 
181
- if audio_values is not None:
182
  assert (
183
- audio_token_start_idx is not None and audio_token_len is not None
184
- ), "audio_token_start_idx and audio_token_len must be provided if audio_values are provided."
 
 
 
185
  assert (
186
- len(audio_token_start_idx) == len(audio_token_len) == len(audio_values)
187
- ), "audio_token_start_idx, audio_token_len, and audio_values must have the same batch size."
188
-
189
- # B x A/3200 x D
 
 
 
 
 
 
190
  audio_tower_output = self.audio_tower.forward(
191
- audio_values
 
192
  ).last_hidden_state
193
  audio_tower_output = audio_tower_output.to(inputs_embeds.dtype)
194
-
195
  audio_embeds = self.multi_modal_projector.forward(audio_tower_output)
196
 
197
  # combine audio and text embeddings
198
- for i, (audio, start, length) in enumerate(
199
- zip(audio_embeds, audio_token_start_idx, audio_token_len)
200
- ):
201
- length = min(length, audio.shape[0])
202
- inputs_embeds[i, start : start + length] = audio[:length]
203
 
204
  lm_output = self.language_model.forward(
205
  inputs_embeds=inputs_embeds,
@@ -234,6 +333,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
234
  audio_values: Optional[torch.FloatTensor] = None,
235
  audio_token_start_idx: Optional[torch.Tensor] = None,
236
  audio_token_len: Optional[torch.Tensor] = None,
 
 
237
  past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
238
  attention_mask: Optional[torch.Tensor] = None,
239
  inputs_embeds: Optional[torch.Tensor] = None,
@@ -251,7 +352,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
251
 
252
  # include audio information in model_input only when it is needed during prefilling
253
  # audio_token_start_idx should always be relative to the current cache position
254
- prefill_start_idx = 0 if cache_position is None else cache_position[0]
 
 
255
  if (
256
  audio_values is not None
257
  and audio_token_start_idx is not None
@@ -262,32 +365,37 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
262
  audio_token_start_idx - prefill_start_idx
263
  )
264
  model_input["audio_token_len"] = audio_token_len
 
 
265
 
266
  return model_input
267
 
 
 
 
 
 
 
 
 
268
  @classmethod
269
  def _create_audio_tower(
270
  cls, config: UltravoxConfig
271
  ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
272
- if config.audio_model_id is not None:
273
- if "whisper" in config.audio_model_id is not None:
274
- audio_tower = ModifiedWhisperEncoder.from_pretrained(
275
- config.audio_model_id
276
- )
277
- else:
278
- audio_tower = transformers.AutoModel.from_pretrained(
279
- config.audio_model_id
280
- )
281
- else:
282
- if "whisper" in config.audio_config._name_or_path:
283
  audio_tower = ModifiedWhisperEncoder(config.audio_config)
 
 
 
284
  else:
285
- with transformers.modeling_utils.no_init_weights():
286
- # we only ever use from_config if the weights are retrained, hence initializing is not
287
- # required. This makes the model quite creation faster since init on CPU is quite slow.
288
- audio_tower = transformers.AutoModel.from_config(
289
- config.audio_config
290
- )
291
 
292
  if isinstance(
293
  audio_tower,
@@ -305,23 +413,22 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
305
  def _create_language_model(
306
  cls, config: UltravoxConfig
307
  ) -> transformers.LlamaForCausalLM:
308
- if config.text_model_id is not None:
309
- language_model = transformers.AutoModelForCausalLM.from_pretrained(
310
- config.text_model_id, attn_implementation=config._attn_implementation
 
 
 
 
311
  )
312
- else:
313
- with transformers.modeling_utils.no_init_weights():
314
- # we only ever use from_config if the weights are retrained, hence initializing is not
315
- # required. This makes the model quite creation faster since init on CPU is quite slow.
316
- language_model = transformers.AutoModelForCausalLM.from_config(
317
- config.text_config, attn_implementation=config._attn_implementation
318
- )
319
 
320
  language_model = apply_lora(language_model, config.text_model_lora_config)
321
  return language_model
322
 
323
- def _add_language_model_weights_to_keep(self):
324
- if self.config.text_model_id is not None:
 
 
325
  self.config.text_model_id = None
326
  self.keep_params.update(
327
  set(
@@ -332,8 +439,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
332
  )
333
  )
334
 
335
- def _add_audio_tower_weights_to_keep(self):
336
- if self.config.audio_model_id is not None:
 
337
  self.config.audio_model_id = None
338
  self.keep_params.update(
339
  set(
@@ -344,46 +452,44 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
344
  )
345
  )
346
 
347
- def merge_and_unload(self):
348
- if isinstance(self.language_model, peft.PeftModel):
349
- self.language_model = self.language_model.merge_and_unload()
350
- # no need to download base language model weights anymore, so we can remove the id
351
- self._add_language_model_weights_to_keep()
352
-
353
- if isinstance(self.audio_tower, peft.PeftModel):
354
- self.audio_tower = self.audio_tower.merge_and_unload()
355
- # no need to download base audio model weights anymore, so we can remove the id
356
- self._add_audio_tower_weights_to_keep()
357
-
358
  for param in ["text_model_lora_config", "audio_model_lora_config"]:
359
  if hasattr(self.config, param):
360
  delattr(self.config, param)
361
 
362
  def push_to_hub(self, *args, **kwargs):
363
  self.merge_and_unload()
364
- self.to(self.language_model.dtype)
365
  return super().push_to_hub(*args, **kwargs)
366
 
367
- def state_dict(self, *args, **kwargs):
368
- named_params = dict(self.named_parameters())
369
- state_dict = super().state_dict(*args, **kwargs)
 
 
 
 
 
 
 
 
 
370
 
371
  state_dict = {
372
  k: v
373
  for k, v in state_dict.items()
374
- if k in self.keep_params
375
- or (k in named_params and named_params[k].requires_grad)
376
  }
 
377
  return state_dict
378
 
379
- def load_state_dict(
380
- self,
381
- state_dict: Dict[str, Any],
382
- *args,
383
- **kwargs,
384
  ):
 
 
 
 
 
385
  self.keep_params.update(set(state_dict.keys()))
386
- return super().load_state_dict(state_dict, *args, **kwargs)
387
 
388
  def print_trainable_parameters(self):
389
  """
@@ -414,8 +520,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
414
  )
415
 
416
 
 
417
  def is_cache_empty(
418
- past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
419
  ) -> bool:
420
  """
421
  Check if the cache is empty.
@@ -427,16 +534,25 @@ def is_cache_empty(
427
  return past_key_values.get_seq_length() == 0
428
 
429
 
430
- def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
 
 
 
431
  """
432
  Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead.
433
  """
 
434
  lora_config = peft.LoraConfig(**lora_config or {})
435
 
436
  if lora_config.r == 0:
437
- # freeze the model entirely
438
- for param in model.parameters():
439
- param.requires_grad = False
 
 
 
 
 
440
  else:
441
  model = peft.get_peft_model(model, lora_config)
442
 
@@ -445,12 +561,8 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:
445
 
446
  class StackAudioFrames(nn.Module):
447
  """
448
- Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`.
449
-
450
- The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames.
451
- NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor,
452
- we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings.
453
- In most cases this extra padding will get removed in the model's forward function so it has no effect.
454
  """
455
 
456
  def __init__(self, stack_factor: int = 8):
@@ -460,7 +572,7 @@ class StackAudioFrames(nn.Module):
460
  def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
461
  B, T, C = audio_embeds.shape
462
  T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor
463
- audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T + self.stack_factor))
464
  B, T, C = audio_embeds.shape
465
  audio_embeds = audio_embeds.view(
466
  B, T // self.stack_factor, C * self.stack_factor
@@ -480,31 +592,67 @@ class SwiGLU(nn.Module):
480
  return F.silu(gate) * x
481
 
482
 
483
- class UltravoxProjector(nn.Sequential):
484
  def __init__(self, config: UltravoxConfig):
485
  super().__init__()
486
  self.hidden_dim = config.hidden_size
487
  self._pad_and_stack = StackAudioFrames(config.stack_factor)
488
- dim = config.audio_config.hidden_size * config.stack_factor
489
- self.ln_pre = RMSNorm(dim, init=config.norm_init)
490
- self.linear_1 = nn.Linear(dim, self.hidden_dim, bias=False)
491
- dim = self.hidden_dim
492
  self.act = transformers.activations.get_activation(config.projector_act)
493
- dim = dim // 2 if config.projector_act == "swiglu" else dim
494
- self.linear_2 = nn.Linear(dim, config.text_config.hidden_size, bias=False)
495
- self.ln_post = RMSNorm(config.text_config.hidden_size, init=config.norm_init)
 
 
 
 
 
 
 
 
 
496
 
497
  def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
  audio_features = self._pad_and_stack(audio_features)
499
  audio_features = self.ln_pre(audio_features)
 
500
  hidden_states = self.linear_1(audio_features)
 
501
  hidden_states = self.act(hidden_states)
 
 
502
  hidden_states = self.linear_2(hidden_states)
503
  hidden_states = self.ln_post(hidden_states)
504
  return hidden_states
505
 
506
 
507
- class ModifiedWhisperEncoder(whisper.WhisperEncoder):
 
 
508
  """
509
  Encoder portion of OpenAI's Whisper model.
510
 
@@ -518,21 +666,62 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
518
  """
519
 
520
  base_model_prefix = "model.encoder"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521
 
522
  def forward(
523
  self,
524
  input_features,
525
- attention_mask=None,
526
  head_mask=None,
527
  output_attentions=None,
528
  output_hidden_states=None,
529
  return_dict=None,
530
  ):
531
- expected_seq_length = (
532
- self.config.max_source_positions
533
- * self.conv1.stride[0]
534
- * self.conv2.stride[0]
535
- )
536
  if input_features.shape[-1] > expected_seq_length:
537
  raise ValueError(
538
  f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
@@ -565,6 +754,37 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
565
  encoder_states = () if output_hidden_states else None
566
  all_attentions = () if output_attentions else None
567
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  # check if head_mask has a correct number of layers specified if desired
569
  if head_mask is not None:
570
  assert head_mask.size()[0] == (
@@ -588,14 +808,14 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
588
  layer_outputs = self._gradient_checkpointing_func(
589
  encoder_layer.__call__,
590
  hidden_states,
591
- None,
592
  (head_mask[idx] if head_mask is not None else None),
593
  output_attentions,
594
  )
595
  else:
596
  layer_outputs = encoder_layer(
597
  hidden_states,
598
- None,
599
  layer_head_mask=(
600
  head_mask[idx] if head_mask is not None else None
601
  ),
@@ -630,4 +850,4 @@ UltravoxModel.register_for_auto_class()
630
  transformers.AutoConfig.register("ultravox", UltravoxConfig)
631
  transformers.AutoModel.register(UltravoxConfig, UltravoxModel)
632
 
633
- transformers.activations.ACT2FN["swiglu"] = SwiGLU
 
1
  import logging
2
+ import re
3
+ from typing import Any, Dict, Generator, Optional, Set, Tuple, TypeVar, Union
4
 
5
  import peft
6
  import torch
 
10
  import transformers.activations
11
  import transformers.modeling_outputs
12
  import transformers.models
13
+ from transformers.generation.utils import GenerationMixin
14
  from transformers.models.whisper import modeling_whisper as whisper
15
 
16
  # We must use relative import in this directory to allow uploading to HF Hub
 
20
  from .ultravox_config import UltravoxConfig
21
 
22
 
23
+ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
24
  """
25
  The Ultravox model which consists of an audio encoder and a language model.
26
 
 
36
 
37
  config_class = UltravoxConfig
38
  config: UltravoxConfig # for type hinting
39
+ # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing
40
+ _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"]
41
+ # Since we have kwargs in forward, we need to set this to False, otherwise grad_accum_steps will cause incorrect train loss to be reported
42
+ # see https://github.com/huggingface/transformers/issues/35856 and https://github.com/huggingface/trl/pull/2615/files
43
+ accepts_loss_kwargs = False
 
 
 
 
44
 
45
  def __init__(self, config: UltravoxConfig):
46
  super().__init__(config)
47
+ self._register_load_state_dict_pre_hook(self._pre_load_state_dict_hook)
48
 
49
  self.keep_params: Set[str] = set()
50
  self.vocab_size = config.vocab_size
51
 
52
  self.audio_tower = self._create_audio_tower(config)
53
+ self.audio_tower_context_length: Optional[int] = None
54
+ self.audio_tower_context_length = self.audio_tower.max_context_length
55
+
56
+ self.multi_modal_projector = self._create_multi_modal_projector(config)
57
  self.language_model = self._create_language_model(config)
58
 
59
+ if self.language_model._tied_weights_keys is not None:
60
+ self._tied_weights_keys = [
61
+ f"language_model.{k}" for k in self.language_model._tied_weights_keys
62
+ ]
63
+
64
+ # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
65
+ # FSDP throws an error if some of the layer types are not found in the model.
66
+ # This would be something like ["LlamaDecoderLayer"] as we don't split audio encoder layers.
67
+ self._no_split_modules = self.language_model._no_split_modules
68
+
69
  self.loss_config = LossConfig()
70
  self.post_init()
71
 
72
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
    """Load the composite model, then materialize any child sub-model
    weights (audio tower / language model) that were left uninitialized
    on the meta device."""
    instance = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
    instance._load_child_model_weights(*args, **kwargs)
    return instance
77
+
78
def _load_child_model_weights(self, *args, **kwargs) -> "UltravoxModel":
    """Fill in language-model / audio-tower weights from their own
    pretrained checkpoints.

    Only triggers for a child module when the config names a separate
    checkpoint id AND the module currently holds no real weights
    (its device type is "meta").
    """
    llm_needs_weights = (
        self.config.text_model_id is not None
        and self.language_model.device.type == "meta"
    )
    if llm_needs_weights:
        # Pull decoder weights from the standalone text checkpoint.
        self.language_model = transformers.AutoModelForCausalLM.from_pretrained(
            self.config.text_model_id,
            torch_dtype=self.config.torch_dtype,
            *args,
            **kwargs,
        )

    audio_needs_weights = (
        self.config.audio_model_id is not None
        and self.audio_tower.device.type == "meta"
    )
    if audio_needs_weights:
        # Pull encoder weights from the standalone audio checkpoint.
        self.audio_tower = transformers.AutoModel.from_pretrained(
            self.config.audio_model_id,
            torch_dtype=self.config.torch_dtype,
            *args,
            **kwargs,
        )

    return self
104
+
105
  def get_input_embeddings(self):
106
  return self.language_model.get_input_embeddings()
107
 
 
148
  self.vocab_size = model_embeds.num_embeddings
149
  return model_embeds
150
 
151
+ def _get_prediction_mask(self, labels: Optional[torch.Tensor]) -> torch.Tensor:
152
+ """Get a boolean mask for positions where we want to compute KL divergence.
153
+
154
+ For each label position, we want the position before it since that's where
155
+ the model makes the prediction for that label.
156
+
157
+ Args:
158
+ labels: Tensor of shape (B, T) where B is batch size and T is sequence length,
159
+ with -100 for masked positions and token ids for label positions
160
+
161
+ Returns:
162
+ Boolean tensor of shape (B, T) that's True for positions where we want to compute KL divergence
163
+ """
164
+ if labels is None:
165
+ raise ValueError("labels must be provided")
166
+ # Shift the label mask right by 1 along the sequence dimension
167
+ # This gives us positions where we make predictions for the next token
168
+ label_mask = labels != -100
169
+ pred_mask = torch.zeros_like(label_mask)
170
+ pred_mask[:, :-1] = label_mask[
171
+ :, 1:
172
+ ] # shift right by 1 along sequence dimension
173
+ return pred_mask
174
+
175
  def _compute_kl_loss(
176
  self,
177
  lm_output: transformers.modeling_outputs.CausalLMOutputWithPast,
 
196
  # compute the KL divergence loss between the two models
197
  kl_loss = F.kl_div(
198
  F.log_softmax(
199
+ lm_output.logits[self._get_prediction_mask(labels)]
200
+ / self.loss_config.kl_temperature,
201
  dim=-1,
202
  ),
203
  F.softmax(
204
+ alt_lm_output.logits[self._get_prediction_mask(alt_labels)]
205
  / self.loss_config.kl_temperature,
206
  dim=-1,
207
  ),
 
209
  )
210
  return {"loss": kl_loss}
211
 
212
+ def _audio_iter(
213
+ self, audio_batch_size: torch.Tensor
214
+ ) -> Generator[Tuple[int, int], None, None]:
215
+ """
216
+ Iterate over the audio batch size and yield the batch index and audio index of each audio item.
217
+
218
+ Args:
219
+ audio_batch_size: A tensor of shape (B,) where B is the batch size.
220
+
221
+ Returns:
222
+ A generator that yields a tuple of (start index, length) for each audio item.
223
+ """
224
+ audio_index = 0
225
+ for i_b, batch_count in enumerate(audio_batch_size):
226
+ for _ in range(batch_count):
227
+ yield i_b, audio_index
228
+ audio_index += 1
229
+
230
  def forward(
231
  self,
232
  input_ids: torch.Tensor,
 
235
  labels: Optional[torch.Tensor] = None,
236
  attention_mask: Optional[torch.Tensor] = None,
237
  audio_token_start_idx: Optional[torch.Tensor] = None,
238
+ audio_lens: Optional[torch.Tensor] = None,
239
  audio_token_len: Optional[torch.Tensor] = None,
240
+ audio_batch_size: Optional[torch.Tensor] = None,
241
  past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
242
  # the alt_* fields are needed for KL divergence loss
243
  alt_input_ids: Optional[torch.Tensor] = None,
 
268
  # B x T -> B x T x D
269
  inputs_embeds = self.get_input_embeddings().forward(input_ids)
270
 
271
+ if audio_values is not None and len(audio_values) > 0:
272
  assert (
273
+ audio_token_start_idx is not None
274
+ and audio_token_len is not None
275
+ and audio_lens is not None
276
+ and audio_batch_size is not None
277
+ ), "audio_token_start_idx/audio_token_len/audio_lens must be provided if audio_values are provided."
278
  assert (
279
+ len(audio_token_start_idx)
280
+ == len(audio_token_len)
281
+ == len(audio_lens)
282
+ == len(audio_values)
283
+ ), "audio_token_start_idx/audio_token_len/audio_lens/audio_values must have the same batch size."
284
+ assert len(audio_batch_size) == len(
285
+ inputs_embeds
286
+ ), "audio_batch_size and inputs_embeds must have the same batch size."
287
+
288
+ # B x A/3200 x (D=max-audio-length-in-batch)
289
  audio_tower_output = self.audio_tower.forward(
290
+ audio_values.to(self.audio_tower.dtype),
291
+ audio_len=audio_lens,
292
  ).last_hidden_state
293
  audio_tower_output = audio_tower_output.to(inputs_embeds.dtype)
 
294
  audio_embeds = self.multi_modal_projector.forward(audio_tower_output)
295
 
296
  # combine audio and text embeddings
297
+ for i_b, i_a in self._audio_iter(audio_batch_size):
298
+ start_idx = audio_token_start_idx[i_a]
299
+ token_len = audio_token_len[i_a]
300
+ item_embedding = audio_embeds[i_a][:token_len]
301
+ inputs_embeds[i_b][start_idx : start_idx + token_len] = item_embedding
302
 
303
  lm_output = self.language_model.forward(
304
  inputs_embeds=inputs_embeds,
 
333
  audio_values: Optional[torch.FloatTensor] = None,
334
  audio_token_start_idx: Optional[torch.Tensor] = None,
335
  audio_token_len: Optional[torch.Tensor] = None,
336
+ audio_lens: Optional[torch.Tensor] = None,
337
+ audio_batch_size: Optional[torch.Tensor] = None,
338
  past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
339
  attention_mask: Optional[torch.Tensor] = None,
340
  inputs_embeds: Optional[torch.Tensor] = None,
 
352
 
353
  # include audio information in model_input only when it is needed during prefilling
354
  # audio_token_start_idx should always be relative to the current cache position
355
+ prefill_start_idx: int | torch.Tensor = (
356
+ 0 if cache_position is None else cache_position[0]
357
+ )
358
  if (
359
  audio_values is not None
360
  and audio_token_start_idx is not None
 
365
  audio_token_start_idx - prefill_start_idx
366
  )
367
  model_input["audio_token_len"] = audio_token_len
368
+ model_input["audio_batch_size"] = audio_batch_size
369
+ model_input["audio_lens"] = audio_lens
370
 
371
  return model_input
372
 
373
@classmethod
def _create_multi_modal_projector(
    cls, config: UltravoxConfig
) -> "UltravoxProjector":
    """Build the audio->text projector and cast it to the configured dtype."""
    # nn.Module.to returns the module itself, so chaining is equivalent
    # to the two-step construct-then-cast form.
    return UltravoxProjector(config).to(config.torch_dtype)
380
+
381
  @classmethod
382
  def _create_audio_tower(
383
  cls, config: UltravoxConfig
384
  ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
385
+ with transformers.modeling_utils.no_init_weights():
386
+ # we only ever use from_config if the weights are retrained, hence initializing is not
387
+ # required. This makes the model quite creation faster since init on CPU is quite slow.
388
+ if "whisper" in config.audio_config._name_or_path.lower():
 
 
 
 
 
 
 
389
  audio_tower = ModifiedWhisperEncoder(config.audio_config)
390
+ audio_tower.init_latency_mask(
391
+ config.audio_latency_block_size, dtype=config.torch_dtype
392
+ )
393
  else:
394
+ assert config.audio_latency_block_size in (
395
+ None,
396
+ 0,
397
+ ), "only whisper audio tower supports audio latency masking, got non-zero value for 'audio_latency_block_size'"
398
+ audio_tower = transformers.AutoModel.from_config(config.audio_config)
 
399
 
400
  if isinstance(
401
  audio_tower,
 
413
  def _create_language_model(
414
  cls, config: UltravoxConfig
415
  ) -> transformers.LlamaForCausalLM:
416
+ with transformers.modeling_utils.no_init_weights():
417
+ # we only ever use from_config if the weights are retrained, hence initializing is not
418
+ # required. This makes the model quite creation faster since init on CPU is quite slow.
419
+ language_model = transformers.AutoModelForCausalLM.from_config(
420
+ config.text_config,
421
+ attn_implementation=config.text_config._attn_implementation,
422
+ torch_dtype=config.torch_dtype,
423
  )
 
 
 
 
 
 
 
424
 
425
  language_model = apply_lora(language_model, config.text_model_lora_config)
426
  return language_model
427
 
428
+ def merge_and_unload(self):
429
+ if isinstance(self.language_model, peft.PeftModel):
430
+ self.language_model = self.language_model.merge_and_unload()
431
+ # no need to download base language model weights anymore, so we can remove the id
432
  self.config.text_model_id = None
433
  self.keep_params.update(
434
  set(
 
439
  )
440
  )
441
 
442
+ if isinstance(self.audio_tower, peft.PeftModel):
443
+ self.audio_tower = self.audio_tower.merge_and_unload()
444
+ # no need to download base audio model weights anymore, so we can remove the id
445
  self.config.audio_model_id = None
446
  self.keep_params.update(
447
  set(
 
452
  )
453
  )
454
 
 
 
 
 
 
 
 
 
 
 
 
455
  for param in ["text_model_lora_config", "audio_model_lora_config"]:
456
  if hasattr(self.config, param):
457
  delattr(self.config, param)
458
 
459
  def push_to_hub(self, *args, **kwargs):
460
  self.merge_and_unload()
 
461
  return super().push_to_hub(*args, **kwargs)
462
 
463
def diff_state_dict(
    self, state_dict: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Return only the weights worth checkpointing.

    Keeps parameters that are trainable plus any keys recorded in
    ``self.keep_params``. FSDP inserts ``_fsdp_wrapped_module.`` segments
    into parameter names, so trainable names are normalized before
    filtering.

    Args:
        state_dict: optional precomputed state dict; defaults to the full
            ``state_dict()`` of this model.

    Returns:
        The filtered state dict.
    """
    if state_dict is None:
        state_dict = super().state_dict()

    trainable = {
        name.replace("_fsdp_wrapped_module.", "")
        for name, param in self.named_parameters()
        if param.requires_grad
    }

    return {
        key: value
        for key, value in state_dict.items()
        if key in self.keep_params or key in trainable
    }
483
 
484
def save_pretrained(
    self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
):
    """Save the checkpoint with only the diffed (trainable + kept) weights."""
    super().save_pretrained(
        *args, state_dict=self.diff_state_dict(state_dict), **kwargs
    )
490
+
491
+ def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
492
  self.keep_params.update(set(state_dict.keys()))
 
493
 
494
  def print_trainable_parameters(self):
495
  """
 
520
  )
521
 
522
 
523
+ # TODO: refactor common parts to a shared module
524
  def is_cache_empty(
525
+ past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]],
526
  ) -> bool:
527
  """
528
  Check if the cache is empty.
 
534
  return past_key_values.get_seq_length() == 0
535
 
536
 
537
+ T = TypeVar("T", bound=torch.nn.Module)
538
+
539
+
540
+ def apply_lora(model: T, lora_config: dict) -> T:
541
  """
542
  Applies LoRA finetuning to the model. If the `r` parameter is set to 0, the model is frozen instead.
543
  """
544
+ unfreeze_layers = lora_config.pop("unfreeze_layers", None)
545
  lora_config = peft.LoraConfig(**lora_config or {})
546
 
547
  if lora_config.r == 0:
548
+ # freeze the model entirely, except for the specified layers
549
+ for name, param in model.named_parameters():
550
+ if not unfreeze_layers or not any(
551
+ re.match(layer, name) for layer in unfreeze_layers
552
+ ):
553
+ param.requires_grad = False
554
+ else:
555
+ logging.info(f"Unfreezing layer: {name} with #{param.numel()} params")
556
  else:
557
  model = peft.get_peft_model(model, lora_config)
558
 
 
561
 
562
  class StackAudioFrames(nn.Module):
563
  """
564
+ Stack the audio embedding frames to reduce the sequence length by a factor
565
+ of `stack_factor`.
 
 
 
 
566
  """
567
 
568
  def __init__(self, stack_factor: int = 8):
 
572
  def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
573
  B, T, C = audio_embeds.shape
574
  T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor
575
+ audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T))
576
  B, T, C = audio_embeds.shape
577
  audio_embeds = audio_embeds.view(
578
  B, T // self.stack_factor, C * self.stack_factor
 
592
  return F.silu(gate) * x
593
 
594
 
595
class UltravoxProjector(nn.Module):
    """Adapter that maps audio-encoder features into the text model's embedding space.

    Frames from the audio tower are stacked `stack_factor` at a time (shrinking the
    sequence length, growing the channel count), then passed through a two-layer MLP.
    Depending on `config.projector_ln_mid`, RMSNorm is applied either after the first
    linear layer (Ultravox v0.5.0+) or after the second (v0.4.1 and below).
    """

    def __init__(self, config: UltravoxConfig):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self._pad_and_stack = StackAudioFrames(config.stack_factor)

        in_features = config.audio_config.hidden_size * config.stack_factor
        out_features = config.text_config.hidden_size

        self.ln_pre = RMSNorm(in_features, init=config.norm_init)
        self.linear_1 = nn.Linear(in_features, self.hidden_dim, bias=False)
        self.act = transformers.activations.get_activation(config.projector_act)
        # SwiGLU splits its input in half (gate/value), so the activation output —
        # and therefore the second linear layer's input — has half the channels.
        mid_features = (
            self.hidden_dim // 2 if config.projector_act == "swiglu" else self.hidden_dim
        )
        self.linear_2 = nn.Linear(mid_features, out_features, bias=False)

        # Ultravox v0.4.1 and below uses layer_norm after the second linear layer,
        # while v0.5.0 and above uses layer_norm after the first linear layer.
        if config.projector_ln_mid:
            self.ln_mid: nn.Module = RMSNorm(mid_features, init=config.norm_init)
            self.ln_post: nn.Module = nn.Identity()
        else:
            self.ln_mid = nn.Identity()
            self.ln_post = RMSNorm(out_features, init=config.norm_init)

    def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
        """Project audio features (B, F, C) to text embeddings (B, T, D).

        F encoder frames are stacked S = `stack_factor` at a time, zero-padding the
        tail when F is not a multiple of S, giving T = ceil(F / S) output embeddings.
        H is the projector hidden size and D the text model's hidden size.
        """
        x = self._pad_and_stack(audio_features)  # B, F, C -> B, T, C*S
        x = self.ln_pre(x)
        x = self.linear_1(x)  # B, T, C*S -> B, T, H
        x = self.act(x)  # B, T, H -> B, T, H/2 when swiglu
        x = self.ln_mid(x)
        x = self.linear_2(x)  # -> B, T, D
        return self.ln_post(x)
651
 
652
 
653
+ class ModifiedWhisperEncoder(
654
+ whisper.WhisperEncoder, transformers.modeling_utils.ModuleUtilsMixin
655
+ ):
656
  """
657
  Encoder portion of OpenAI's Whisper model.
658
 
 
666
  """
667
 
668
    # Prefix under which these weights live inside a full Whisper checkpoint.
    base_model_prefix = "model.encoder"
    # Keep each Whisper encoder layer intact when sharding the model across devices.
    _no_split_modules = ["WhisperEncoderLayer"]
    # Full Whisper checkpoints also carry decoder weights; ignore them when loading
    # into this encoder-only module.
    _keys_to_ignore_on_load_unexpected = ["model.decoder.*"]
671
+
672
    def __init__(self, config: transformers.WhisperConfig):
        """Initialize the encoder and force encoder (non-causal) behavior."""
        super().__init__(config)
        # This module is used standalone as an encoder, so make sure the config
        # never marks it as a decoder regardless of the loaded checkpoint.
        self.config.is_decoder = False
675
+
676
+ @property
677
+ def max_context_length(self):
678
+ return (
679
+ self.config.max_source_positions
680
+ * self.conv1.stride[0]
681
+ * self.conv2.stride[0]
682
+ )
683
+
684
+ def init_latency_mask(
685
+ self, audio_latency_block_size: int | None, dtype: torch.dtype
686
+ ):
687
+ if audio_latency_block_size is None:
688
+ self.audio_streaming_mask = None
689
+ return
690
+
691
+ # Use max_context_length directly in the calculation
692
+ max_seqlen = self.max_context_length
693
+ assert (
694
+ max_seqlen > 0
695
+ ), f"maximum sequence length must be positive, got {max_seqlen}"
696
+ assert (
697
+ max_seqlen % audio_latency_block_size == 0
698
+ ), f"audio_latency_block_size {audio_latency_block_size} must divide {max_seqlen} evenly."
699
+ # Given the block size, we calculate number of blocks.
700
+ audio_latency_nblocks = max_seqlen // audio_latency_block_size
701
+ audio_streaming_mask = (
702
+ torch.tril(
703
+ torch.ones(audio_latency_nblocks, audio_latency_nblocks),
704
+ diagonal=0,
705
+ )
706
+ .repeat_interleave(audio_latency_block_size, dim=0)
707
+ .repeat_interleave(audio_latency_block_size, dim=1)
708
+ )
709
+ audio_streaming_mask = (1.0 - audio_streaming_mask) * torch.finfo(dtype).min
710
+ audio_streaming_mask = audio_streaming_mask[None, None, :, :]
711
+ self.register_buffer(
712
+ "audio_streaming_mask", audio_streaming_mask, persistent=False
713
+ )
714
 
715
  def forward(
716
  self,
717
  input_features,
718
+ audio_len=None,
719
  head_mask=None,
720
  output_attentions=None,
721
  output_hidden_states=None,
722
  return_dict=None,
723
  ):
724
+ expected_seq_length = self.max_context_length
 
 
 
 
725
  if input_features.shape[-1] > expected_seq_length:
726
  raise ValueError(
727
  f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
 
754
  encoder_states = () if output_hidden_states else None
755
  all_attentions = () if output_attentions else None
756
 
757
+ # Create attention mask based on audio lengths to mask out padding tokens
758
+ # For each sample in batch:
759
+ # - Convert raw audio length to feature length after convolutions
760
+ # - Create boolean mask that is True for valid positions and False for padding
761
+ # - Convert to extended attention mask format expected by transformer layers
762
+ # (1.0 for positions to attend to, large negative for positions to ignore)
763
+ # This masking ensures consistent behavior between training and inference
764
+ # by preventing the model from attending to padding tokens in both cases
765
+ attention_mask = None
766
+ if audio_len != None:
767
+ audio_feature_len = self._get_feat_extract_output_lengths(audio_len)
768
+ max_seq_len = hidden_states.shape[1]
769
+ attention_mask = torch.arange(max_seq_len, device=hidden_states.device)[
770
+ None, :
771
+ ].lt(audio_feature_len.view(-1, 1))
772
+ attention_mask = self.get_extended_attention_mask(
773
+ attention_mask,
774
+ None,
775
+ dtype=hidden_states.dtype,
776
+ )
777
+
778
+ if self.audio_streaming_mask is not None:
779
+ seqlen = hidden_states.size(-2)
780
+ if attention_mask is not None:
781
+ attention_mask = torch.minimum(
782
+ self.audio_streaming_mask[:, :, :seqlen, :seqlen], attention_mask
783
+ ) # merge
784
+ else:
785
+ attention_mask = self.audio_streaming_mask[:, :, :seqlen, :seqlen]
786
+ attention_mask = attention_mask.to(hidden_states.dtype)
787
+
788
  # check if head_mask has a correct number of layers specified if desired
789
  if head_mask is not None:
790
  assert head_mask.size()[0] == (
 
808
  layer_outputs = self._gradient_checkpointing_func(
809
  encoder_layer.__call__,
810
  hidden_states,
811
+ attention_mask,
812
  (head_mask[idx] if head_mask is not None else None),
813
  output_attentions,
814
  )
815
  else:
816
  layer_outputs = encoder_layer(
817
  hidden_states,
818
+ attention_mask,
819
  layer_head_mask=(
820
  head_mask[idx] if head_mask is not None else None
821
  ),
 
850
# Register Ultravox with the HF Auto* factories so AutoConfig/AutoModel
# resolve the "ultravox" model type to these classes.
transformers.AutoConfig.register("ultravox", UltravoxConfig)
transformers.AutoModel.register(UltravoxConfig, UltravoxModel)

# Expose SwiGLU under the activation-registry name referenced by the
# projector's `projector_act` config value.
transformers.activations.ACT2FN["swiglu"] = SwiGLU