fixie-ai
/

ultravox-v0_5-llama-3_2-1b

Audio-Text-to-Text

feature-extraction

Model card | Files and versions

trfms-fix

#9

by eustlb HF Staff - opened Feb 24

base: refs/heads/main

←

from: refs/pr/9

Discussion Files changed

Files changed (1) hide show

ultravox_model.py +16 -12

ultravox_model.py CHANGED Viewed

@@ -426,12 +426,14 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
         # We probably don't want to pass tp_plan or device_map to the audio tower
         # But potentially other kwargs can be passed in. TODO
         kwargs = {"torch_dtype": config.torch_dtype}
-        _default_device = getattr(torch, "get_default_device", lambda: None)()
-        _is_init = _default_device is None or _default_device.type != "meta"
-        if (
-            _is_init
-            and config.audio_model_id is not None
-        ):
             if "whisper" in config.audio_model_id.lower():
                 audio_tower = ModifiedWhisperEncoder.from_pretrained(
                     config.audio_model_id, **kwargs
@@ -482,12 +484,14 @@ class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     def _create_language_model(
         cls, config: UltravoxConfig
     ) -> transformers.LlamaForCausalLM:
-        _default_device = getattr(torch, "get_default_device", lambda: None)()
-        _is_init = _default_device is None or _default_device.type != "meta"
-        if (
-            _is_init
-            and config.text_model_id is not None
-        ):
             language_model = transformers.AutoModelForCausalLM.from_pretrained(
                 config.text_model_id,
                 **{

         # We probably don't want to pass tp_plan or device_map to the audio tower
         # But potentially other kwargs can be passed in. TODO
         kwargs = {"torch_dtype": config.torch_dtype}
+        if hasattr(transformers.modeling_utils, "_init_weights"):
+            # v4 path
+            is_init = transformers.modeling_utils._init_weights
+        else:
+            # v5 path
+            _default_device = getattr(torch, "get_default_device", lambda: None)()
+            is_init = _default_device is None or _default_device.type != "meta"
+        if is_init and config.audio_model_id is not None:
             if "whisper" in config.audio_model_id.lower():
                 audio_tower = ModifiedWhisperEncoder.from_pretrained(
                     config.audio_model_id, **kwargs
     def _create_language_model(
         cls, config: UltravoxConfig
     ) -> transformers.LlamaForCausalLM:
+        if hasattr(transformers.modeling_utils, "_init_weights"):
+            # v4 path
+            is_init = transformers.modeling_utils._init_weights
+        else:
+            # v5 path
+            _default_device = getattr(torch, "get_default_device", lambda: None)()
+            is_init = _default_device is None or _default_device.type != "meta"
+        if is_init and config.text_model_id is not None:
             language_model = transformers.AutoModelForCausalLM.from_pretrained(
                 config.text_model_id,
                 **{