From 1cb823d078ec4825eb12ca16eb961e8e54ea4b66 Mon Sep 17 00:00:00 2001
From: Farzad Abdolhosseini
Date: Wed, 7 May 2025 14:43:46 -0700
Subject: [PATCH 1/5] audio token fixes for llama 4 and gemma 3 support (squashed)

Signed-off-by: Farzad Abdolhosseini
---
 vllm/model_executor/models/registry.py      |  1 +
 vllm/model_executor/models/ultravox.py      | 37 ++++++++++++---------
 vllm/transformers_utils/config.py           |  2 +-
 vllm/transformers_utils/configs/ultravox.py | 22 +++++++-----
 4 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index faeaf6ef68cc..55b7a43d5f25 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -77,6 +77,7 @@
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 94f5e03fd446..fa262c349134 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -39,9 +39,7 @@
                     merge_multimodal_embeddings,
                     merge_multimodal_embeddings_from_map)
 
-_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>"
-_AUDIO_PLACEHOLDER_TOKEN = 128002
-_AUDIO_TOKENS_PER_SECOND = 6.25
+_AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>"
 _MAX_ENCODER_BATCH_SIZE = 16
 
@@ -80,14 +78,15 @@ def get_hf_processor(
         self,
         *,
         sampling_rate: Optional[int] = None,
         **kwargs: object,
     ) -> ProcessorMixin:
+        config = self.ctx.model_config.hf_config
         hf_processor = self.ctx.get_hf_processor(**kwargs)
 
         # NOTE: Ultravox processing definition uses '<|eot_id|>' as the
         # placeholder that will cause confusion with the actual end of turn
-        # token, thus we override placeholder with a reserved special
-        # token.
+        # token, thus we override placeholder with a reserved token.
         hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
-        hf_processor.audio_replacement_token_id = _AUDIO_PLACEHOLDER_TOKEN
+        hf_processor.audio_replacement_token_id = config.audio_token_index
+
         return hf_processor
 
     def get_feature_extractor(
@@ -268,7 +267,7 @@ def __init__(self, config: UltravoxConfig):
         else:
             self.act = get_act_fn(config.projector_act)
 
-        dim_out = config.text_config.hidden_size
+        dim_out = config.text_hidden_size
         self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False)
 
         # Ultravox v0.4.1 and below use layer_norm after the second linear layer
@@ -559,9 +558,13 @@ def get_input_embeddings(
         input_ids: torch.Tensor,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-                and len(multimodal_embeddings) != 0:
+        # The audio token index is not included in the embedding table
+        # We need to remove it before embedding lookup
+        safe_input_ids = input_ids.clone()
+        safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0
+        inputs_embeds = self.language_model.get_input_embeddings(
+            safe_input_ids)
+        if multimodal_embeddings is not None:
             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1:
@@ -572,7 +575,7 @@ get_input_embeddings(
             else:
                 inputs_embeds = merge_multimodal_embeddings(
                     input_ids, inputs_embeds, multimodal_embeddings,
-                    _AUDIO_PLACEHOLDER_TOKEN)
+                    self.config.audio_token_index)
         return inputs_embeds
 
     def forward(self,
@@ -610,10 +613,14 @@ def forward(self,
                                                       multimodal_embeddings)
             input_ids = None
 
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        language_model = self.language_model
+        if hasattr(language_model, "language_model"):
+            language_model = language_model.language_model
+
+        hidden_states = language_model.model(input_ids,
+                                             positions,
+                                             intermediate_tensors,
+                                             inputs_embeds=inputs_embeds)
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 52a7a903cd8e..c124f8b27f69 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -800,7 +800,7 @@ def get_hf_text_config(config: PretrainedConfig):
         # thinker_config.text_config.
         return config.thinker_config.text_config
 
-    text_config = config.get_text_config()
+    text_config = config.get_text_config().get_text_config()
 
     if text_config is not config:
         # The code operates under the assumption that text_config should have
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index 62f63b02d49a..87064cc12ded 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -45,6 +45,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
     """
 
     model_type = "ultravox"
+    audio_token = "<|audio|>"
     is_composition = False
 
     def __init__(
@@ -80,29 +81,32 @@ def __init__(
             # Avoid circular import
             from vllm.transformers_utils.config import get_config
 
-            self.text_config = get_config(text_model_id,
-                                          trust_remote_code=False)
+            text_config_obj = get_config(text_model_id,
+                                         trust_remote_code=False)
         else:
             text_config = text_config or {}
-            self.text_config = transformers.CONFIG_MAPPING[text_config.get(
+            text_config_obj = transformers.CONFIG_MAPPING[text_config.get(
                 "model_type", "llama")](**text_config)
 
+        inner_text_config = text_config_obj.get_text_config()
+
         if audio_model_id is not None:
             # Avoid circular import
             from vllm.transformers_utils.config import get_config
 
-            self.audio_config = get_config(audio_model_id,
-                                           trust_remote_code=False)
+            audio_config = get_config(audio_model_id, trust_remote_code=False)
         else:
             audio_config = audio_config or {}
-            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
+            audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                 "model_type", "whisper")](**audio_config)
 
+        self.text_config = text_config_obj
+        self.audio_config = audio_config
         self.text_model_lora_config = text_model_lora_config or {}
         self.audio_model_lora_config = audio_model_lora_config or {}
 
-        self.vocab_size = self.text_config.vocab_size
-
-        self.initializer_range = self.text_config.initializer_range
+        self.vocab_size = inner_text_config.vocab_size
+        self.initializer_range = inner_text_config.initializer_range
+        self.text_hidden_size = inner_text_config.hidden_size
 
         super().__init__(**kwargs)

From fb573620999b57d8ac777f4375c4081bd4fd6027 Mon Sep 17 00:00:00 2001
From: Patrick Li
Date: Wed, 23 Jul 2025 12:05:27 -0700
Subject: [PATCH 2/5] Remove Llama4ForCausalLM from model registry

Signed-off-by: Patrick Li
---
 vllm/model_executor/models/registry.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 55f33d8704e0..2aaac7798fc0 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -88,7 +88,6 @@
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),

From 02c9ca8cdc7d6b78cdcfa9d873db64bcb9b1d0c6 Mon Sep 17 00:00:00 2001
From: Patrick Li
Date: Thu, 24 Jul 2025 15:44:01 -0700
Subject: [PATCH 3/5] Set is_available_online=False for Llama4ForCausalLM

Signed-off-by: Patrick Li
---
 tests/models/registry.py               | 2 ++
 vllm/model_executor/models/registry.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 84ca0bc60003..bba471a77c3d 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -218,6 +218,8 @@ def check_available_online(
                              "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}),  # noqa: E501
     "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                         is_available_online=False),
+    "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
+                                         is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
     "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
     "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 2aaac7798fc0..55f33d8704e0 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -88,6 +88,7 @@
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),

From 2adaea58cf47924c1752b00279c580212e01e2d4 Mon Sep 17 00:00:00 2001
From: Patrick Li
Date: Fri, 25 Jul 2025 10:18:27 -0700
Subject: [PATCH 4/5] Makes sure multimodal_embeddings is not an empty list in get_input_embeddings for text only inputs for v0 path

Signed-off-by: Patrick Li
---
 vllm/model_executor/models/ultravox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index a7a5ca8c2311..c844c1142a17 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -577,7 +577,7 @@ def get_input_embeddings(
         safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0
         inputs_embeds = self.language_model.get_input_embeddings(
            safe_input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1:

From ecaee32ae0bfc22140644b54904509075ffc5112 Mon Sep 17 00:00:00 2001
From: Patrick Li
Date: Fri, 25 Jul 2025 11:08:01 -0700
Subject: [PATCH 5/5] Formatting

Signed-off-by: Patrick Li
---
 vllm/model_executor/models/ultravox.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index c844c1142a17..a4569ccd5a84 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -577,7 +577,8 @@ def get_input_embeddings(
         safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0
         inputs_embeds = self.language_model.get_input_embeddings(
            safe_input_ids)
-        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
+        if multimodal_embeddings is not None and len(
+                multimodal_embeddings) > 0:
             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1:
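
For reference, the pattern these patches converge on in get_input_embeddings can be sketched in isolation. The snippet below is a simplified, self-contained illustration rather than vLLM code: the vocabulary size, hidden size, and audio token index are toy values chosen for the example, and the in-place scatter stands in for vLLM's merge_multimodal_embeddings. It shows why the out-of-vocabulary placeholder id must be masked before the embedding lookup (patch 1) and why the empty-list guard matters for text-only prompts (patches 4 and 5).

# Simplified sketch of the placeholder-masking pattern (not vLLM code).
from typing import Optional

import torch
import torch.nn as nn

VOCAB_SIZE = 128         # toy text vocabulary size (assumption)
HIDDEN_SIZE = 16         # toy hidden size (assumption)
AUDIO_TOKEN_INDEX = 200  # placeholder id outside the embedding table (assumption)

embed_tokens = nn.Embedding(VOCAB_SIZE, HIDDEN_SIZE)


def get_input_embeddings(input_ids: torch.Tensor,
                         audio_embeds: Optional[torch.Tensor]) -> torch.Tensor:
    # The audio placeholder id is not a real vocabulary entry, so map it to a
    # safe id (0) before the lookup to avoid indexing past the table.
    safe_input_ids = input_ids.clone()
    audio_mask = safe_input_ids == AUDIO_TOKEN_INDEX
    safe_input_ids[audio_mask] = 0
    inputs_embeds = embed_tokens(safe_input_ids)

    # Guard against None *and* an empty batch (text-only prompts), mirroring
    # the check added for the v0 path.
    if audio_embeds is not None and len(audio_embeds) > 0:
        # Stand-in for merge_multimodal_embeddings: write the audio embeddings
        # into the placeholder positions.
        inputs_embeds[audio_mask] = audio_embeds.to(inputs_embeds.dtype)
    return inputs_embeds


# Text-only prompt: nothing to merge.
print(get_input_embeddings(torch.tensor([1, 2, 3]), None).shape)
# Prompt with two audio placeholders and two audio embeddings to merge.
mixed_ids = torch.tensor([1, AUDIO_TOKEN_INDEX, AUDIO_TOKEN_INDEX, 3])
print(get_input_embeddings(mixed_ids, torch.randn(2, HIDDEN_SIZE)).shape)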