From 1cb823d078ec4825eb12ca16eb961e8e54ea4b66 Mon Sep 17 00:00:00 2001
From: Farzad Abdolhosseini
Date: Wed, 7 May 2025 14:43:46 -0700
Subject: [PATCH 1/5] audio token fixes for llama 4 and gemma 3 support (squashed)

Signed-off-by: Farzad Abdolhosseini
---
 vllm/model_executor/models/registry.py      |  1 +
 vllm/model_executor/models/ultravox.py      | 37 ++++++++++++---------
 vllm/transformers_utils/config.py           |  2 +-
 vllm/transformers_utils/configs/ultravox.py | 22 +++++++-----
 4 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index faeaf6ef68cc..55b7a43d5f25 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -77,6 +77,7 @@
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 94f5e03fd446..fa262c349134 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -39,9 +39,7 @@
                     merge_multimodal_embeddings,
                     merge_multimodal_embeddings_from_map)
 
-_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>"
-_AUDIO_PLACEHOLDER_TOKEN = 128002
-_AUDIO_TOKENS_PER_SECOND = 6.25
+_AUDIO_PLACEHOLDER_OVERRIDE = "<|audio|>"
 _MAX_ENCODER_BATCH_SIZE = 16
 
@@ -80,14 +78,15 @@ def get_hf_processor(
         self,
         *,
         sampling_rate: Optional[int] = None,
         **kwargs: object,
     ) -> ProcessorMixin:
+        config = self.ctx.model_config.hf_config
         hf_processor = self.ctx.get_hf_processor(**kwargs)
 
         # NOTE: Ultravox processing definition uses '<|eot_id|>' as the
         # placeholder that will cause confusion with the actual end of turn
-        # token, thus we override placeholder with a reserved special
-        # token.
+        # token, thus we override placeholder with a reserved token.
         hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
-        hf_processor.audio_replacement_token_id = _AUDIO_PLACEHOLDER_TOKEN
+        hf_processor.audio_replacement_token_id = config.audio_token_index
+
         return hf_processor
 
     def get_feature_extractor(
@@ -268,7 +267,7 @@ def __init__(self, config: UltravoxConfig):
         else:
             self.act = get_act_fn(config.projector_act)
 
-        dim_out = config.text_config.hidden_size
+        dim_out = config.text_hidden_size
         self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False)
 
         # Ultravox v0.4.1 and below use layer_norm after the second linear layer
@@ -559,9 +558,13 @@ def get_input_embeddings(
         input_ids: torch.Tensor,
         multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
     ) -> torch.Tensor:
-        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-        if multimodal_embeddings is not None \
-                and len(multimodal_embeddings) != 0:
+        # The audio token index is not included in the embedding table
+        # We need to remove it before embedding lookup
+        safe_input_ids = input_ids.clone()
+        safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0
+        inputs_embeds = self.language_model.get_input_embeddings(
+            safe_input_ids)
+        if multimodal_embeddings is not None:
             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1:
@@ -572,7 +575,7 @@ get_input_embeddings(
             else:
                 inputs_embeds = merge_multimodal_embeddings(
                     input_ids, inputs_embeds, multimodal_embeddings,
-                    _AUDIO_PLACEHOLDER_TOKEN)
+                    self.config.audio_token_index)
         return inputs_embeds
 
     def forward(self,
@@ -610,10 +613,14 @@ def forward(self,
                                                       multimodal_embeddings)
             input_ids = None
 
-        hidden_states = self.language_model.model(input_ids,
-                                                  positions,
-                                                  intermediate_tensors,
-                                                  inputs_embeds=inputs_embeds)
+        language_model = self.language_model
+        if hasattr(language_model, "language_model"):
+            language_model = language_model.language_model
+
+        hidden_states = language_model.model(input_ids,
+                                             positions,
+                                             intermediate_tensors,
+                                             inputs_embeds=inputs_embeds)
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 52a7a903cd8e..c124f8b27f69 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -800,7 +800,7 @@ def get_hf_text_config(config: PretrainedConfig):
         # thinker_config.text_config.
         return config.thinker_config.text_config
 
-    text_config = config.get_text_config()
+    text_config = config.get_text_config().get_text_config()
 
     if text_config is not config:
         # The code operates under the assumption that text_config should have
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index 62f63b02d49a..87064cc12ded 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -45,6 +45,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
     """
 
     model_type = "ultravox"
+    audio_token = "<|audio|>"
     is_composition = False
 
     def __init__(
@@ -80,29 +81,32 @@ def __init__(
             # Avoid circular import
             from vllm.transformers_utils.config import get_config
 
-            self.text_config = get_config(text_model_id,
-                                          trust_remote_code=False)
+            text_config_obj = get_config(text_model_id,
+                                         trust_remote_code=False)
         else:
             text_config = text_config or {}
-            self.text_config = transformers.CONFIG_MAPPING[text_config.get(
+            text_config_obj = transformers.CONFIG_MAPPING[text_config.get(
                 "model_type", "llama")](**text_config)
 
+        inner_text_config = text_config_obj.get_text_config()
+
         if audio_model_id is not None:
             # Avoid circular import
             from vllm.transformers_utils.config import get_config
 
-            self.audio_config = get_config(audio_model_id,
-                                           trust_remote_code=False)
+            audio_config = get_config(audio_model_id, trust_remote_code=False)
         else:
             audio_config = audio_config or {}
-            self.audio_config = transformers.CONFIG_MAPPING[audio_config.get(
+            audio_config = transformers.CONFIG_MAPPING[audio_config.get(
                 "model_type", "whisper")](**audio_config)
 
+        self.text_config = text_config_obj
+        self.audio_config = audio_config
         self.text_model_lora_config = text_model_lora_config or {}
         self.audio_model_lora_config = audio_model_lora_config or {}
 
-        self.vocab_size = self.text_config.vocab_size
-
-        self.initializer_range = self.text_config.initializer_range
+        self.vocab_size = inner_text_config.vocab_size
+        self.initializer_range = inner_text_config.initializer_range
+        self.text_hidden_size = inner_text_config.hidden_size
 
         super().__init__(**kwargs)

From fb573620999b57d8ac777f4375c4081bd4fd6027 Mon Sep 17 00:00:00 2001
From: Patrick Li
Date: Wed, 23 Jul 2025 12:05:27 -0700
Subject: [PATCH 2/5] Remove Llama4ForCausalLM from model registry

Signed-off-by: Patrick Li
---
 vllm/model_executor/models/registry.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 55f33d8704e0..2aaac7798fc0 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -88,7 +88,6 @@
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),

From 02c9ca8cdc7d6b78cdcfa9d873db64bcb9b1d0c6 Mon Sep 17 00:00:00 2001
From: Patrick Li
Date: Thu, 24 Jul 2025 15:44:01 -0700
Subject: [PATCH 3/5] Set is_available_online=False for Llama4ForCausalLM

Signed-off-by: Patrick Li
---
 tests/models/registry.py               | 2 ++
 vllm/model_executor/models/registry.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/tests/models/registry.py b/tests/models/registry.py
index 84ca0bc60003..bba471a77c3d 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -218,6 +218,8 @@ def check_available_online(
                              "fp8": "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"}),  # noqa: E501
     "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
                                         is_available_online=False),
+    "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",  # noqa: E501
+                                         is_available_online=False),
     "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
     "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
     "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 2aaac7798fc0..55f33d8704e0 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -88,6 +88,7 @@
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "JambaForCausalLM": ("jamba", "JambaForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),

From 2adaea58cf47924c1752b00279c580212e01e2d4 Mon Sep 17 00:00:00 2001
From: Patrick Li
Date: Fri, 25 Jul 2025 10:18:27 -0700
Subject: [PATCH 4/5] Makes sure multimodal_embeddings is not an empty list in get_input_embeddings for text only inputs for v0 path

Signed-off-by: Patrick Li
---
 vllm/model_executor/models/ultravox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index a7a5ca8c2311..c844c1142a17 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -577,7 +577,7 @@ def get_input_embeddings(
         safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0
         inputs_embeds = self.language_model.get_input_embeddings(
            safe_input_ids)
-        if multimodal_embeddings is not None:
+        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1:

From ecaee32ae0bfc22140644b54904509075ffc5112 Mon Sep 17 00:00:00 2001
From: Patrick Li
Date: Fri, 25 Jul 2025 11:08:01 -0700
Subject: [PATCH 5/5] Formatting

Signed-off-by: Patrick Li
---
 vllm/model_executor/models/ultravox.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index c844c1142a17..a4569ccd5a84 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -577,7 +577,8 @@ def get_input_embeddings(
         safe_input_ids[safe_input_ids == self.config.audio_token_index] = 0
         inputs_embeds = self.language_model.get_input_embeddings(
            safe_input_ids)
-        if multimodal_embeddings is not None and len(multimodal_embeddings) > 0:
+        if multimodal_embeddings is not None and len(
+                multimodal_embeddings) > 0:
             # TODO(ywang96): remove this block after v0 is deprecated.
             if not envs.VLLM_USE_V1:
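
For reference, the pattern these patches converge on in get_input_embeddings can be sketched in isolation. The snippet below is a simplified, self-contained illustration rather than vLLM code: the vocabulary size, hidden size, and audio token index are toy values chosen for the example, and the in-place scatter stands in for vLLM's merge_multimodal_embeddings. It shows why the out-of-vocabulary placeholder id must be masked before the embedding lookup (patch 1) and why the empty-list guard matters for text-only prompts (patches 4 and 5).

# Simplified sketch of the placeholder-masking pattern (not vLLM code).
from typing import Optional

import torch
import torch.nn as nn

VOCAB_SIZE = 128         # toy text vocabulary size (assumption)
HIDDEN_SIZE = 16         # toy hidden size (assumption)
AUDIO_TOKEN_INDEX = 200  # placeholder id outside the embedding table (assumption)

embed_tokens = nn.Embedding(VOCAB_SIZE, HIDDEN_SIZE)


def get_input_embeddings(input_ids: torch.Tensor,
                         audio_embeds: Optional[torch.Tensor]) -> torch.Tensor:
    # The audio placeholder id is not a real vocabulary entry, so map it to a
    # safe id (0) before the lookup to avoid indexing past the table.
    safe_input_ids = input_ids.clone()
    audio_mask = safe_input_ids == AUDIO_TOKEN_INDEX
    safe_input_ids[audio_mask] = 0
    inputs_embeds = embed_tokens(safe_input_ids)

    # Guard against None *and* an empty batch (text-only prompts), mirroring
    # the check added for the v0 path.
    if audio_embeds is not None and len(audio_embeds) > 0:
        # Stand-in for merge_multimodal_embeddings: write the audio embeddings
        # into the placeholder positions.
        inputs_embeds[audio_mask] = audio_embeds.to(inputs_embeds.dtype)
    return inputs_embeds


# Text-only prompt: nothing to merge.
print(get_input_embeddings(torch.tensor([1, 2, 3]), None).shape)
# Prompt with two audio placeholders and two audio embeddings to merge.
mixed_ids = torch.tensor([1, AUDIO_TOKEN_INDEX, AUDIO_TOKEN_INDEX, 3])
print(get_input_embeddings(mixed_ids, torch.randn(2, HIDDEN_SIZE)).shape)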