From ae475194d39104248c081a6ba0cbd7e5af5d2202 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Tue, 25 Mar 2025 23:24:30 +0800
Subject: [PATCH 01/36] init phi4mm multimodal processor
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 314 +++++++++++++++++++++++++--
1 file changed, 290 insertions(+), 24 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 3d4505d556e2..37a0c30c9bbc 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1,8 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
import math
import re
+from collections.abc import Iterable, Mapping, Sequence
from functools import lru_cache
-from typing import (Dict, Iterable, List, Literal, Mapping, Optional, Tuple,
+from typing import (Any, Dict, List, Literal, Optional, Tuple,
TypedDict, Union)
import numpy as np
@@ -11,7 +12,7 @@
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
-from transformers import PretrainedConfig, SiglipVisionConfig
+from transformers import PretrainedConfig, SiglipVisionConfig, ProcessorMixin, BatchFeature
from transformers.utils import logging
from vllm.config import VllmConfig
@@ -28,7 +29,14 @@
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalInputs, NestedTensors
+from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
+ NestedTensors, MultiModalInputs)
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+ ImageSize, MultiModalDataItems, AudioEmbeddingItems, AudioProcessorItems)
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+ BaseProcessingInfo, PromptReplacement,
+ PromptUpdate, PromptUpdateDetails)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
@@ -121,8 +129,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
return best_ratio
-def _find_target_aspect_ratio(image, image_size, max_num, min_num):
- orig_width, orig_height = image.size
+def _find_target_aspect_ratio(orig_width: int, orig_height: int, image_size: int, max_num: int, min_num: int):
w_crop_num = math.ceil(orig_width / float(image_size))
h_crop_num = math.ceil(orig_height / float(image_size))
@@ -150,8 +157,7 @@ def _find_target_aspect_ratio(image, image_size, max_num, min_num):
return target_aspect_ratio, target_height, target_width
-def _get_padding_size(image, target_height, target_width):
- orig_width, orig_height = image.size
+def _get_padding_size(orig_width: int, orig_height: int, target_height: int, target_width: int):
ratio_width = target_width / orig_width
ratio_height = target_height / orig_height
@@ -169,14 +175,14 @@ def dynamic_preprocess(image,
max_num=12,
image_size=384,
mask_size=27):
+ orig_width, orig_height = image.size
target_aspect_ratio, target_height, target_width =\
_find_target_aspect_ratio(
- image, image_size, max_num, min_num)
+ orig_width, orig_height, image_size, max_num, min_num)
-    padding_height, padding_width = _get_padding_size(image, target_height,
-                                                      target_width)
+    padding_height, padding_width = _get_padding_size(orig_width, orig_height,
+                                                      target_height, target_width)
# Calculate the ratio
- orig_width, orig_height = image.size
ratio_width = target_width / orig_width
ratio_height = target_height / orig_height
if ratio_width < ratio_height:
@@ -858,8 +864,14 @@ def audio_feature_extractor() -> LogFbankProcessor:
return LogFbankProcessor()
-def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size,
- vit_patch_size, token_compression_factor):
+def _compute_num_image_tokens(
+ orig_width: int,
+ orig_height: int,
+ dynamic_hd_size: int,
+ vit_image_size: int,
+ vit_patch_size: int,
+ token_compression_factor: int = 2,
+):
"""
compute the number of tokens an image is expected to take up considering
the image encoder architecture and exclude output features containing
@@ -876,7 +888,8 @@ def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size,
"token_compression_factor"
target_aspect_ratio, target_height, target_width = (
- _find_target_aspect_ratio(image,
+ _find_target_aspect_ratio(orig_width,
+ orig_height,
vit_image_size,
dynamic_hd_size,
min_num=1))
@@ -889,7 +902,7 @@ def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size,
assert (target_height % vit_image_size == 0
and target_width % vit_image_size == 0)
- padding_height, padding_width = _get_padding_size(image, target_height,
+ padding_height, padding_width = _get_padding_size(orig_width, orig_height, target_height,
target_width)
assert padding_width == 0 or padding_height == 0, \
"padding_width or padding_height must be 0"
@@ -1218,7 +1231,7 @@ def input_processor_for_phi4mm(ctx: InputContext,
)
-def _compute_audio_embed_size(hf_config, audio_frames):
+def _compute_audio_embed_size(hf_config: PretrainedConfig, audio_frames: int) -> int:
"""
Compute the audio embedding size based on the audio frames and
compression rate.
@@ -1423,16 +1436,269 @@ def cat_with_pad(tensors, dim, padding_value=0):
return output
-@MULTIMODAL_REGISTRY.register_input_mapper("audio",
- input_mapper_for_phi4mm_audio)
-@MULTIMODAL_REGISTRY.register_input_mapper("image",
- input_mapper_for_phi4mm_image)
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
- "audio", get_max_phi4mm_audio_tokens)
-@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
- "image", get_max_phi4mm_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm)
-@INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
+class Phi4MMProcessingInfo(BaseProcessingInfo):
+
+ @property
+ def image_tokens(self) -> list[str]:
+ return [f"<|image_{i+1}|>" for i in range(100)]
+
+ @property
+ def audio_tokens(self) -> list[str]:
+ return [f"<|audio_{i+1}|>" for i in range(100)]
+
+ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+ return {"image": None, "audio": None}
+
+ def get_mm_max_tokens_per_item(
+ self,
+ seq_len: int,
+ mm_counts: Mapping[str, int],
+ ) -> Mapping[str, int]:
+ return {
+ "image": self.get_max_image_tokens(),
+ "audio": self.get_max_audio_tokens(),
+ }
+
+ def get_max_audio_tokens(self) -> int:
+ return 10000
+
+ def get_max_image_tokens(self) -> int:
+ target_width, target_height = self.get_image_size_with_most_features()
+ return self.get_num_image_tokens(
+ image_width=target_width, image_height=target_height)
+
+ def get_num_image_tokens(
+ self,
+ *,
+ image_width: int,
+ image_height: int,
+ ) -> int:
+ hf_config = self.get_hf_config()
+ vision_encoder_name = hf_config.img_processor
+ if vision_encoder_name is None:
+ vision_encoder_name = SIGLIP_NAME
+ prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
+ dynamic_hd_size = prepro_config['dynamic_hd']
+ vit_image_size = prepro_config['vit_image_size']
+ vit_patch_size = prepro_config['vit_patch_size']
+ token_compression_factor = prepro_config['token_compression_factor']
+
+ image_num_tokens = _compute_num_image_tokens(
+ image_width, image_height,
+ dynamic_hd_size=dynamic_hd_size,
+ vit_image_size=vit_image_size,
+ vit_patch_size=vit_patch_size,
+ token_compression_factor=token_compression_factor,
+ )
+
+ return image_num_tokens
+
+ def get_image_size_with_most_features(self) -> ImageSize:
+ hf_config = self.get_hf_config()
+ vision_encoder_name = hf_config.img_processor
+ if vision_encoder_name is None:
+ vision_encoder_name = SIGLIP_NAME
+ prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
+ dynamic_hd_size = prepro_config['dynamic_hd']
+ vit_image_size = prepro_config['vit_image_size']
+
+ max_side = vit_image_size * dynamic_hd_size
+ return ImageSize(height=max_side, width=vit_image_size)
+
+ def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size,
+ vit_patch_size, token_compression_factor):
+ """
+ compute the number of tokens an image is expected to take up considering
+ the image encoder architecture and exclude output features containing
+ only padding pixels
+
+ for siglip, vit_image_size=448, vit_patch_size=14, so output will be
+ 32x32 feature map
+ NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
+ """
+ assert vit_image_size % vit_patch_size == 0, \
+ "vit_image_size must be divisible by vit_patch_size"
+ assert vit_image_size // vit_patch_size % token_compression_factor == 0, \
+ "vit_image_size // vit_patch_size must be divisible by "\
+ "token_compression_factor"
+
+ target_aspect_ratio, target_height, target_width = (
+ _find_target_aspect_ratio(image,
+ vit_image_size,
+ dynamic_hd_size,
+ min_num=1))
+ assert target_aspect_ratio[
+ 0] * vit_image_size == target_width, \
+ f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}"
+ assert target_aspect_ratio[
+ 1] * vit_image_size == target_height, \
+ f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}"
+ assert (target_height % vit_image_size == 0
+ and target_width % vit_image_size == 0)
+
+ padding_height, padding_width = _get_padding_size(image, target_height,
+ target_width)
+ assert padding_width == 0 or padding_height == 0, \
+ "padding_width or padding_height must be 0"
+
+ target_feat_width = target_width // vit_patch_size
+ target_feat_height = target_height // vit_patch_size
+ if padding_width >= vit_patch_size:
+ assert padding_height == 0, "padding_height not 0"
+ non_pad_feat_width = target_feat_width - math.floor(
+ padding_width / vit_patch_size)
+ non_pad_feat_height = target_feat_height
+ elif padding_height >= vit_patch_size:
+ assert padding_width == 0, "padding_width not 0"
+ non_pad_feat_height = target_feat_height - math.floor(
+ padding_height / vit_patch_size)
+ non_pad_feat_width = target_feat_width
+ else:
+ # small padding shorter than a vit patch
+ non_pad_feat_width = target_feat_width
+ non_pad_feat_height = target_feat_height
+
+ feat_width = non_pad_feat_width // token_compression_factor
+ feat_height = non_pad_feat_height // token_compression_factor
+ # NOTE it's possible that the non-padding feature is not divisible
+ if non_pad_feat_width % token_compression_factor != 0:
+ feat_width += 1
+ if non_pad_feat_height % token_compression_factor != 0:
+ feat_height += 1
+ num_hd_patch_tokens = feat_width * feat_height
+ num_hd_newline_tokens = feat_height
+ vit_feature_size = vit_image_size // vit_patch_size
+ num_global_image_tokens = (vit_feature_size // token_compression_factor)**2
+ num_sep_tokens = 1
+ num_global_image_newline_tokens = \
+ vit_feature_size // token_compression_factor
+
+ return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens +
+ num_hd_newline_tokens + num_global_image_newline_tokens)
+
+
+class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
+
+ def get_dummy_processor_inputs(
+ self,
+ seq_len: int,
+ mm_counts: Mapping[str, int],
+ ) -> ProcessorInputs:
+ num_audios = mm_counts.get("audio", 0)
+ num_images = mm_counts.get("image", 0)
+
+ target_width, target_height = \
+ self.info.get_image_size_with_most_features()
+
+ mm_data = {
+ "image":
+ self._get_dummy_images(width=target_width,
+ height=target_height,
+ num_images=num_images),
+ "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
+ num_audios=num_audios),
+ }
+
+ image_tokens: list[str] = self.info.image_tokens[:num_images]
+ audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
+
+ return ProcessorInputs(
+ prompt_text="".join(image_tokens + audio_tokens),
+ mm_data=mm_data,
+ )
+
+
+class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
+
+ def _get_mm_fields_config(
+ self,
+ hf_inputs: BatchFeature,
+ hf_processor_mm_kwargs: Mapping[str, object],
+ ) -> Mapping[str, MultiModalFieldConfig]:
+ return dict(
+ pixel_values=MultiModalFieldConfig.batched("image"),
+ image_sizes=MultiModalFieldConfig.batched("image"),
+ image_embeds=MultiModalFieldConfig.batched("image"),
+ )
+
+ def _get_prompt_updates(
+ self,
+ mm_items: MultiModalDataItems,
+ hf_processor_mm_kwargs: Mapping[str, Any],
+ out_mm_kwargs: MultiModalKwargs,
+ ) -> Sequence[PromptUpdate]:
+ hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+ image_tokens: list[str] = self.info.image_tokens # type: ignore
+ audio_tokens: list[str] = self.info.audio_tokens # type: ignore
+
+ tokenizer = self.info.get_tokenizer()
+ bos_token_id = tokenizer.bos_token_id
+ assert isinstance(bos_token_id, int)
+
+ def get_image_replacement_phi4mm(item_idx: int):
+ images = mm_items.get_items(
+ "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+ if isinstance(images, ImageEmbeddingItems):
+ num_image_tokens = images.get_feature_size(item_idx)
+ else:
+ image_size = images.get_image_size(item_idx)
+ num_image_tokens = self.info.get_num_image_tokens(
+ image_width=image_size.width,
+ image_height=image_size.height,
+ processor=hf_processor,
+ )
+
+ image_tokens = [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens
+
+ return image_tokens
+
+ def get_audio_replacement_phi4mm(item_idx: int):
+ audios = mm_items.get_items("audio", AudioProcessorItems)
+ # TODO(Isotr0py): support embedding inputs
+ audio_len = audios.get_audio_length(item_idx)
+ audio_frames, _ = compute_logfbank_output_size(audio_len, DUMMY_SAMPLING_FREQUENCY)
+ audio_embed_size = _compute_audio_embed_size(self.info.get_hf_config(), audio_frames)
+
+ audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
+
+ return audio_tokens
+
+ num_images = mm_items.get_count("image", strict=False)
+ num_audios = mm_items.get_count("audio", strict=False)
+
+ image_repl = [
+ PromptReplacement(
+ modality="image",
+ target=image_token,
+ replacement=get_image_replacement_phi4mm,
+ ) for image_token in image_tokens[:num_images]
+ ]
+ audio_repl = [
+ PromptReplacement(
+ modality="image",
+ target=audio_token,
+ replacement=get_audio_replacement_phi4mm,
+ ) for audio_token in audio_tokens[:num_audios]
+ ]
+ return image_repl + audio_repl
+
+
+# @MULTIMODAL_REGISTRY.register_input_mapper("audio",
+# input_mapper_for_phi4mm_audio)
+# @MULTIMODAL_REGISTRY.register_input_mapper("image",
+# input_mapper_for_phi4mm_image)
+# @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
+# "audio", get_max_phi4mm_audio_tokens)
+# @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
+# "image", get_max_phi4mm_image_tokens)
+# @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm)
+# @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
+@MULTIMODAL_REGISTRY.register_processor(
+ Phi4MMMultiModalProcessor,
+ info=Phi4MMProcessingInfo,
+ dummy_inputs=Phi4MMDummyInputsBuilder,
+)
class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal,
SupportsV0Only):
"""
From 1a3e9c5825d0065bde08f7c0f5cc22bb1d60eea3 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Tue, 25 Mar 2025 23:28:07 +0800
Subject: [PATCH 02/36] remove unused func
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 71 ----------------------------
1 file changed, 71 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 37a0c30c9bbc..36f4e16a5412 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1505,77 +1505,6 @@ def get_image_size_with_most_features(self) -> ImageSize:
max_side = vit_image_size * dynamic_hd_size
return ImageSize(height=max_side, width=vit_image_size)
- def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size,
- vit_patch_size, token_compression_factor):
- """
- compute the number of tokens an image is expected to take up considering
- the image encoder architecture and exclude output features containing
- only padding pixels
-
- for siglip, vit_image_size=448, vit_patch_size=14, so output will be
- 32x32 feature map
- NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
- """
- assert vit_image_size % vit_patch_size == 0, \
- "vit_image_size must be divisible by vit_patch_size"
- assert vit_image_size // vit_patch_size % token_compression_factor == 0, \
- "vit_image_size // vit_patch_size must be divisible by "\
- "token_compression_factor"
-
- target_aspect_ratio, target_height, target_width = (
- _find_target_aspect_ratio(image,
- vit_image_size,
- dynamic_hd_size,
- min_num=1))
- assert target_aspect_ratio[
- 0] * vit_image_size == target_width, \
- f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}"
- assert target_aspect_ratio[
- 1] * vit_image_size == target_height, \
- f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}"
- assert (target_height % vit_image_size == 0
- and target_width % vit_image_size == 0)
-
- padding_height, padding_width = _get_padding_size(image, target_height,
- target_width)
- assert padding_width == 0 or padding_height == 0, \
- "padding_width or padding_height must be 0"
-
- target_feat_width = target_width // vit_patch_size
- target_feat_height = target_height // vit_patch_size
- if padding_width >= vit_patch_size:
- assert padding_height == 0, "padding_height not 0"
- non_pad_feat_width = target_feat_width - math.floor(
- padding_width / vit_patch_size)
- non_pad_feat_height = target_feat_height
- elif padding_height >= vit_patch_size:
- assert padding_width == 0, "padding_width not 0"
- non_pad_feat_height = target_feat_height - math.floor(
- padding_height / vit_patch_size)
- non_pad_feat_width = target_feat_width
- else:
- # small padding shorter than a vit patch
- non_pad_feat_width = target_feat_width
- non_pad_feat_height = target_feat_height
-
- feat_width = non_pad_feat_width // token_compression_factor
- feat_height = non_pad_feat_height // token_compression_factor
- # NOTE it's possible that the non-padding feature is not divisible
- if non_pad_feat_width % token_compression_factor != 0:
- feat_width += 1
- if non_pad_feat_height % token_compression_factor != 0:
- feat_height += 1
- num_hd_patch_tokens = feat_width * feat_height
- num_hd_newline_tokens = feat_height
- vit_feature_size = vit_image_size // vit_patch_size
- num_global_image_tokens = (vit_feature_size // token_compression_factor)**2
- num_sep_tokens = 1
- num_global_image_newline_tokens = \
- vit_feature_size // token_compression_factor
-
- return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens +
- num_hd_newline_tokens + num_global_image_newline_tokens)
-
class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
From 41e00f6240cad5c317c77b01c00ee51bd2a2bd79 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 27 Mar 2025 00:35:09 +0800
Subject: [PATCH 03/36] make image inference work
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 34 ++++++++++++++++++----------
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 36f4e16a5412..8588750e144a 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1447,7 +1447,7 @@ def audio_tokens(self) -> list[str]:
return [f"<|audio_{i+1}|>" for i in range(100)]
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
- return {"image": None, "audio": None}
+ return {"image": None}
def get_mm_max_tokens_per_item(
self,
@@ -1456,7 +1456,7 @@ def get_mm_max_tokens_per_item(
) -> Mapping[str, int]:
return {
"image": self.get_max_image_tokens(),
- "audio": self.get_max_audio_tokens(),
+ # "audio": self.get_max_audio_tokens(),
}
def get_max_audio_tokens(self) -> int:
@@ -1513,7 +1513,7 @@ def get_dummy_processor_inputs(
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
- num_audios = mm_counts.get("audio", 0)
+ # num_audios = mm_counts.get("audio", 0)
num_images = mm_counts.get("image", 0)
target_width, target_height = \
@@ -1524,21 +1524,36 @@ def get_dummy_processor_inputs(
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images),
- "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
- num_audios=num_audios),
+ # "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
+ # num_audios=num_audios),
}
image_tokens: list[str] = self.info.image_tokens[:num_images]
- audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
+ # audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
return ProcessorInputs(
- prompt_text="".join(image_tokens + audio_tokens),
+ prompt_text="".join(image_tokens),
mm_data=mm_data,
)
class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
+ def _call_hf_processor(
+ self,
+ prompt: str,
+ mm_data: Mapping[str, object],
+ mm_kwargs: Mapping[str, object],
+ ) -> BatchFeature:
+ if mm_data:
+ processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs)
+ else:
+ tokenizer = self.info.get_tokenizer()
+ processed_outputs = tokenizer(prompt,
+ add_special_tokens=True,
+ return_tensors="pt")
+ return processed_outputs
+
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
@@ -1560,10 +1575,6 @@ def _get_prompt_updates(
image_tokens: list[str] = self.info.image_tokens # type: ignore
audio_tokens: list[str] = self.info.audio_tokens # type: ignore
- tokenizer = self.info.get_tokenizer()
- bos_token_id = tokenizer.bos_token_id
- assert isinstance(bos_token_id, int)
-
def get_image_replacement_phi4mm(item_idx: int):
images = mm_items.get_items(
"image", (ImageEmbeddingItems, ImageProcessorItems))
@@ -1575,7 +1586,6 @@ def get_image_replacement_phi4mm(item_idx: int):
num_image_tokens = self.info.get_num_image_tokens(
image_width=image_size.width,
image_height=image_size.height,
- processor=hf_processor,
)
image_tokens = [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens
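Note (usage sketch, not part of the patch): with the merged processor registered, the image path enabled here can be exercised through vLLM's offline API roughly as below. The prompt template, max_model_len and limit_mm_per_prompt values are assumptions for illustration, not taken from this series.

from PIL import Image
from vllm import LLM, SamplingParams

# Hypothetical smoke test for the image inference path enabled by this patch.
llm = LLM(model="microsoft/Phi-4-multimodal-instruct",
          trust_remote_code=True,
          max_model_len=4096,
          limit_mm_per_prompt={"image": 1})

image = Image.new("RGB", (448, 448), color="white")  # stand-in image
prompt = "<|user|><|image_1|>Describe this image.<|end|><|assistant|>"

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)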
From 373d0a82aeedda788411857827e10f2b896c103d Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 27 Mar 2025 15:37:08 +0800
Subject: [PATCH 04/36] image work
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 8588750e144a..fc9770de3320 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1547,6 +1547,12 @@ def _call_hf_processor(
) -> BatchFeature:
if mm_data:
processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs)
+ num_img_tokens = [
+ self.info.get_num_image_tokens(image_width=img_size[1], image_height=img_size[0])
+ for img_size in processed_outputs["image_sizes"]
+ ]
+ processed_outputs["num_img_tokens"] = num_img_tokens
+ processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds')
else:
tokenizer = self.info.get_tokenizer()
processed_outputs = tokenizer(prompt,
@@ -1561,8 +1567,9 @@ def _get_mm_fields_config(
) -> Mapping[str, MultiModalFieldConfig]:
return dict(
pixel_values=MultiModalFieldConfig.batched("image"),
+ image_attention_mask=MultiModalFieldConfig.batched("image"),
image_sizes=MultiModalFieldConfig.batched("image"),
- image_embeds=MultiModalFieldConfig.batched("image"),
+ num_img_tokens=MultiModalFieldConfig.batched("image"),
)
def _get_prompt_updates(
From a3f972596e0f185d1fab4216fbe7fdd85118f7e6 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 27 Mar 2025 23:46:21 +0800
Subject: [PATCH 05/36] fix multi images
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index fc9770de3320..02b03fb1db89 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1445,6 +1445,11 @@ def image_tokens(self) -> list[str]:
@property
def audio_tokens(self) -> list[str]:
return [f"<|audio_{i+1}|>" for i in range(100)]
+
+ @property
+ def dynamic_hd(self) -> int:
+ image_processor = self.get_hf_processor().image_processor
+ return image_processor.dynamic_hd
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
@@ -1478,11 +1483,12 @@ def get_num_image_tokens(
if vision_encoder_name is None:
vision_encoder_name = SIGLIP_NAME
prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
- dynamic_hd_size = prepro_config['dynamic_hd']
vit_image_size = prepro_config['vit_image_size']
vit_patch_size = prepro_config['vit_patch_size']
token_compression_factor = prepro_config['token_compression_factor']
+ dynamic_hd_size = self.dynamic_hd
+
image_num_tokens = _compute_num_image_tokens(
image_width, image_height,
dynamic_hd_size=dynamic_hd_size,
@@ -1499,10 +1505,9 @@ def get_image_size_with_most_features(self) -> ImageSize:
if vision_encoder_name is None:
vision_encoder_name = SIGLIP_NAME
prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
- dynamic_hd_size = prepro_config['dynamic_hd']
vit_image_size = prepro_config['vit_image_size']
- max_side = vit_image_size * dynamic_hd_size
+ max_side = vit_image_size * self.dynamic_hd
return ImageSize(height=max_side, width=vit_image_size)
@@ -1578,7 +1583,6 @@ def _get_prompt_updates(
hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs,
) -> Sequence[PromptUpdate]:
- hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_tokens: list[str] = self.info.image_tokens # type: ignore
audio_tokens: list[str] = self.info.audio_tokens # type: ignore
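Note (sketch, not part of the patch): the dynamic_hd property added above pulls the HD tile budget from the checkpoint's Hugging Face image processor instead of the hard-coded VISION_ENCODER_TO_PROCESSING_CONFIG entry. A minimal way to inspect that value (downloads the processor config; the attribute name is taken from the patch, the model id is an assumption):

from transformers import AutoProcessor

# Sketch only: show where dynamic_hd now comes from.
processor = AutoProcessor.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)
print(processor.image_processor.dynamic_hd)  # HD tile budget used for cropping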
From 83ce87c2fb97a6998a696b531a8d9147443b3303 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 28 Mar 2025 00:10:48 +0800
Subject: [PATCH 06/36] init v1
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 64 ++++++++++++++++++++++++++--
1 file changed, 60 insertions(+), 4 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 02b03fb1db89..d1df03f2d774 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -41,9 +41,9 @@
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .idefics2_vision_model import Idefics2VisionTransformer
-from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsV0Only
+from .interfaces import SupportsLoRA, SupportsMultiModal, MultiModalEmbeddings
from .phi4mm_audio import AudioEmbedding
-from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
+from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix, merge_multimodal_embeddings
# <|endoftext10|> (see vocab.json in hf model)
_IMAGE_PLACEHOLDER_TOKEN_ID = 200010
@@ -1649,8 +1649,7 @@ def get_audio_replacement_phi4mm(item_idx: int):
info=Phi4MMProcessingInfo,
dummy_inputs=Phi4MMDummyInputsBuilder,
)
-class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal,
- SupportsV0Only):
+class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
"""
Implements the Phi-4-multimodal-instruct model in vLLM.
"""
@@ -1930,6 +1929,63 @@ def merge_image_features_to_inputs_embeds(
)
return merged_embeds
+ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+ modalities = {}
+
+ # Preserve the order of modalities if there are multiple of them
+ # from the order of kwargs.
+ for input_key in kwargs:
+ if input_key in ("pixel_values",
+ "image_embeds") and "images" not in modalities:
+ modalities["images"] = self._parse_and_validate_image_input(
+ **kwargs)
+ if input_key in ("audio_features",
+ "audio_embeds") and "audios" not in modalities:
+ modalities["audios"] = self._parse_and_validate_audio_input(
+ **kwargs)
+
+ return modalities
+
+ def get_multimodal_embeddings(
+ self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+
+ modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+ if not modalities:
+ return None
+
+ # The result multimodal_embeddings is a tuple of tensors, with each
+ # tensor corresponding to a multimodal data item (image or audio).
+ multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+ # NOTE: It is important to iterate over the keys in this dictionary
+ # to preserve the order of the modalities.
+ audio_projection_mode = 'speech'
+ for modality in modalities:
+ # make sure process images first
+ if modality == "images":
+ audio_projection_mode = "vision"
+ image_input = modalities["images"]
+ vision_embeddings = self._process_image_input(image_input)
+ multimodal_embeddings += vision_embeddings
+ # if modality == "audios":
+ # audio_input = modalities["audios"]
+ # audio_embeddings = self._process_audio_input(audio_input)
+ # multimodal_embeddings += audio_embeddings
+
+ return multimodal_embeddings
+
+ def get_input_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+ ) -> torch.Tensor:
+ inputs_embeds = self.model.embed_tokens(input_ids)
+ if multimodal_embeddings is not None:
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids, inputs_embeds, multimodal_embeddings,
+ [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
+ return inputs_embeds
+
def forward(
self,
input_ids: torch.Tensor,
From 20fa915b1bd7682a22deb2288e647946067bb45a Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 28 Mar 2025 01:07:19 +0800
Subject: [PATCH 07/36] v1 image work
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 150 ++++++++++++++++++---------
1 file changed, 103 insertions(+), 47 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index d1df03f2d774..d509b5003131 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -22,7 +22,7 @@
from vllm.inputs.data import TokenInputs, token_inputs
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
from vllm.model_executor.models.llama import LlamaModel
@@ -39,6 +39,7 @@
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.utils import is_list_of
from .idefics2_vision_model import Idefics2VisionTransformer
from .interfaces import SupportsLoRA, SupportsMultiModal, MultiModalEmbeddings
@@ -498,7 +499,7 @@ def get_img_features(self,
def forward(self, pixel_values: torch.FloatTensor,
image_sizes: torch.Tensor,
- image_attention_mask: torch.Tensor) -> torch.FloatTensor:
+ image_attention_mask: torch.Tensor) -> list[torch.FloatTensor]:
"""
process image and return vision embeddings.
@@ -667,6 +668,40 @@ def forward(self, pixel_values: torch.FloatTensor,
return img_set_tensor
+class Phi4MMImagePixelInputs(TypedDict):
+ type: Literal["pixel_values"]
+ data: Union[torch.Tensor, List[torch.Tensor]]
+ """
+ Shape:
+ `(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
+
+ Note that `num_patches` may be different per batch and image,
+ in which case the data is passed as a list instead of a batched tensor.
+ """
+
+ image_sizes: torch.Tensor
+ """
+ Shape: `(batch_size * num_images, 2)`
+
+ This should be in `(height, width)` format.
+ """
+
+ num_img_tokens: list[int]
+ """Shape: `(batch_size * num_images)`"""
+
+ image_attention_mask: torch.Tensor
+ """Shape: `(batch_size * num_images, H_mask, W_mask)`"""
+
+
+class Phi4MMImageEmbeddingInputs(TypedDict):
+ type: Literal["image_embeds"]
+ data: Union[torch.Tensor, List[torch.Tensor]]
+ """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+
+ `hidden_size` must match the hidden size of language model backbone.
+ """
+
+
class Phi4MMAudioFeatureInputs(TypedDict):
type: Literal["audio_features"]
data: Tuple[NestedTensors]
@@ -679,6 +714,7 @@ class Phi4MMAudioEmbeddingInputs(TypedDict):
"""Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)"""
+Phi4MMImageInput = Union[Phi4MMImagePixelInputs, Phi4MMImageEmbeddingInputs]
Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
@@ -1733,7 +1769,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
config.vocab_size, logit_scale)
- self.sampler = Sampler()
+ self.sampler = get_sampler()
def _audio_features_to_embeddings(
self,
@@ -1848,7 +1884,7 @@ def _process_audio_input(self, input_ids: torch.Tensor,
def _parse_and_validate_image_input(self,
**kwargs: object) -> Optional[Dict]:
- pixel_values: Optional[Dict] = kwargs.get("pixel_values")
+ pixel_values: NestedTensors = kwargs.get("pixel_values")
if pixel_values is None:
return None
@@ -1858,8 +1894,8 @@ def _parse_and_validate_image_input(self,
assert image_sizes is not None and image_attention_mask is not None\
and num_img_tokens is not None, "Missing image inputs"
- if isinstance(pixel_values, list):
- assert pixel_values[0].dim() == 5, "Incorrect image inputs"
+ if is_list_of(pixel_values, torch.Tensor):
+ assert all(p.dim() == 5 for p in pixel_values), "Incorrect image inputs"
# list len is batch_size.
# each tensor has dimension: num_img_per_example, num_hd_patches,
# channels, height, width.
@@ -1900,12 +1936,13 @@ def _parse_and_validate_image_input(self,
else:
raise ValueError("Incorrect image_attention_mask inputs")
- return {
- 'pixel_values': pixel_values,
- 'image_sizes': image_sizes,
- 'image_attention_mask': image_attention_mask,
- 'num_img_tokens': num_img_tokens,
- }
+ return Phi4MMImagePixelInputs(
+ type="pixel_values_videos",
+ data=pixel_values,
+ image_sizes=image_sizes,
+ image_attention_mask=image_attention_mask,
+ num_img_tokens=num_img_tokens,
+ )
def merge_image_features_to_inputs_embeds(
self,
@@ -1946,6 +1983,18 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
return modalities
+ def _process_image_input(self, image_input: Phi4MMImageInput) -> list[torch.Tensor]:
+ if image_input["type"] == "image_embeds":
+ image_embeds = image_input["data"].type(next(self.vision_encoder.parameters()).dtype)
+ else:
+ dtype = next(self.vision_encoder.parameters()).dtype
+ pixel_values = image_input['data'].to(dtype)
+ image_sizes = image_input['image_sizes']
+ image_attention_mask = image_input['image_attention_mask']
+ image_embeds = self.vision_encoder(
+ pixel_values, image_sizes, image_attention_mask)
+ return image_embeds
+
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
@@ -1966,7 +2015,7 @@ def get_multimodal_embeddings(
audio_projection_mode = "vision"
image_input = modalities["images"]
vision_embeddings = self._process_image_input(image_input)
- multimodal_embeddings += vision_embeddings
+ multimodal_embeddings += tuple(vision_embeddings)
# if modality == "audios":
# audio_input = modalities["audios"]
# audio_embeddings = self._process_audio_input(audio_input)
@@ -1985,52 +2034,59 @@ def get_input_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
[_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
return inputs_embeds
+
+ def get_input_embeddings_v0(
+ self,
+ input_ids: torch.Tensor,
+ image_input: Optional[Phi4MMImagePixelInputs] = None,
+ audio_input: Optional[Phi4MMAudioFeatureInputs] = None,
+ ) -> torch.Tensor:
+ inputs_embeds = self.get_input_embeddings(input_ids)
+ if image_input is not None:
+ image_embeds = self._process_image_input(image_input)
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids,
+ inputs_embeds,
+ image_embeds,
+ placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID,
+ )
+
+ # if audio_input is not None:
+ # audio_embeds = self._process_audio_input(audio_input)
+ # inputs_embeds = merge_multimodal_embeddings(
+ # input_ids,
+ # inputs_embeds,
+ # audio_embeds,
+ # placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID,
+ # )
+ return inputs_embeds
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
intermediate_tensors: Optional[IntermediateTensors] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object,
) -> torch.Tensor:
if intermediate_tensors is not None:
- input_ids = None
inputs_embeds = None
- else:
- # Each entry in this is a pair of audio_features and audio_embed
- # lengths
+
+ # NOTE: In v1, inputs_embeds is always generated at the model runner from
+ # `get_multimodal_embeddings` and `get_input_embeddings`; this
+ # condition is only for v0 compatibility.
+ elif inputs_embeds is None:
+ image_input = self._parse_and_validate_image_input(**kwargs)
audio_input = self._parse_and_validate_audio_input(**kwargs)
- image_inputs = self._parse_and_validate_image_input(**kwargs)
-
- has_audio = audio_input is not None
- has_image = image_inputs is not None
-
- if has_audio:
- audio_projection_mode = 'vision' if has_image else 'speech'
- inputs_embeds = self._process_audio_input(
- input_ids, audio_input, audio_projection_mode)
-
- if has_image:
- dtype = self.vision_encoder.img_processor.embeddings.\
- patch_embedding.weight.dtype
- pixel_values = image_inputs['pixel_values'].to(dtype)
- image_sizes = image_inputs['image_sizes']
- image_attention_mask = image_inputs['image_attention_mask']
- image_set_tensors = self.vision_encoder(
- pixel_values, image_sizes, image_attention_mask)
- if not has_audio:
- inputs_embeds = self.model.embed_tokens(input_ids)
-
- inputs_embeds = self.merge_image_features_to_inputs_embeds(
- input_ids, inputs_embeds, image_set_tensors)
-
- if has_image or has_audio:
- # multi-modal input, we have set inputs_embeds properly in
- # previous steps
- input_ids = None
- else:
- # text-only, we keep using original input_ids
+
+ if image_input is None and audio_input is None:
inputs_embeds = None
+ else:
+ inputs_embeds = self.get_input_embeddings_v0(
+ input_ids,
+ image_input=image_input,
+ audio_input=audio_input)
+ input_ids = None
hidden_states = self.model(
input_ids,
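Note (assumed flow, not part of the patch): the v1 engine is expected to drive the methods added in patches 06 and 07 in this order: get_multimodal_embeddings first, then get_input_embeddings to merge the results at the placeholder ids, then forward with inputs_embeds. A rough sketch of that contract:

# Hypothetical driver for illustration; `model` is a loaded Phi4MMForCausalLM
# and `mm_kwargs` holds the processor outputs (pixel_values, image_sizes, ...).
def run_prefill_step(model, input_ids, positions, mm_kwargs):
    mm_embeds = model.get_multimodal_embeddings(**mm_kwargs)
    inputs_embeds = model.get_input_embeddings(input_ids, mm_embeds)
    return model(input_ids=input_ids,
                 positions=positions,
                 intermediate_tensors=None,
                 inputs_embeds=inputs_embeds)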
From 6feca07a9dd0fdb2a9c452d15625d496de1bb608 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 30 Mar 2025 00:55:02 +0800
Subject: [PATCH 08/36] make audio run
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 85 ++++++++++++++--------------
1 file changed, 41 insertions(+), 44 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index d509b5003131..d4c06f98174c 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -12,7 +12,7 @@
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
-from transformers import PretrainedConfig, SiglipVisionConfig, ProcessorMixin, BatchFeature
+from transformers import PretrainedConfig, SiglipVisionConfig, ProcessorMixin, BatchFeature, SequenceFeatureExtractor
from transformers.utils import logging
from vllm.config import VllmConfig
@@ -31,7 +31,7 @@
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors, MultiModalInputs)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataParser,
ImageSize, MultiModalDataItems, AudioEmbeddingItems, AudioProcessorItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
@@ -1486,9 +1486,12 @@ def audio_tokens(self) -> list[str]:
def dynamic_hd(self) -> int:
image_processor = self.get_hf_processor().image_processor
return image_processor.dynamic_hd
+
+ def get_feature_extractor(self) -> SequenceFeatureExtractor:
+ return self.get_hf_processor().audio_processor
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
- return {"image": None}
+ return {"audio": None, "image": None}
def get_mm_max_tokens_per_item(
self,
@@ -1497,11 +1500,11 @@ def get_mm_max_tokens_per_item(
) -> Mapping[str, int]:
return {
"image": self.get_max_image_tokens(),
- # "audio": self.get_max_audio_tokens(),
+ "audio": self.get_max_audio_tokens(),
}
def get_max_audio_tokens(self) -> int:
- return 10000
+ return 188
def get_max_image_tokens(self) -> int:
target_width, target_height = self.get_image_size_with_most_features()
@@ -1554,7 +1557,7 @@ def get_dummy_processor_inputs(
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
- # num_audios = mm_counts.get("audio", 0)
+ num_audios = mm_counts.get("audio", 0)
num_images = mm_counts.get("image", 0)
target_width, target_height = \
@@ -1565,21 +1568,25 @@ def get_dummy_processor_inputs(
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images),
- # "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
- # num_audios=num_audios),
+ "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
+ num_audios=num_audios),
}
image_tokens: list[str] = self.info.image_tokens[:num_images]
- # audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
+ audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
return ProcessorInputs(
- prompt_text="".join(image_tokens),
+ prompt_text="".join(image_tokens+audio_tokens),
mm_data=mm_data,
)
class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
+ def _get_data_parser(self) -> MultiModalDataParser:
+ feature_extractor = self.info.get_feature_extractor()
+ return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
+
def _call_hf_processor(
self,
prompt: str,
@@ -1587,6 +1594,9 @@ def _call_hf_processor(
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
if mm_data:
+ if "audios" in mm_data:
+ sr = self.info.get_feature_extractor().sampling_rate
+ mm_data['audios'] = [(data, sr) for data in mm_data['audios']]
processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs)
num_img_tokens = [
self.info.get_num_image_tokens(image_width=img_size[1], image_height=img_size[0])
@@ -1611,6 +1621,9 @@ def _get_mm_fields_config(
image_attention_mask=MultiModalFieldConfig.batched("image"),
image_sizes=MultiModalFieldConfig.batched("image"),
num_img_tokens=MultiModalFieldConfig.batched("image"),
+ input_audio_embeds=MultiModalFieldConfig.batched("audio"),
+ audio_embed_sizes=MultiModalFieldConfig.batched("audio"),
+ audio_attention_mask=MultiModalFieldConfig.batched("audio"),
)
def _get_prompt_updates(
@@ -1662,7 +1675,7 @@ def get_audio_replacement_phi4mm(item_idx: int):
]
audio_repl = [
PromptReplacement(
- modality="image",
+ modality="audio",
target=audio_token,
replacement=get_audio_replacement_phi4mm,
) for audio_token in audio_tokens[:num_audios]
@@ -1872,15 +1885,21 @@ def _process_audio_input(self, input_ids: torch.Tensor,
# (e.g. multiple examples) and the second dim is the multi-audio dim
# (e.g. multiple audios in the same example)
audio_feature = [i[0] for j in audio_features for i in j]
- audio_feature_len = [i[1].item() for j in audio_features for i in j]
+ # audio_feature_len = [i[1].item() for j in audio_features for i in j]
# Add the batch dim via `squeeze`
- return self._audio_features_to_embeddings(
- input_ids.unsqueeze(0),
- audio_feature,
- audio_feature_len,
- audio_projection_mode,
- ).squeeze(0)
+ # return self._audio_features_to_embeddings(
+ # input_ids.unsqueeze(0),
+ # audio_feature,
+ # audio_feature_len,
+ # audio_projection_mode,
+ # ).squeeze(0)
+ audio_set_tensor = [
+ self.embed_tokens_extend.get_audio_features(
+ audio_feature, audio_projection_mode=audio_projection_mode)
+ for audio_feature in audio_feature
+ ]
+ return audio_set_tensor
def _parse_and_validate_image_input(self,
**kwargs: object) -> Optional[Dict]:
@@ -1944,28 +1963,6 @@ def _parse_and_validate_image_input(self,
num_img_tokens=num_img_tokens,
)
- def merge_image_features_to_inputs_embeds(
- self,
- input_ids: torch.Tensor,
- inputs_embeds: torch.Tensor,
- image_set_tensors: List[torch.Tensor],
- ):
- position_tuple = (input_ids == _IMAGE_PLACEHOLDER_TOKEN_ID).nonzero(
- as_tuple=True)
-
- assert all([t.shape[0] == 1 for t in image_set_tensors
- ]), 'img_set_tensor should have shape (1, N_tokens, C)'
- # Shape: (merged_N_tokens, C)
- image_set_tensor = torch.cat(image_set_tensors, dim=1).squeeze(0)
- image_set_tensor = image_set_tensor.to(inputs_embeds.dtype).to(
- inputs_embeds.device)
- merged_embeds = inputs_embeds.index_put(
- indices=position_tuple,
- values=image_set_tensor,
- accumulate=False,
- )
- return merged_embeds
-
def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
modalities = {}
@@ -2016,10 +2013,10 @@ def get_multimodal_embeddings(
image_input = modalities["images"]
vision_embeddings = self._process_image_input(image_input)
multimodal_embeddings += tuple(vision_embeddings)
- # if modality == "audios":
- # audio_input = modalities["audios"]
- # audio_embeddings = self._process_audio_input(audio_input)
- # multimodal_embeddings += audio_embeddings
+ if modality == "audios":
+ audio_input = modalities["audios"]
+ audio_embeddings = self._process_audio_input(audio_input)
+ multimodal_embeddings += audio_embeddings
return multimodal_embeddings
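Note (illustrative arithmetic, not part of the patch): the hard-coded get_max_audio_tokens() == 188 above is consistent with a 16 kHz log-FBank front end (400-sample window, 160-sample hop) followed by the 8x compression that appears later in this series, assuming a dummy clip of 241_000 samples for _AUDIO_MAX_SOUNDFILE_SIZE; both the clip length and the compression factor are assumptions here, not values shown in these hunks.

import math

# Back-of-the-envelope check of the 188-token audio cap (assumed constants).
audio_len, sr = 241_000, 16_000
win_length, hop_length = 400, 160        # 25 ms window, 10 ms hop at 16 kHz

audio_frames = (audio_len - win_length) // hop_length + 1   # 1504 frames
audio_embed_size = math.ceil(audio_frames / 8)              # 8x time compression
print(audio_frames, audio_embed_size)  # 1504 188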
From 70478c8815af5ef38f604d15a06cff11f52fb878 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 30 Mar 2025 15:21:41 +0800
Subject: [PATCH 09/36] fix
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 24 +++++++++++++-----------
1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index d4c06f98174c..a9770debcdf5 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1838,7 +1838,7 @@ def _parse_and_validate_audio_input(
Returns:
Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs.
"""
- audio_features = kwargs.pop("audio_features", None)
+ audio_features = kwargs.pop("input_audio_embeds", None)
audio_embeds = kwargs.pop("audio_embeds", None)
if audio_features is None and audio_embeds is None:
@@ -1862,7 +1862,7 @@ def _parse_and_validate_audio_input(
raise AssertionError("This line should be unreachable.")
- def _process_audio_input(self, input_ids: torch.Tensor,
+ def _process_audio_input(self,
audio_input: Phi4MMAudioInputs,
audio_projection_mode: str) -> NestedTensors:
"""
@@ -2015,7 +2015,7 @@ def get_multimodal_embeddings(
multimodal_embeddings += tuple(vision_embeddings)
if modality == "audios":
audio_input = modalities["audios"]
- audio_embeddings = self._process_audio_input(audio_input)
+ audio_embeddings = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode)
multimodal_embeddings += audio_embeddings
return multimodal_embeddings
@@ -2038,6 +2038,7 @@ def get_input_embeddings_v0(
image_input: Optional[Phi4MMImagePixelInputs] = None,
audio_input: Optional[Phi4MMAudioFeatureInputs] = None,
) -> torch.Tensor:
+ audio_projection_mode = 'speech'
inputs_embeds = self.get_input_embeddings(input_ids)
if image_input is not None:
image_embeds = self._process_image_input(image_input)
@@ -2047,15 +2048,16 @@ def get_input_embeddings_v0(
image_embeds,
placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID,
)
+ audio_projection_mode = 'vision'
- # if audio_input is not None:
- # audio_embeds = self._process_audio_input(audio_input)
- # inputs_embeds = merge_multimodal_embeddings(
- # input_ids,
- # inputs_embeds,
- # audio_embeds,
- # placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID,
- # )
+ if audio_input is not None:
+ audio_embeds = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode)
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids,
+ inputs_embeds,
+ audio_embeds,
+ placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID,
+ )
return inputs_embeds
def forward(
From fbe07ff775091736422097c716b0c6780aac0924 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 30 Mar 2025 16:30:24 +0800
Subject: [PATCH 10/36] fix audio correctness
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 21 +++++----------------
1 file changed, 5 insertions(+), 16 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index a9770debcdf5..513bb959d8a4 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1871,8 +1871,6 @@ def _process_audio_input(self,
created by `input_mapper_for_phi4mm_audio`.
Args:
- input_ids (torch.Tensor): Input IDs (the prompt in this case,
- before the audio token replication).
audio_input (Phi4MMAudioInputs): Audio input.
Returns:
@@ -1884,20 +1882,11 @@ def _process_audio_input(self,
audio_features = audio_input["data"]
# (e.g. multiple examples) and the second dim is the multi-audio dim
# (e.g. multiple audios in the same example)
- audio_feature = [i[0] for j in audio_features for i in j]
- # audio_feature_len = [i[1].item() for j in audio_features for i in j]
- # Add the batch dim via `squeeze`
-
- # return self._audio_features_to_embeddings(
- # input_ids.unsqueeze(0),
- # audio_feature,
- # audio_feature_len,
- # audio_projection_mode,
- # ).squeeze(0)
- audio_set_tensor = [
- self.embed_tokens_extend.get_audio_features(
- audio_feature, audio_projection_mode=audio_projection_mode)
- for audio_feature in audio_feature
+
+ dtype = next(self.embed_tokens_extend.parameters()).dtype
+ audio_set_tensor = [self.embed_tokens_extend.get_audio_features(
+ feature.to(dtype), audio_projection_mode=audio_projection_mode)
+ for feature in audio_features
]
return audio_set_tensor
From 49fb233fab3f0fe2130a2b666692b3ebdfef6216 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 30 Mar 2025 19:51:59 +0800
Subject: [PATCH 11/36] fix multi audios
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 34 ++++++++++++++++++++--------
1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 513bb959d8a4..240422137bf4 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -44,7 +44,7 @@
from .idefics2_vision_model import Idefics2VisionTransformer
from .interfaces import SupportsLoRA, SupportsMultiModal, MultiModalEmbeddings
from .phi4mm_audio import AudioEmbedding
-from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix, merge_multimodal_embeddings
+from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix, merge_multimodal_embeddings, flatten_bn
# <|endoftext10|> (see vocab.json in hf model)
_IMAGE_PLACEHOLDER_TOKEN_ID = 200010
@@ -705,7 +705,10 @@ class Phi4MMImageEmbeddingInputs(TypedDict):
class Phi4MMAudioFeatureInputs(TypedDict):
type: Literal["audio_features"]
data: Tuple[NestedTensors]
- """Shape: `((batch_size, num_audios, 80, M), )"""
+ """Shape: `((batch_size * num_audios, 80, M), )"""
+
+ audio_embed_sizes: torch.Tensor
+ """Shape: `(batch_size * num_audios)`"""
class Phi4MMAudioEmbeddingInputs(TypedDict):
@@ -1839,18 +1842,28 @@ def _parse_and_validate_audio_input(
Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs.
"""
audio_features = kwargs.pop("input_audio_embeds", None)
+ audio_embed_sizes = kwargs.pop("audio_embed_sizes", None)
audio_embeds = kwargs.pop("audio_embeds", None)
if audio_features is None and audio_embeds is None:
return None
if audio_features is not None:
- if not isinstance(audio_features, (torch.Tensor, list)):
+ assert isinstance(audio_embed_sizes, torch.Tensor)
+ if isinstance(audio_features, torch.Tensor):
+ assert audio_features.size(0) == len(audio_embed_sizes), (
+ "audio_features and audio_embed_sizes must have the same length")
+ elif is_list_of(audio_features, list):
+ assert len(audio_features) == len(audio_embed_sizes), (
+ "audio_features and audio_embed_sizes must have the same length")
+ else:
raise ValueError("Incorrect type of audio features. "
f"Got type: {type(audio_features)}")
+
return Phi4MMAudioFeatureInputs(type="audio_features",
- data=audio_features)
+ data=flatten_bn(audio_features, concat=True),
+ audio_embed_sizes=flatten_bn(audio_embed_sizes, concat=True))
if audio_embeds is not None:
if not isinstance(audio_embeds, (torch.Tensor, list)):
@@ -1880,15 +1893,18 @@ def _process_audio_input(self,
return audio_input["data"]
audio_features = audio_input["data"]
+ audio_sizes = audio_input["audio_embed_sizes"]
# (e.g. multiple examples) and the second dim is the multi-audio dim
# (e.g. multiple audios in the same example)
dtype = next(self.embed_tokens_extend.parameters()).dtype
- audio_set_tensor = [self.embed_tokens_extend.get_audio_features(
- feature.to(dtype), audio_projection_mode=audio_projection_mode)
- for feature in audio_features
- ]
- return audio_set_tensor
+ audio_padded_embeds = self.embed_tokens_extend.get_audio_features(
+ audio_features.to(dtype),
+ audio_projection_mode=audio_projection_mode,
+ )
+ audio_embeds = [audio_padded_embeds[idx, :size]
+ for idx, size in enumerate(audio_sizes)]
+ return audio_embeds
def _parse_and_validate_image_input(self,
**kwargs: object) -> Optional[Dict]:
From 51dde9c4b6d8b11cc7b271342b36a0db40f8703f Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 30 Mar 2025 22:37:52 +0800
Subject: [PATCH 12/36] fix resampling
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 16 +++++++++++++++-
vllm/multimodal/parse.py | 3 ++-
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 240422137bf4..495d7738f22c 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -7,6 +7,7 @@
TypedDict, Union)
import numpy as np
+import numpy.typing as npt
import scipy.signal
import torch
import torch.nn as nn
@@ -1587,8 +1588,21 @@ def get_dummy_processor_inputs(
class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
def _get_data_parser(self) -> MultiModalDataParser:
+
+ def scipy_resample_audio(
+ audio: npt.NDArray[np.floating],
+ *,
+ orig_sr: float,
+ target_sr: float,
+ ):
+ if orig_sr > target_sr:
+ return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
+ elif orig_sr < target_sr:
+ return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1)
+ return audio
+
feature_extractor = self.info.get_feature_extractor()
- return MultiModalDataParser(target_sr=feature_extractor.sampling_rate)
+ return MultiModalDataParser(target_sr=feature_extractor.sampling_rate, resample_func=scipy_resample_audio)
def _call_hf_processor(
self,
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 772b1609a9fb..8d723b6bef8f 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -308,10 +308,11 @@ class MultiModalDataParser:
items to the model's expected sampling rate.
"""
- def __init__(self, *, target_sr: Optional[float] = None) -> None:
+ def __init__(self, *, target_sr: Optional[float] = None, resample_func: Optional[Callable] = None,) -> None:
super().__init__()
self.target_sr = target_sr
+ self.audio_resampler = resample_audio if resample_func is None else resample_func
def _is_embeddings(
self, data: object
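Note (example only, not part of the patch): the scipy_resample_audio helper above assumes integer sample-rate ratios, e.g. 48 kHz down to the 16 kHz the feature extractor expects. A self-contained check of that path:

import numpy as np
import scipy.signal

# Downsample one second of 48 kHz audio to 16 kHz, matching the
# `orig_sr > target_sr` branch of the helper added above.
orig_sr, target_sr = 48_000, 16_000
t = np.arange(orig_sr) / orig_sr
audio = np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

resampled = scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
print(resampled.shape)  # (16000,)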
From f0715817fb917ab57cd4f785ec947de338876166 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 30 Mar 2025 23:01:43 +0800
Subject: [PATCH 13/36] fix resampling
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/multimodal/parse.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 8d723b6bef8f..3f30ec646af0 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -368,7 +368,7 @@ def _parse_audio_data(
"Audio resampling is not supported when "
"`target_sr` is not provided")
- new_audio = resample_audio(audio,
+ new_audio = self.audio_resampler(audio,
orig_sr=orig_sr,
target_sr=target_sr)
From d665855affd73aad2832757d4ad007b50b519287 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 31 Mar 2025 01:48:39 +0800
Subject: [PATCH 14/36] fix audio diff
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 20 ++++++++------------
1 file changed, 8 insertions(+), 12 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 495d7738f22c..a906247ad9f9 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -708,9 +708,6 @@ class Phi4MMAudioFeatureInputs(TypedDict):
data: Tuple[NestedTensors]
"""Shape: `((batch_size * num_audios, 80, M), )"""
- audio_embed_sizes: torch.Tensor
- """Shape: `(batch_size * num_audios)`"""
-
class Phi4MMAudioEmbeddingInputs(TypedDict):
type: Literal["audio_embeds"]
@@ -1621,6 +1618,10 @@ def _call_hf_processor(
]
processed_outputs["num_img_tokens"] = num_img_tokens
processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds')
+ if "audios" in mm_data:
+ feature_sizes = [size.item() * 8 for size in processed_outputs['audio_embed_sizes']]
+ audio_features = processed_outputs['input_audio_embeds']
+ processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)]
else:
tokenizer = self.info.get_tokenizer()
processed_outputs = tokenizer(prompt,
@@ -1874,10 +1875,8 @@ def _parse_and_validate_audio_input(
raise ValueError("Incorrect type of audio features. "
f"Got type: {type(audio_features)}")
-
return Phi4MMAudioFeatureInputs(type="audio_features",
- data=flatten_bn(audio_features, concat=True),
- audio_embed_sizes=flatten_bn(audio_embed_sizes, concat=True))
+ data=flatten_bn(audio_features))
if audio_embeds is not None:
if not isinstance(audio_embeds, (torch.Tensor, list)):
@@ -1907,17 +1906,14 @@ def _process_audio_input(self,
return audio_input["data"]
audio_features = audio_input["data"]
- audio_sizes = audio_input["audio_embed_sizes"]
# (e.g. multiple examples) and the second dim is the multi-audio dim
# (e.g. multiple audios in the same example)
dtype = next(self.embed_tokens_extend.parameters()).dtype
- audio_padded_embeds = self.embed_tokens_extend.get_audio_features(
- audio_features.to(dtype),
+ audio_embeds = [self.embed_tokens_extend.get_audio_features(
+ features.unsqueeze(0).to(dtype),
audio_projection_mode=audio_projection_mode,
- )
- audio_embeds = [audio_padded_embeds[idx, :size]
- for idx, size in enumerate(audio_sizes)]
+ ).squeeze(0) for features in audio_features]
return audio_embeds
def _parse_and_validate_image_input(self,
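Note: patch 14 drops the padded-batch path and runs the audio encoder once per clip via unsqueeze/squeeze, so variable-length features never need padding. A sketch with a stand-in projector (not the real get_audio_features):

import torch

def fake_get_audio_features(batch: torch.Tensor) -> torch.Tensor:
    # stand-in: (1, T, 80) log-mel features -> (1, T, 16) embeddings
    return batch @ torch.randn(80, 16)

audio_features = [torch.randn(98, 80), torch.randn(151, 80)]   # two clips, different lengths
audio_embeds = [fake_get_audio_features(f.unsqueeze(0)).squeeze(0)
                for f in audio_features]
assert [e.shape[0] for e in audio_embeds] == [98, 151]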
From f63d7c285fb9cde1e29e75a435dda8b90b244c62 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 2 Apr 2025 00:35:33 +0800
Subject: [PATCH 15/36] unpad audio features
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index a906247ad9f9..98f2aedbee7a 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1549,6 +1549,19 @@ def get_image_size_with_most_features(self) -> ImageSize:
max_side = vit_image_size * self.dynamic_hd
return ImageSize(height=max_side, width=vit_image_size)
+
+ def get_audio_feature_nums(self, audio_len: int, sr: float):
+ if sr >= 16000:
+ win_length = 400
+ hop_length = 160
+ elif 8000 <= sr < 16000:
+ win_length = 200
+ hop_length = 80
+ else:
+ raise RuntimeError(f"Input data using an unsupported sample rate: {sr}")
+
+ # Spec 1: SpeechLib cut remaining sample insufficient for a hop
+ return (audio_len - win_length) // hop_length + 1
class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
@@ -1619,8 +1632,8 @@ def _call_hf_processor(
processed_outputs["num_img_tokens"] = num_img_tokens
processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds')
if "audios" in mm_data:
- feature_sizes = [size.item() * 8 for size in processed_outputs['audio_embed_sizes']]
audio_features = processed_outputs['input_audio_embeds']
+ feature_sizes = [self.info.get_audio_feature_nums(len(audio), sr) for audio, sr in mm_data['audios']]
processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)]
else:
tokenizer = self.info.get_tokenizer()
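Note: a worked instance of the frame-count formula added in get_audio_feature_nums, for a 1 s clip at 16 kHz (win_length=400, hop_length=160):

audio_len, win_length, hop_length = 16_000, 400, 160
num_frames = (audio_len - win_length) // hop_length + 1
assert num_frames == 98   # samples short of a full hop are cut, as in SpeechLib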
From 83c08fca038d38491e97e6128b1b3642ec930fd6 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 2 Apr 2025 00:59:10 +0800
Subject: [PATCH 16/36] fix v1 audio
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 98f2aedbee7a..7b2651b626b0 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1881,6 +1881,9 @@ def _parse_and_validate_audio_input(
if isinstance(audio_features, torch.Tensor):
assert audio_features.size(0) == len(audio_embed_sizes), (
"audio_features and audio_embed_sizes must have the same length")
+ elif is_list_of(audio_features, torch.Tensor):
+ assert len(audio_features) == len(audio_embed_sizes), (
+ "audio_features and audio_embed_sizes must have the same length")
elif is_list_of(audio_features, list):
assert len(audio_features) == len(audio_embed_sizes), (
"audio_features and audio_embed_sizes must have the same length")
@@ -2001,7 +2004,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
"image_embeds") and "images" not in modalities:
modalities["images"] = self._parse_and_validate_image_input(
**kwargs)
- if input_key in ("audio_features",
+ if input_key in ("input_audio_embeds",
"audio_embeds") and "audios" not in modalities:
modalities["audios"] = self._parse_and_validate_audio_input(
**kwargs)
@@ -2044,7 +2047,7 @@ def get_multimodal_embeddings(
if modality == "audios":
audio_input = modalities["audios"]
audio_embeddings = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode)
- multimodal_embeddings += audio_embeddings
+ multimodal_embeddings += tuple(audio_embeddings)
return multimodal_embeddings
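Note: the tuple(...) cast above is needed because a tuple cannot be concatenated with a list in place; a quick illustration:

multimodal_embeddings: tuple = ()
audio_embeddings = ["emb0", "emb1"]   # stand-ins for per-audio embedding tensors
try:
    multimodal_embeddings += audio_embeddings          # TypeError: tuple + list
except TypeError:
    multimodal_embeddings += tuple(audio_embeddings)   # what the patch does
assert len(multimodal_embeddings) == 2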
From 1b9f027477373fb20dcc014cfc9f94638c256587 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 2 Apr 2025 01:11:39 +0800
Subject: [PATCH 17/36] clean legacy code
Signed-off-by: Isotr0py <2037008807@qq.com>
---
.../vision_language_multi_image.py | 2 +-
vllm/model_executor/models/phi4mm.py | 795 ------------------
2 files changed, 1 insertion(+), 796 deletions(-)
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 98a739169d70..6736d7d72299 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -335,7 +335,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=10000,
+ max_model_len=12800,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 7b2651b626b0..a4b95c743782 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -79,41 +79,6 @@
}
-def get_max_dummy_image(ctx: InputContext):
- hf_config = ctx.get_hf_config()
- vision_encoder_name = hf_config.img_processor
- if vision_encoder_name is None:
- vision_encoder_name = SIGLIP_NAME
- prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
- dynamic_hd_size = prepro_config['dynamic_hd']
- vit_image_size = prepro_config['vit_image_size']
-
- max_side = vit_image_size * dynamic_hd_size
- dummy_image = dummy_image_for_phi4mm(vit_image_size, max_side)
- return dummy_image
-
-
-# image token length
-def get_max_phi4mm_image_tokens(ctx: InputContext):
- dummy_image = get_max_dummy_image(ctx)
-
- hf_config = ctx.get_hf_config()
- vision_encoder_name = hf_config.img_processor
- if vision_encoder_name is None:
- vision_encoder_name = SIGLIP_NAME
- prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
- dynamic_hd_size = prepro_config['dynamic_hd']
- vit_image_size = prepro_config['vit_image_size']
- vit_patch_size = prepro_config['vit_patch_size']
- token_compression_factor = prepro_config['token_compression_factor']
-
- image_num_tokens = _compute_num_image_tokens(dummy_image, dynamic_hd_size,
- vit_image_size,
- vit_patch_size,
- token_compression_factor)
- return image_num_tokens
-
-
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
image_size):
best_ratio_diff = float('inf')
@@ -172,181 +137,6 @@ def _get_padding_size(orig_width: int, orig_height: int, target_height: int, tar
return padding_height, padding_width
-def dynamic_preprocess(image,
- min_num=1,
- max_num=12,
- image_size=384,
- mask_size=27):
- orig_width, orig_height = image.size
- target_aspect_ratio, target_height, target_width =\
- _find_target_aspect_ratio(
- orig_width, orig_height, image_size, max_num, min_num)
- padding_height, padding_width = _get_padding_size(image, target_height,
- target_width)
-
- # Calculate the ratio
- ratio_width = target_width / orig_width
- ratio_height = target_height / orig_height
- if ratio_width < ratio_height:
- new_size = (target_width, int(orig_height * ratio_width))
- else:
- new_size = (int(orig_width * ratio_height), target_height)
-
- attention_mask = torch.ones((int(mask_size * target_aspect_ratio[1]),
- int(mask_size * target_aspect_ratio[0])))
- if padding_width >= 14:
- attention_mask[:, -math.floor(padding_width / 14):] = 0
- if padding_height >= 14:
- attention_mask[-math.floor(padding_height / 14):, :] = 0
- assert attention_mask.sum(
- ) > 0, f'attention mask is empty {attention_mask}'
-
- if min(new_size[1], target_height) < 10 or min(new_size[0],
- target_width) < 10:
- raise ValueError(f'the aspect ratio is very extreme {new_size}')
-
- image = T.functional.resize(
- image,
- [new_size[1], new_size[0]],
- )
-
- resized_img = T.functional.pad(image,
- [0, 0, padding_width, padding_height],
- fill=[255, 255, 255])
-
- return resized_img, attention_mask
-
-
-def pad_to_max_num_crops(images, max_crops=5):
- """
- images: B x 3 x H x W, B<=max_crops
- """
- B, _, H, W = images.shape
- if max_crops > B:
- pad = torch.zeros(max_crops - B,
- 3,
- H,
- W,
- dtype=images.dtype,
- device=images.device)
- images = torch.cat([images, pad], dim=0)
- return images
-
-
-def pad_mask_to_max_num_crops(masks, max_crops=5):
- B, H, W = masks.shape
- if max_crops > B:
- pad = torch.ones(max_crops - B,
- H,
- W,
- dtype=masks.dtype,
- device=masks.device)
- masks = torch.cat([masks, pad], dim=0)
- return masks
-
-
-def preprocess(images, dynamic_hd_size, vit_resolution, vit_patch_size):
-
- # Basic settings.
- img_processor = T.Compose([
- T.ToTensor(),
- T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
- ])
- # Dynamic HD
- base_resolution = vit_resolution
- images = [image.convert('RGB') for image in images]
- # cover 384 and 448 resolution
- mask_resolution = base_resolution // vit_patch_size
- elems, image_attention_masks = [], []
- for im in images:
- elem, attention_mask = dynamic_preprocess(im,
- max_num=dynamic_hd_size,
- image_size=base_resolution,
- mask_size=mask_resolution)
- elems.append(elem)
- image_attention_masks.append(attention_mask)
- hd_images = [img_processor(im) for im in elems]
- global_image = [
- torch.nn.functional.interpolate(
- im.unsqueeze(0).float(),
- size=(base_resolution, base_resolution),
- mode='bicubic',
- ).to(im.dtype) for im in hd_images
- ]
- shapes = [[im.size(1), im.size(2)] for im in hd_images]
- mask_shapes = [[mask.size(0), mask.size(1)]
- for mask in image_attention_masks]
- global_attention_mask = [
- torch.ones((1, mask_resolution, mask_resolution)) for _ in hd_images
- ]
- hd_images_reshape = [
- im.reshape(1, 3, h // base_resolution, base_resolution,
- w // base_resolution, base_resolution).permute(
- 0, 2, 4, 1, 3, 5).reshape(-1, 3, base_resolution,
- base_resolution).contiguous()
- for im, (h, w) in zip(hd_images, shapes)
- ]
- attention_masks_reshape = [
- mask.reshape(1, h // mask_resolution, mask_resolution,
- w // mask_resolution, mask_resolution).permute(
- 0, 1, 3, 2, 4).reshape(-1, mask_resolution,
- mask_resolution).contiguous()
- for mask, (h, w) in zip(image_attention_masks, mask_shapes)
- ]
- # NOTE token compression is hard coded here, and odd numbers seems to fail
- downsample_attention_masks = [
- mask[:, 0::2,
- 0::2].reshape(1, h // mask_resolution, w // mask_resolution,
- mask_resolution // 2 + mask_resolution % 2,
- mask_resolution // 2 + mask_resolution % 2).permute(
- 0, 1, 3, 2, 4)
- for mask, (h, w) in zip(attention_masks_reshape, mask_shapes)
- ]
- downsample_attention_masks = [
- mask.reshape(mask.size(1) * mask.size(2),
- mask.size(3) * mask.size(4))
- for mask in downsample_attention_masks
- ]
- # NOTE hard coded number of tokens
- num_img_tokens = [
- 256 + 1 + int(mask.sum().item()) + int(mask[:, 0].sum().item()) + 16
- for mask in downsample_attention_masks
- ]
-
- hd_images_reshape = [
- torch.cat([_global_image] + [_im], dim=0)
- for _global_image, _im in zip(global_image, hd_images_reshape)
- ]
- hd_masks_reshape = [
- torch.cat([_global_mask] + [_mask],
- dim=0) for _global_mask, _mask in zip(
- global_attention_mask, attention_masks_reshape)
- ]
- max_crops = max([img.size(0) for img in hd_images_reshape])
- image_transformed = [
- pad_to_max_num_crops(im, max_crops) for im in hd_images_reshape
- ]
- image_transformed = torch.stack(image_transformed, dim=0)
- mask_transformed = [
- pad_mask_to_max_num_crops(mask, max_crops) \
- for mask in hd_masks_reshape
- ]
- mask_transformed = torch.stack(mask_transformed, dim=0)
-
- returned_input_image_embeds = image_transformed
- returned_image_sizes = torch.tensor(shapes, dtype=torch.long)
- returned_image_attention_mask = mask_transformed
- returned_num_img_tokens = num_img_tokens
-
- data = {
- "pixel_values": returned_input_image_embeds,
- "image_sizes": returned_image_sizes,
- "image_attention_mask": returned_image_attention_mask,
- "num_img_tokens": returned_num_img_tokens,
- }
- return data
-
-
def get_navit_vision_model(layer_idx: int = -1, **kwargs):
vision_config = {
"hidden_size": 1152,
@@ -719,188 +509,6 @@ class Phi4MMAudioEmbeddingInputs(TypedDict):
Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
-def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None):
- """Create a Mel filter-bank the same as SpeechLib FbankFC.
-
- Args:
- sample_rate (int): Sample rate in Hz. number > 0 [scalar]
- n_fft (int): FFT size. int > 0 [scalar]
- n_mel (int): Mel filter size. int > 0 [scalar]
- fmin (float): lowest frequency (in Hz). If None use 0.0.
- float >= 0 [scalar]
- fmax: highest frequency (in Hz). If None use sample_rate / 2.
- float >= 0 [scalar]
-
- Returns
- out (numpy.ndarray): Mel transform matrix
- [shape=(n_mels, 1 + n_fft/2)]
- """
-
- bank_width = int(n_fft // 2 + 1)
- if fmax is None:
- fmax = sample_rate / 2
- if fmin is None:
- fmin = 0
- assert fmin >= 0, "fmin cannot be negative"
- assert (fmin < fmax <=
- sample_rate / 2), "fmax must be between (fmin, samplerate / 2]"
-
- def mel(f):
- return 1127.0 * np.log(1.0 + f / 700.0)
-
- def bin2mel(fft_bin):
- return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0))
-
- def f2bin(f):
- return int((f * n_fft / sample_rate) + 0.5)
-
- # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1]
- klo = f2bin(fmin) + 1
- khi = f2bin(fmax)
-
- khi = max(khi, klo)
-
- # Spec 2: SpeechLib uses triangles in Mel space
- mlo = mel(fmin)
- mhi = mel(fmax)
- m_centers = np.linspace(mlo, mhi, n_mels + 2)
- ms = (mhi - mlo) / (n_mels + 1)
-
- matrix = np.zeros((n_mels, bank_width), dtype=np.float32)
- for m in range(0, n_mels):
- left = m_centers[m]
- center = m_centers[m + 1]
- right = m_centers[m + 2]
- for fft_bin in range(klo, khi):
- mbin = bin2mel(fft_bin)
- if left < mbin < right:
- matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms
-
- return matrix
-
-
-class LogFbankProcessor:
-
- def __init__(self):
-
- self._eightk_method = "fillzero"
- self._mel = speechlib_mel(16000, 512, 80, fmin=None, fmax=7690).T
-
- self._hamming400 = np.hamming(400) # for 16k audio
- self._hamming200 = np.hamming(200) # for 8k audio
-
- def extract_spectrogram(self, wav, fs):
- """Extract spectrogram features from waveform.
- Args:
- wav (1D array): waveform of the input
- fs (int): sampling rate of the waveform, 16000 or 8000.
- If fs=8000, the waveform will be resampled to 16000Hz.
- Output:
- log_fbank (2D array): a TxD matrix of log Mel filterbank features.
- D=80, and T is the number of frames.
- """
- if wav.ndim > 1:
- wav = np.squeeze(wav)
-
- # by default, we extract the mean if stereo
- if len(wav.shape) == 2:
- wav = wav.mean(1)
-
- # Resample to 16000 or 8000 if needed
- if fs > 16000:
- wav = scipy.signal.resample_poly(wav, 1, fs // 16000)
- fs = 16000
- elif 8000 < fs < 16000:
- wav = scipy.signal.resample_poly(wav, 1, fs // 8000)
- fs = 8000
- elif fs < 8000:
- raise RuntimeError(f"Unsupported sample rate {fs}")
-
- if fs == 8000:
- if self._eightk_method == "resample":
- # Input audio is 8 kHz. Convert to 16 kHz before feature
- # extraction
- wav = scipy.signal.resample_poly(wav, 2, 1)
- fs = 16000
- # Do nothing here for fillzero method
- elif fs != 16000:
- # Input audio is not a supported sample rate.
- raise RuntimeError(
- f"Input data using an unsupported sample rate: {fs}")
-
- preemphasis = 0.97
-
- if fs == 8000:
- n_fft = 256
- win_length = 200
- hop_length = 80
- fft_window = self._hamming200
- elif fs == 16000:
- n_fft = 512
- win_length = 400
- hop_length = 160
- fft_window = self._hamming400
-
- # Spec 1: SpeechLib cut remaining sample insufficient for a hop
- n_batch = (wav.shape[0] - win_length) // hop_length + 1
- # Here we don't use stride_tricks since the input array may not satisfy
- # memory layout requirement and we need writeable output
- # Here we only use list of views before copy to destination
- # so it is more efficient than broadcasting
- y_frames = np.array(
- [
- wav[_stride:_stride + win_length]
- for _stride in range(0, hop_length * n_batch, hop_length)
- ],
- dtype=np.float32,
- )
-
- # Spec 2: SpeechLib applies preemphasis within each batch
- y_frames_prev = np.roll(y_frames, 1, axis=1)
- y_frames_prev[:, 0] = y_frames_prev[:, 1]
- y_frames = (y_frames - preemphasis * y_frames_prev) * 32768
-
- S = np.fft.rfft(fft_window * y_frames, n=n_fft,
- axis=1).astype(np.complex64)
-
- if fs == 8000:
- # Need to pad the output to look like 16 kHz data but with zeros in
- # the 4 to 8 kHz bins.
- frames, bins = S.shape
- padarray = np.zeros((frames, bins))
- S = np.concatenate((S[:, 0:-1], padarray),
- axis=1) # Nyquist bin gets set to zero
-
- spec = np.abs(S).astype(np.float32)
- return spec
-
- def extract_features(self, wav, fs):
- """Extract log filterbank features from waveform.
- Args:
- wav (1D array): waveform of the input
- fs (int): sampling rate of the waveform, 16000 or 8000.
- If fs=8000, the waveform will be resampled to 16000Hz.
- Output:
- log_fbank (2D array): a TxD matrix of log Mel filterbank features.
- D=80, and T is the number of frames.
- """
- spec = self.extract_spectrogram(wav, fs)
- spec_power = spec**2
-
- fbank_power = np.clip(spec_power.dot(self._mel), 1.0, None)
- log_fbank = np.log(fbank_power).astype(np.float32)
-
- return log_fbank
-
-
-@lru_cache
-def audio_feature_extractor() -> LogFbankProcessor:
- # Creates an instance of the audio processor, needed to extract the
- # the audio features from the sound file
- # LRU cache ensures that we only make one copy
- return LogFbankProcessor()
-
-
def _compute_num_image_tokens(
orig_width: int,
orig_height: int,
@@ -1019,255 +627,6 @@ def compute_logfbank_output_size(wav_length: int, fs: int) -> Tuple[int, int]:
return T, mel_bins
-def _get_audio_embed_sizes(audios, ctx: InputContext):
- """
- Get the audio embedding sizes for each audio file.
-
- Args:
- audios (List[Tuple[np.ndarray, int]]): List of audio files as tuples of
- waveform and sample rate.
- ctx (InputContext): Input context.
-
- Returns:
- List[int]: List of audio embedding sizes.
- """
- audio_embed_sizes = []
- for audio in audios:
- audio_data, sf = audio
- audio_frames, _ = compute_logfbank_output_size(len(audio_data), sf)
- audio_embed_size = _compute_audio_embed_size(ctx.get_hf_config(),
- audio_frames)
- audio_embed_sizes.append(audio_embed_size)
- return audio_embed_sizes
-
-
-def _get_audio_id_to_input_ids(audios, ctx: InputContext, prompt_str=""):
- """
- The following will search for `<|audio_{idx}|>` tokens and
- return a mapping of audio placeholder tokens to audio placeholder token ids
- based on the size of the audio embeddings.
-
- Args:
- audios (List[Tuple[np.ndarray, int]]): List of audio files as tuples of
- waveform and sample rate.
- ctx (InputContext): Input context.
- prompt_str (str): The prompt string.
-
- Returns:
- Dict[str, List[int]]: Mapping of audio placeholder tokens to audio
- placeholder token ids.
-
- """
- if len(audios) == 0:
- return {}
-
- audio_embed_sizes = _get_audio_embed_sizes(audios, ctx)
- audio_ids = re.findall(AUDIO_TOKEN_PATTERN, prompt_str)
- audio_ids = [int(audio_id) for audio_id in audio_ids]
- assert len(audio_ids) == len(
- audio_embed_sizes
- ), "Number of audio tokens and audio features do not match"
- assert tuple(audio_ids) == tuple(range(1,
- len(audio_ids) +
- 1)), "Audio ids are not in order!"
- audio_id_to_input_ids = {
- f"<|audio_{audio_id}|>":
- [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
- for audio_id, audio_embed_size in zip(audio_ids, audio_embed_sizes)
- }
-
- return audio_id_to_input_ids
-
-
-def _count_image_tokens(images, ctx: InputContext):
- hf_config = ctx.get_hf_config()
- vision_encoder_name = hf_config.img_processor
- if vision_encoder_name is None:
- vision_encoder_name = SIGLIP_NAME
- prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
- dynamic_hd_size = prepro_config['dynamic_hd']
- vit_image_size = prepro_config['vit_image_size']
- vit_patch_size = prepro_config['vit_patch_size']
- token_compression_factor = prepro_config['token_compression_factor']
-
- image_token_counts = [
- _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size,
- vit_patch_size, token_compression_factor)
- for image in images
- ]
- return image_token_counts
-
-
-def _get_image_id_to_input_ids(images, prompt, ctx: InputContext):
- if len(images) == 0:
- return {}
-
- image_ids = re.findall(IMAGE_TOKEN_PATTERN, prompt)
- image_ids = [int(image_id) for image_id in image_ids]
- assert len(image_ids) == len(
- set(image_ids)), "Duplicate image tokens in prompt"
- assert len(images) == len(
- image_ids), "Number of images and image tokens in prompt do not match"
-
- # NOTE the following assertion is not strictly necessary
- assert tuple(image_ids) == tuple(range(1,
- len(image_ids) +
- 1)), "Image ids are not in order"
-
- image_token_counts = _count_image_tokens(images, ctx)
- image_id_to_input_ids = {
- f"<|image_{image_id}|>": [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_tokens
- for image_id, num_tokens in zip(image_ids, image_token_counts)
- }
- return image_id_to_input_ids
-
-
-def input_processor_for_phi4mm(ctx: InputContext,
- inputs: DecoderOnlyInputs) -> TokenInputs:
- """
- Implements the input processor, which transforms the input prompt ids
- to include the audio placeholder token. This will become the `input_ids`
- in `forward` for the model.
-
- Args:
- ctx (InputContext): Input context.
- inputs (DecoderOnlyInputs): The inputs (e.g. prompt, prompt_token_ids)
- to process.
-
- Returns:
- TokenInputs: Processed inputs
- """
- multi_modal_data = inputs.get("multi_modal_data")
- if (multi_modal_data is None or
- ("audio" not in multi_modal_data and "image" not in multi_modal_data)):
- # pure text input, so no need to do pre-processing
- return inputs
-
- prompt_str = inputs.get("prompt")
- prompt_token_ids = inputs.get("prompt_token_ids")
- # for offline_inference, we will get str input and we parse MM special
- # tokens from it
- # (ignore prompt_token_ids)
- # for OAI server, we will get prompt_token_ids, where MM special tokens
- # are already parsed
-
- if 'audio' in multi_modal_data:
- audios = multi_modal_data["audio"]
-
- if not isinstance(audios, list):
- audios = [audios]
- if prompt_str is not None:
- audio_id_to_input_ids = _get_audio_id_to_input_ids(
- audios, ctx, prompt_str=prompt_str)
- audio_embed_sizes = []
- elif prompt_token_ids is not None:
- audio_id_to_input_ids = {}
- audio_embed_sizes = _get_audio_embed_sizes(audios, ctx)
- else:
- audio_id_to_input_ids = {}
- audio_embed_sizes = []
-
- if 'image' in multi_modal_data:
- # PIL Image or list of PIL Images
- images = multi_modal_data["image"]
- if not isinstance(images, list):
- images = [images]
- if prompt_str is not None:
- image_id_to_input_ids = _get_image_id_to_input_ids(
- images, prompt_str, ctx)
- image_token_counts = []
- elif prompt_token_ids is not None:
- image_id_to_input_ids = {}
- image_token_counts = _count_image_tokens(images, ctx)
- else:
- image_id_to_input_ids = {}
- image_token_counts = []
-
- # Handle the case where the prompt is a string and we need to manually
- # tokenize it.
- # In this case, the `audio_id_to_input_ids` dict will be mapping from
- # an audio placeholder
- # string (e.g. `<|audio_1|>`) to the audio placeholder tokens for the
- # given audio length.
- if prompt_str:
- pattern = r"(<\|image_\d+\|>|<\|audio_\d+\|>)"
- prompt_chunk_strings = re.split(pattern, prompt_str)
- prompt_chunk_strings = [s for s in prompt_chunk_strings if s != ""]
-
- # Create the new input_ids with the placeholder image and audio
- # tokens inserted
- tokenizer = cached_tokenizer_from_config(ctx.model_config)
- input_ids = []
- has_imag, has_audio, has_user_text_input = False, False, False
- for prompt_chunk_string in prompt_chunk_strings:
- if re.match(IMAGE_TOKEN_PATTERN, prompt_chunk_string):
- input_ids.extend(image_id_to_input_ids[prompt_chunk_string])
- has_imag = True
- elif re.match(AUDIO_TOKEN_PATTERN, prompt_chunk_string):
- input_ids.extend(audio_id_to_input_ids[prompt_chunk_string])
- has_audio = True
- else:
- curr_token_ids = tokenizer(prompt_chunk_string).input_ids
- if not has_user_text_input:
- for token_id in curr_token_ids:
- if token_id not in NON_USER_INPUT_TOKENS:
- has_user_text_input = True
- break
- input_ids.extend(curr_token_ids)
- if has_audio and has_imag and has_user_text_input:
- raise ValueError(
- "Phi4MMForCausalLM does not support text + audio + image" +
- " inputs in the same prompt")
- # Handle the case where the prompt is already tokenized
- else:
- assert prompt_token_ids is not None, \
- "If string prompt isn't provided, prompt_token_ids must be"
-
- i = 0
- input_ids = prompt_token_ids
- # only needed for later assertion
- img_cnt, audio_cnt, user_text_input_cnt = 0, 0, 0
- image_token_count_iter = iter(image_token_counts)
- audio_embed_size_iter = iter(audio_embed_sizes)
- while i < len(input_ids):
- token_id = input_ids[i]
- if token_id == _AUDIO_PLACEHOLDER_TOKEN_ID:
- token_count = next(audio_embed_size_iter)
- audio_cnt += 1
- elif token_id == _IMAGE_PLACEHOLDER_TOKEN_ID:
- token_count = next(image_token_count_iter)
- img_cnt += 1
- else:
- user_text_input_cnt += 1 if token_id not in \
- NON_USER_INPUT_TOKENS else 0
- i += 1
- continue
- tokens = [token_id] * token_count
- input_ids = input_ids[:i] + tokens + input_ids[i + 1:]
- i += token_count
-
- if audio_cnt > 0 and img_cnt > 0 and user_text_input_cnt > 0:
- raise ValueError(
- "Phi4MMForCausalLM does not support text + audio + image" +
- " inputs in the same prompt")
- # If the below assertion fails, it might be that input pure-text
- # messages contain image/audio special tokens literally
- # (<|endoftext10|>, <|endoftext11|>).
- assert (img_cnt == len(image_token_counts)), (
- f"Number of image tokens in prompt_token_ids ({img_cnt}) "
- f"does not match number of images ({len(image_token_counts)})")
- assert (audio_cnt == len(audio_embed_sizes)), (
- f"Number of audio tokens in prompt_token_ids ({audio_cnt}) "
- f"does not match number of audios ({len(audio_embed_sizes)})")
-
- # NOTE: Create a defensive copy of the original inputs
- return token_inputs(
- prompt_token_ids=input_ids,
- prompt=prompt_str,
- multi_modal_data=multi_modal_data,
- )
-
-
def _compute_audio_embed_size(hf_config: PretrainedConfig, audio_frames: int) -> int:
"""
Compute the audio embedding size based on the audio frames and
@@ -1293,160 +652,6 @@ def get_max_phi4mm_audio_tokens(ctx: InputContext) -> int:
return 10000
-def dummy_audio_for_phi4mm(audio_count: int) -> dict:
- """
- Create dummy audio data for the Phi4MM model, which is used for profiling.
-
- Args:
- audio_count (int): Number of audio samples.
-
- Returns:
- dict: Dummy audio data.
- """
- dummy_audio = np.full((_AUDIO_MAX_SOUNDFILE_SIZE, ), 0.0)
- return [(dummy_audio, DUMMY_SAMPLING_FREQUENCY)] * audio_count
-
-
-def dummy_image_for_phi4mm(width: int, height: int):
- image = Image.new('RGB', (width, height), color='black')
- return image
-
-
-def dummy_data_for_phi4mm(ctx: InputContext, seq_len: int,
- mm_counts: Mapping[str, int]) -> DummyData:
- """
- Create dummy sequence (input_ids) and audio data for the Phi4MM model,
- which is used for profiling.
-
- In this case, the sequence data is a bunch of 0s with a number of audio
- tokens that correspond to the audio embed size of the
- _AUDIO_MAX_SOUNDFILE_SIZE.
-
- Args:
- ctx (InputContext): Input context.
- seq_len (int): Length of the sequence.
- mm_counts (Mapping[str, int]): Multi-modal counts.
-
- Returns:
- Tuple: Dummy sequence data and dummy audio data.
- """
- audio_count = mm_counts["audio"]
- audio_frames, _ = compute_logfbank_output_size(_AUDIO_MAX_SOUNDFILE_SIZE,
- DUMMY_SAMPLING_FREQUENCY)
- audio_feature_size = _compute_audio_embed_size(ctx.get_hf_config(),
- audio_frames)
-
- image_count = mm_counts["image"]
- dummy_image = get_max_dummy_image(ctx)
- max_image_tokens = get_max_phi4mm_image_tokens(ctx)
- total_image_tokens = image_count * max_image_tokens
-
- if seq_len - audio_feature_size * audio_count - total_image_tokens < 0:
- raise RuntimeError(
- f"Phi4MM cannot process {audio_count} audios and {image_count}"
- f"images in a prompt, please increase max_model_len to be at"
- f" larger than "
- f"{audio_feature_size * audio_count + total_image_tokens}"
- " or reduce audio/image limit by --limit-mm-per-prompt.")
-
- if audio_feature_size * audio_count > total_image_tokens:
- seq_data = SequenceData.from_prompt_token_counts(
- (_AUDIO_PLACEHOLDER_TOKEN_ID, audio_feature_size * audio_count),
- (0, seq_len - audio_feature_size * audio_count),
- )
- mm_data = {
- "audio": dummy_audio_for_phi4mm(audio_count),
- }
- else:
- seq_data = SequenceData.from_prompt_token_counts(
- (_IMAGE_PLACEHOLDER_TOKEN_ID, total_image_tokens),
- (0, seq_len - total_image_tokens),
- )
- mm_data = {
- "image": [dummy_image] * image_count,
- }
- return DummyData(seq_data, mm_data)
-
-
-def input_mapper_for_phi4mm_audio(ctx: InputContext,
- data: object) -> MultiModalInputs:
- """
- This function is used to create the MultiModalInputs for the Phi4MM
- (audio) model.
- Specifically, for audio, we extract the audio features from the sound
- file and create pairs of audio features and audio embed lengths (the
- latter of which is used to repeat the audio placeholder token in the
- input prompt IDs).
- These pairs are used, downstream, in `_audio_features_to_embeddings`
- (via `_process_audio_input`).
-
- Note that the incoming audio data (each entry in `data`) is a tuple of
- the audio data and the sampling frequency (e.g. from soundfile.read).
-
- Args:
- ctx (InputContext): Input context.
- data (object): Audio data.
-
- Returns:
- MultiModalInputs: Multi-modal inputs.
- """
- if not isinstance(data, list):
- data = [data]
-
- if len(data) == 0:
- return MultiModalInputs()
-
- audio_features = []
- for audio_input in data:
- if not isinstance(audio_input, tuple):
- raise NotImplementedError(
- f"Unsupported data type: {type(audio_input)}")
-
- audio, sf = audio_input
- feature_extractor = audio_feature_extractor()
- single_audio_features = feature_extractor.extract_features(audio, sf)
- feat_stride = (1 if not hasattr(feature_extractor, "stride") else
- feature_extractor.stride)
- audio_frames = len(single_audio_features) * feat_stride
- single_audio_embed_size = _compute_audio_embed_size(
- ctx.get_hf_config(), audio_frames)
- single_audio_feature_audio_len_pair = (
- single_audio_features,
- [single_audio_embed_size],
- )
- audio_features.append(single_audio_feature_audio_len_pair)
- return MultiModalInputs({"audio_features": audio_features})
-
-
-def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
- if not isinstance(data, list):
- data = [data]
- # data: list of PIL images
- if len(data) == 0:
- return MultiModalInputs()
- hf_config = ctx.get_hf_config()
- vision_encoder_name = hf_config.img_processor
- if vision_encoder_name is None:
- vision_encoder_name = SIGLIP_NAME
- prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
- dynamic_hd_size = prepro_config['dynamic_hd']
- vit_image_size = prepro_config['vit_image_size']
- vit_patch_size = prepro_config['vit_patch_size']
-
- image_input_dict = preprocess(data, dynamic_hd_size, vit_image_size,
- vit_patch_size)
- return MultiModalInputs({
- "pixel_values":
- image_input_dict["pixel_values"],
- "image_sizes":
- image_input_dict["image_sizes"],
- "image_attention_mask":
- image_input_dict["image_attention_mask"],
- "num_img_tokens":
- image_input_dict["num_img_tokens"],
- })
-
-
def cat_with_pad(tensors, dim, padding_value=0):
"""
cat along dim, while pad to max for all other dims
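Note: among the legacy code removed in patch 17 is the SpeechLib-style mel filterbank, which maps frequency to mel via mel(f) = 1127 * ln(1 + f / 700). A quick sanity check of that formula:

import math

def mel(f: float) -> float:
    return 1127.0 * math.log(1.0 + f / 700.0)

assert mel(0.0) == 0.0
assert abs(mel(1000.0) - 1000.0) < 0.5   # roughly 1000 mel at 1 kHz for this scale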
From 76f8b8e94b2458978110fc743ca2d8bfac11a967 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 2 Apr 2025 01:48:28 +0800
Subject: [PATCH 18/36] clean up
Signed-off-by: Isotr0py <2037008807@qq.com>
---
docs/source/models/supported_models.md | 2 +-
vllm/model_executor/models/phi4mm.py | 99 +++++++++-----------------
2 files changed, 35 insertions(+), 66 deletions(-)
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 1b742717885e..78c45db1025b 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -920,7 +920,7 @@ See [this page](#generative-models) for more information on how to use generativ
* `microsoft/Phi-4-multimodal-instruct`, etc.
* ✅︎
*
- *
+ * ✅︎
- * `PixtralForConditionalGeneration`
* Pixtral
* T + I+
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index a4b95c743782..8b145cb68cde 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -53,11 +53,6 @@
_AUDIO_PLACEHOLDER_TOKEN_ID = 200011
_AUDIO_MAX_SOUNDFILE_SIZE = 241_000
-DUMMY_SAMPLING_FREQUENCY = 16_000 # kHz
-
-DYNAMIC_HD = 16
-AUDIO_TOKEN_PATTERN = r"<\|audio_(\d+)\|>"
-IMAGE_TOKEN_PATTERN = r"<\|image_(\d+)\|>"
SIGLIP_NAME = "siglip-so400m-patch14-448"
VISION_ENCODER_TO_PROCESSING_CONFIG = {
@@ -68,15 +63,6 @@
'token_compression_factor': 2,
},
}
-logger = logging.get_logger(__name__)
-# This is a workaround to prevent text (user input) + audio + image
-# from being used in the same prompt.
-# It includes token ids for "/n" and tokens in added_tokens_decoder
-# from the tokenizer_confg.json file.
-NON_USER_INPUT_TOKENS = {
- 198, 200010, 200011, 199999, 200018, 200019, 200020, 200021, 200022,
- 200023, 200024, 200025, 200026, 200027, 200028
-}
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
@@ -116,7 +102,6 @@ def _find_target_aspect_ratio(orig_width: int, orig_height: int, image_size: int
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
- logger.debug("target_aspect_ratio: %s", target_aspect_ratio)
else:
target_width = image_size * w_crop_num
target_height = image_size * h_crop_num
@@ -588,45 +573,6 @@ def _compute_num_image_tokens(
num_hd_newline_tokens + num_global_image_newline_tokens)
-def compute_logfbank_output_size(wav_length: int, fs: int) -> Tuple[int, int]:
- """
- Compute the output size of the `extract_features` method.
-
- Args:
- wav_length (int): Length of the input waveform in samples.
- fs (int): Sampling rate of the waveform, either 16000 or 8000.
-
- Returns:
- tuple (int, int): Output size as (T, D), where:
- T: Number of time frames.
- D: Number of Mel filterbank bins (80).
- """
-
- # Resample to 16000 or 8000 if needed
- if fs > 16000:
- wav_length //= fs // 16000
- fs = 16000
- elif 8000 <= fs < 16000:
- # We'll resample to 16K from 8K
- wav_length *= 2
- fs = 16000
- elif fs < 8000:
- raise RuntimeError(f"Unsupported sample rate {fs}")
-
- # Spectrogram parameters for 16 kHz
- win_length = 400 # Frame length in samples
- hop_length = 160 # Frame shift in samples
- mel_bins = 80 # Number of mel filterbank bins
-
- # Calculate number of frames (T)
- T = (wav_length - win_length) // hop_length + 1
- if T < 1:
- raise ValueError("Waveform too short for given parameters.")
-
- # Return time frames (T) and mel bins (D)
- return T, mel_bins
-
-
def _compute_audio_embed_size(hf_config: PretrainedConfig, audio_frames: int) -> int:
"""
Compute the audio embedding size based on the audio frames and
@@ -755,18 +701,40 @@ def get_image_size_with_most_features(self) -> ImageSize:
max_side = vit_image_size * self.dynamic_hd
return ImageSize(height=max_side, width=vit_image_size)
- def get_audio_feature_nums(self, audio_len: int, sr: float):
- if sr >= 16000:
- win_length = 400
- hop_length = 160
+ def get_audio_feature_nums(self, audio_len: int, sr: float) -> int:
+ """
+ Compute the output size of the `extract_features` method.
+
+ Args:
+ audio_len (int): Length of the input waveform in samples.
+ sr (float): Sampling rate of the waveform, either 16000 or 8000.
+
+ Returns:
+ tuple (int, int): Output size as (T, D), where:
+ T: Number of time frames.
+ D: Number of Mel filterbank bins (80).
+ """
+
+ # Resample to 16000 or 8000 if needed
+ if sr > 16000:
+ audio_len //= sr // 16000
elif 8000 <= sr < 16000:
- win_length = 200
- hop_length = 80
- else:
- raise RuntimeError(f"Input data using an unsupported sample rate: {sr}")
+ # We'll resample to 16K from 8K
+ audio_len *= 2
+ elif sr < 8000:
+ raise RuntimeError(f"Unsupported sample rate {sr}")
+
+ # Spectrogram parameters for 16 kHz
+ win_length = 400 # Frame length in samples
+ hop_length = 160 # Frame shift in samples
- # Spec 1: SpeechLib cut remaining sample insufficient for a hop
- return (audio_len - win_length) // hop_length + 1
+ # Calculate number of frames (T)
+ T = (audio_len - win_length) // hop_length + 1
+ if T < 1:
+ raise ValueError("Waveform too short for given parameters.")
+
+ # Return time frames (T)
+ return T
class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
@@ -870,6 +838,7 @@ def _get_prompt_updates(
) -> Sequence[PromptUpdate]:
image_tokens: list[str] = self.info.image_tokens # type: ignore
audio_tokens: list[str] = self.info.audio_tokens # type: ignore
+ feature_extractor = self.info.get_feature_extractor()
def get_image_replacement_phi4mm(item_idx: int):
images = mm_items.get_items(
@@ -892,7 +861,7 @@ def get_audio_replacement_phi4mm(item_idx: int):
audios = mm_items.get_items("audio", AudioProcessorItems)
# TODO(Isotr0py): support embedding inputs
audio_len = audios.get_audio_length(item_idx)
- audio_frames, _ = compute_logfbank_output_size(audio_len, DUMMY_SAMPLING_FREQUENCY)
+ audio_frames = self.info.get_audio_feature_nums(audio_len, feature_extractor.sampling_rate)
audio_embed_size = _compute_audio_embed_size(self.info.get_hf_config(), audio_frames)
audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
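Note: a worked instance of the ceil-division in _compute_audio_embed_size. compression_rate=8 is an assumption for illustration (at runtime it comes from hf_config.embd_layer['audio_embd_layer']); qformer_compression_rate is hard-coded to 1 as in the patch:

audio_frames = 98                 # e.g. a 1 s clip at 16 kHz
compression_rate = 8              # assumed value, see note above
qformer_compression_rate = 1

integer, remainder = divmod(audio_frames, compression_rate)
result = integer if remainder == 0 else integer + 1        # ceil(98 / 8) = 13

integer, remainder = divmod(result, qformer_compression_rate)
result = integer if remainder == 0 else integer + 1        # no-op for rate 1
assert result == 13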
From fb6b659d9a5d5b74c7765fb71f89f8510d2fb18f Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 3 Apr 2025 00:17:26 +0800
Subject: [PATCH 19/36] clean up
Signed-off-by: Isotr0py <2037008807@qq.com>
---
examples/offline_inference/vision_language.py | 2 +-
.../vision_language_multi_image.py | 2 +-
vllm/model_executor/models/phi4mm.py | 310 ++++++++----------
3 files changed, 141 insertions(+), 173 deletions(-)
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index c1115708505a..754e63a68428 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -724,7 +724,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=4096,
+ max_model_len=12800,
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 2fb85c597974..87a988ee345b 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -392,7 +392,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=12800,
+ max_model_len=25600,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 8b145cb68cde..194e82448746 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -65,50 +65,6 @@
}
-def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
- image_size):
- best_ratio_diff = float('inf')
- best_ratio = (1, 1)
- area = width * height
- for ratio in target_ratios:
- target_aspect_ratio = ratio[0] / ratio[1]
- ratio_diff = abs(aspect_ratio - target_aspect_ratio)
- if ratio_diff < best_ratio_diff:
- best_ratio_diff = ratio_diff
- best_ratio = ratio
- elif ratio_diff == best_ratio_diff:
- if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
- best_ratio = ratio
- return best_ratio
-
-
-def _find_target_aspect_ratio(orig_width: int, orig_height: int, image_size: int, max_num: int, min_num: int,):
-
- w_crop_num = math.ceil(orig_width / float(image_size))
- h_crop_num = math.ceil(orig_height / float(image_size))
- if w_crop_num * h_crop_num > max_num:
- aspect_ratio = orig_width / orig_height
-
- # calculate the existing image aspect ratio
- target_ratios = set((i, j) for i in range(1, max_num + 1)
- for j in range(1, max_num + 1)
- if i * j <= max_num and i * j >= min_num)
- target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
- # find the closest aspect ratio to the target
- target_aspect_ratio = find_closest_aspect_ratio(
- aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-
- # calculate the target width and height
- target_width = image_size * target_aspect_ratio[0]
- target_height = image_size * target_aspect_ratio[1]
- else:
- target_width = image_size * w_crop_num
- target_height = image_size * h_crop_num
- target_aspect_ratio = (w_crop_num, h_crop_num)
- return target_aspect_ratio, target_height, target_width
-
-
def _get_padding_size(orig_width: int, orig_height: int, target_height: int, target_width: int):
ratio_width = target_width / orig_width
ratio_height = target_height / orig_height
@@ -494,110 +450,6 @@ class Phi4MMAudioEmbeddingInputs(TypedDict):
Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
-def _compute_num_image_tokens(
- orig_width: int,
- orig_height: int,
- dynamic_hd_size: int,
- vit_image_size: int,
- vit_patch_size: int,
- token_compression_factor: int = 2,
-):
- """
- compute the number of tokens an image is expected to take up considering
- the image encoder architecture and exclude output features containing
- only padding pixels
-
- for siglip, vit_image_size=448, vit_patch_size=14, so output will be
- 32x32 feature map
- NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
- """
- assert vit_image_size % vit_patch_size == 0, \
- "vit_image_size must be divisible by vit_patch_size"
- assert vit_image_size // vit_patch_size % token_compression_factor == 0, \
- "vit_image_size // vit_patch_size must be divisible by "\
- "token_compression_factor"
-
- target_aspect_ratio, target_height, target_width = (
- _find_target_aspect_ratio(orig_width,
- orig_height,
- vit_image_size,
- dynamic_hd_size,
- min_num=1))
- assert target_aspect_ratio[
- 0] * vit_image_size == target_width, \
- f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}"
- assert target_aspect_ratio[
- 1] * vit_image_size == target_height, \
- f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}"
- assert (target_height % vit_image_size == 0
- and target_width % vit_image_size == 0)
-
- padding_height, padding_width = _get_padding_size(orig_width, orig_height, target_height,
- target_width)
- assert padding_width == 0 or padding_height == 0, \
- "padding_width or padding_height must be 0"
-
- target_feat_width = target_width // vit_patch_size
- target_feat_height = target_height // vit_patch_size
- if padding_width >= vit_patch_size:
- assert padding_height == 0, "padding_height not 0"
- non_pad_feat_width = target_feat_width - math.floor(
- padding_width / vit_patch_size)
- non_pad_feat_height = target_feat_height
- elif padding_height >= vit_patch_size:
- assert padding_width == 0, "padding_width not 0"
- non_pad_feat_height = target_feat_height - math.floor(
- padding_height / vit_patch_size)
- non_pad_feat_width = target_feat_width
- else:
- # small padding shorter than a vit patch
- non_pad_feat_width = target_feat_width
- non_pad_feat_height = target_feat_height
-
- feat_width = non_pad_feat_width // token_compression_factor
- feat_height = non_pad_feat_height // token_compression_factor
- # NOTE it's possible that the non-padding feature is not divisible
- if non_pad_feat_width % token_compression_factor != 0:
- feat_width += 1
- if non_pad_feat_height % token_compression_factor != 0:
- feat_height += 1
- num_hd_patch_tokens = feat_width * feat_height
- num_hd_newline_tokens = feat_height
- vit_feature_size = vit_image_size // vit_patch_size
- num_global_image_tokens = (vit_feature_size // token_compression_factor)**2
- num_sep_tokens = 1
- num_global_image_newline_tokens = \
- vit_feature_size // token_compression_factor
-
- return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens +
- num_hd_newline_tokens + num_global_image_newline_tokens)
-
-
-def _compute_audio_embed_size(hf_config: PretrainedConfig, audio_frames: int) -> int:
- """
- Compute the audio embedding size based on the audio frames and
- compression rate.
- """
- compression_rate = hf_config.embd_layer['audio_embd_layer'][
- 'compression_rate']
- # NOTE: this is a hard-coded value but might be configurable in the future
- qformer_compression_rate = 1
- integer = audio_frames // compression_rate
- remainder = audio_frames % compression_rate
-
- result = integer if remainder == 0 else integer + 1
-
- integer = result // qformer_compression_rate
- remainder = result % qformer_compression_rate
- result = integer if remainder == 0 else integer + 1 # qformer compression
-
- return result
-
-
-def get_max_phi4mm_audio_tokens(ctx: InputContext) -> int:
- return 10000
-
-
def cat_with_pad(tensors, dim, padding_value=0):
"""
cat along dim, while pad to max for all other dims
@@ -656,12 +508,119 @@ def get_mm_max_tokens_per_item(
}
def get_max_audio_tokens(self) -> int:
- return 188
+ sr = self.get_feature_extractor().sampling_rate
+ num_frames = self.get_audio_num_frames(_AUDIO_MAX_SOUNDFILE_SIZE, sr)
+ return self._compute_audio_embed_size(num_frames)
def get_max_image_tokens(self) -> int:
target_width, target_height = self.get_image_size_with_most_features()
return self.get_num_image_tokens(
image_width=target_width, image_height=target_height)
+
+ def _find_target_aspect_ratio(self, orig_width: int, orig_height: int, image_size: int, max_num: int, min_num: int,):
+ w_crop_num = math.ceil(orig_width / float(image_size))
+ h_crop_num = math.ceil(orig_height / float(image_size))
+ if w_crop_num * h_crop_num > max_num:
+ aspect_ratio = orig_width / orig_height
+
+ # calculate the existing image aspect ratio
+ target_ratios = set((i, j) for i in range(1, max_num + 1)
+ for j in range(1, max_num + 1)
+ if i * j <= max_num and i * j >= min_num)
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+ # find the closest aspect ratio to the target
+ image_processor = self.get_hf_processor().image_processor
+ target_aspect_ratio = image_processor.find_closest_aspect_ratio(
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size,)
+
+ # calculate the target width and height
+ target_width = image_size * target_aspect_ratio[0]
+ target_height = image_size * target_aspect_ratio[1]
+ else:
+ target_width = image_size * w_crop_num
+ target_height = image_size * h_crop_num
+ target_aspect_ratio = (w_crop_num, h_crop_num)
+ return target_aspect_ratio, target_height, target_width
+
+ def _compute_num_image_tokens(
+ self,
+ orig_width: int,
+ orig_height: int,
+ dynamic_hd_size: int,
+ vit_image_size: int,
+ vit_patch_size: int,
+ token_compression_factor: int = 2,
+ ):
+ """
+ compute the number of tokens an image is expected to take up considering
+ the image encoder architecture and exclude output features containing
+ only padding pixels
+
+ for siglip, vit_image_size=448, vit_patch_size=14, so output will be
+ 32x32 feature map
+ NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
+ """
+ assert vit_image_size % vit_patch_size == 0, \
+ "vit_image_size must be divisible by vit_patch_size"
+ assert vit_image_size // vit_patch_size % token_compression_factor == 0, \
+ "vit_image_size // vit_patch_size must be divisible by "\
+ "token_compression_factor"
+
+ target_aspect_ratio, target_height, target_width = (
+ self._find_target_aspect_ratio(orig_width,
+ orig_height,
+ vit_image_size,
+ dynamic_hd_size,
+ min_num=1))
+ assert target_aspect_ratio[
+ 0] * vit_image_size == target_width, \
+ f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}"
+ assert target_aspect_ratio[
+ 1] * vit_image_size == target_height, \
+ f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}"
+ assert (target_height % vit_image_size == 0
+ and target_width % vit_image_size == 0)
+
+ padding_height, padding_width = _get_padding_size(orig_width, orig_height, target_height,
+ target_width)
+ assert padding_width == 0 or padding_height == 0, \
+ "padding_width or padding_height must be 0"
+
+ target_feat_width = target_width // vit_patch_size
+ target_feat_height = target_height // vit_patch_size
+ if padding_width >= vit_patch_size:
+ assert padding_height == 0, "padding_height not 0"
+ non_pad_feat_width = target_feat_width - math.floor(
+ padding_width / vit_patch_size)
+ non_pad_feat_height = target_feat_height
+ elif padding_height >= vit_patch_size:
+ assert padding_width == 0, "padding_width not 0"
+ non_pad_feat_height = target_feat_height - math.floor(
+ padding_height / vit_patch_size)
+ non_pad_feat_width = target_feat_width
+ else:
+ # small padding shorter than a vit patch
+ non_pad_feat_width = target_feat_width
+ non_pad_feat_height = target_feat_height
+
+ feat_width = non_pad_feat_width // token_compression_factor
+ feat_height = non_pad_feat_height // token_compression_factor
+ # NOTE it's possible that the non-padding feature is not divisible
+ if non_pad_feat_width % token_compression_factor != 0:
+ feat_width += 1
+ if non_pad_feat_height % token_compression_factor != 0:
+ feat_height += 1
+ num_hd_patch_tokens = feat_width * feat_height
+ num_hd_newline_tokens = feat_height
+ vit_feature_size = vit_image_size // vit_patch_size
+ num_global_image_tokens = (vit_feature_size // token_compression_factor)**2
+ num_sep_tokens = 1
+ num_global_image_newline_tokens = \
+ vit_feature_size // token_compression_factor
+
+ return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens +
+ num_hd_newline_tokens + num_global_image_newline_tokens)
def get_num_image_tokens(
self,
@@ -680,7 +639,7 @@ def get_num_image_tokens(
dynamic_hd_size = self.dynamic_hd
- image_num_tokens = _compute_num_image_tokens(
+ image_num_tokens = self._compute_num_image_tokens(
image_width, image_height,
dynamic_hd_size=dynamic_hd_size,
vit_image_size=vit_image_size,
@@ -701,7 +660,7 @@ def get_image_size_with_most_features(self) -> ImageSize:
max_side = vit_image_size * self.dynamic_hd
return ImageSize(height=max_side, width=vit_image_size)
- def get_audio_feature_nums(self, audio_len: int, sr: float) -> int:
+ def get_audio_num_frames(self, audio_len: int, sr: float) -> int:
"""
Compute the output size of the `extract_features` method.
@@ -729,12 +688,34 @@ def get_audio_feature_nums(self, audio_len: int, sr: float) -> int:
hop_length = 160 # Frame shift in samples
# Calculate number of frames (T)
- T = (audio_len - win_length) // hop_length + 1
- if T < 1:
+ num_frames = (audio_len - win_length) // hop_length + 1
+ if num_frames < 1:
raise ValueError("Waveform too short for given parameters.")
# Return time frames (T)
- return T
+ return num_frames
+
+ def _compute_audio_embed_size(self, audio_frames: int) -> int:
+ """
+ Compute the audio embedding size based on the audio frames and
+ compression rate.
+ """
+ hf_config = self.get_hf_config()
+ compression_rate = hf_config.embd_layer['audio_embd_layer'][
+ 'compression_rate']
+ # NOTE: this is a hard-coded value but might be configurable
+ # in the future
+ qformer_compression_rate = 1
+ integer = audio_frames // compression_rate
+ remainder = audio_frames % compression_rate
+
+ result = integer if remainder == 0 else integer + 1
+
+ integer = result // qformer_compression_rate
+ remainder = result % qformer_compression_rate
+ result = integer if remainder == 0 else integer + 1 # qformer compression
+
+ return result
class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
@@ -806,7 +787,7 @@ def _call_hf_processor(
processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds')
if "audios" in mm_data:
audio_features = processed_outputs['input_audio_embeds']
- feature_sizes = [self.info.get_audio_feature_nums(len(audio), sr) for audio, sr in mm_data['audios']]
+ feature_sizes = [self.info.get_audio_num_frames(len(audio), sr) for audio, sr in mm_data['audios']]
processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)]
else:
tokenizer = self.info.get_tokenizer()
@@ -861,8 +842,8 @@ def get_audio_replacement_phi4mm(item_idx: int):
audios = mm_items.get_items("audio", AudioProcessorItems)
# TODO(Isotr0py): support embedding inputs
audio_len = audios.get_audio_length(item_idx)
- audio_frames = self.info.get_audio_feature_nums(audio_len, feature_extractor.sampling_rate)
- audio_embed_size = _compute_audio_embed_size(self.info.get_hf_config(), audio_frames)
+ audio_frames = self.info.get_audio_num_frames(audio_len, feature_extractor.sampling_rate)
+ audio_embed_size = self.info._compute_audio_embed_size(audio_frames)
audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
@@ -888,16 +869,6 @@ def get_audio_replacement_phi4mm(item_idx: int):
return image_repl + audio_repl
-# @MULTIMODAL_REGISTRY.register_input_mapper("audio",
-# input_mapper_for_phi4mm_audio)
-# @MULTIMODAL_REGISTRY.register_input_mapper("image",
-# input_mapper_for_phi4mm_image)
-# @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
-# "audio", get_max_phi4mm_audio_tokens)
-# @MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
-# "image", get_max_phi4mm_image_tokens)
-# @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm)
-# @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
@MULTIMODAL_REGISTRY.register_processor(
Phi4MMMultiModalProcessor,
info=Phi4MMProcessingInfo,
@@ -1055,10 +1026,7 @@ def _parse_and_validate_audio_input(
if isinstance(audio_features, torch.Tensor):
assert audio_features.size(0) == len(audio_embed_sizes), (
"audio_features and audio_embed_sizes must have the same length")
- elif is_list_of(audio_features, torch.Tensor):
- assert len(audio_features) == len(audio_embed_sizes), (
- "audio_features and audio_embed_sizes must have the same length")
- elif is_list_of(audio_features, list):
+ elif is_list_of(audio_features, (torch.Tensor, list)):
assert len(audio_features) == len(audio_embed_sizes), (
"audio_features and audio_embed_sizes must have the same length")
else:
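Note: a worked instance of the global-image term in _compute_num_image_tokens, using the SigLIP settings named in its docstring (vit_image_size=448, vit_patch_size=14, token_compression_factor=2):

vit_image_size, vit_patch_size, token_compression_factor = 448, 14, 2
vit_feature_size = vit_image_size // vit_patch_size            # 32x32 feature map
num_global_image_tokens = (vit_feature_size // token_compression_factor) ** 2
num_sep_tokens = 1
num_global_image_newline_tokens = vit_feature_size // token_compression_factor
assert (num_global_image_tokens, num_sep_tokens,
        num_global_image_newline_tokens) == (256, 1, 16)      # the 256 + 1 ... + 16 seen in the legacy preprocess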
From 9fab0e45b978a6e88a1705ad803ad7f244ec446e Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 3 Apr 2025 00:58:57 +0800
Subject: [PATCH 20/36] minor refactor
Signed-off-by: Isotr0py <2037008807@qq.com>
---
examples/offline_inference/audio_language.py | 2 +-
vllm/model_executor/models/phi4mm.py | 67 ++++++++++----------
2 files changed, 35 insertions(+), 34 deletions(-)
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 840892ea0701..fff06e466359 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -89,7 +89,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=4096,
+ max_model_len=12800,
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 194e82448746..eac8c3fb57f6 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -774,26 +774,27 @@ def _call_hf_processor(
mm_data: Mapping[str, object],
mm_kwargs: Mapping[str, object],
) -> BatchFeature:
- if mm_data:
- if "audios" in mm_data:
- sr = self.info.get_feature_extractor().sampling_rate
- mm_data['audios'] = [(data, sr) for data in mm_data['audios']]
- processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs)
- num_img_tokens = [
- self.info.get_num_image_tokens(image_width=img_size[0], image_height=img_size[1])
- for img_size in processed_outputs["image_sizes"]
- ]
- processed_outputs["num_img_tokens"] = num_img_tokens
- processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds')
- if "audios" in mm_data:
- audio_features = processed_outputs['input_audio_embeds']
- feature_sizes = [self.info.get_audio_num_frames(len(audio), sr) for audio, sr in mm_data['audios']]
- processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)]
- else:
- tokenizer = self.info.get_tokenizer()
- processed_outputs = tokenizer(prompt,
- add_special_tokens=True,
- return_tensors="pt")
+ if not mm_data:
+ prompt_ids = self.info.get_tokenizer().encode(prompt)
+ prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
+ return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
+
+ sr = self.info.get_feature_extractor().sampling_rate
+ if (audio_data := mm_data.get("audios", [])):
+ mm_data['audios'] = [(data, sr) for data in audio_data]
+
+ processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs)
+
+ num_img_tokens = [
+ self.info.get_num_image_tokens(image_width=img_size[0], image_height=img_size[1])
+ for img_size in processed_outputs["image_sizes"]
+ ]
+ processed_outputs["num_img_tokens"] = num_img_tokens
+
+ audio_features = processed_outputs['input_audio_embeds']
+ feature_sizes = [self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data]
+ processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)]
+
return processed_outputs
def _get_mm_fields_config(
@@ -802,7 +803,7 @@ def _get_mm_fields_config(
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(
- pixel_values=MultiModalFieldConfig.batched("image"),
+ input_image_embeds=MultiModalFieldConfig.batched("image"),
image_attention_mask=MultiModalFieldConfig.batched("image"),
image_sizes=MultiModalFieldConfig.batched("image"),
num_img_tokens=MultiModalFieldConfig.batched("image"),
@@ -1076,8 +1077,8 @@ def _process_audio_input(self,
def _parse_and_validate_image_input(self,
**kwargs: object) -> Optional[Dict]:
- pixel_values: NestedTensors = kwargs.get("pixel_values")
- if pixel_values is None:
+ input_image_embeds: NestedTensors = kwargs.get("input_image_embeds")
+ if input_image_embeds is None:
return None
image_sizes = kwargs.get("image_sizes")
@@ -1086,23 +1087,23 @@ def _parse_and_validate_image_input(self,
assert image_sizes is not None and image_attention_mask is not None\
and num_img_tokens is not None, "Missing image inputs"
- if is_list_of(pixel_values, torch.Tensor):
- assert all(p.dim() == 5 for p in pixel_values), "Incorrect image inputs"
+ if is_list_of(input_image_embeds, torch.Tensor):
+ assert all(p.dim() == 5 for p in input_image_embeds), "Incorrect image inputs"
# list len is batch_size.
# each tensor has dimension: num_img_per_example, num_hd_patches,
# channels, height, width.
# need to pad along num_hd_patches.
# mask size num_img_per_prompt, num_hd_patches, feat_h, heat_w.
- pixel_values = cat_with_pad(pixel_values, dim=0)
- elif isinstance(pixel_values, torch.Tensor):
+ input_image_embeds = cat_with_pad(input_image_embeds, dim=0)
+ elif isinstance(input_image_embeds, torch.Tensor):
# dimension: batch_size, num_img_per_example, num_hd_patches,
# channels, height, width.
# we flatten first 2 dims to make it a single large batch for
# SigLIP Encoder.
- assert pixel_values.dim() == 6, "Incorrect image inputs"
- pixel_values = pixel_values.flatten(0, 1)
+ assert input_image_embeds.dim() == 6, "Incorrect image inputs"
+ input_image_embeds = input_image_embeds.flatten(0, 1)
else:
- raise ValueError("Incorrect pixel_values inputs")
+ raise ValueError("Incorrect input_image_embeds inputs")
if isinstance(image_attention_mask, list):
image_attention_mask = cat_with_pad(image_attention_mask, dim=0)
@@ -1129,8 +1130,8 @@ def _parse_and_validate_image_input(self,
raise ValueError("Incorrect image_attention_mask inputs")
return Phi4MMImagePixelInputs(
- type="pixel_values_videos",
- data=pixel_values,
+ type="pixel_values",
+ data=input_image_embeds,
image_sizes=image_sizes,
image_attention_mask=image_attention_mask,
num_img_tokens=num_img_tokens,
@@ -1142,7 +1143,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
# Preserve the order of modalities if there are multiple of them
# from the order of kwargs.
for input_key in kwargs:
- if input_key in ("pixel_values",
+ if input_key in ("input_image_embeds",
"image_embeds") and "images" not in modalities:
modalities["images"] = self._parse_and_validate_image_input(
**kwargs)
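
One behaviour worth calling out in the `_call_hf_processor` rewrite: the HF processor pads `input_audio_embeds` to the longest audio in the request, so each row is sliced back to its true frame count before being stored per item. A standalone illustration of that trimming, where the axis order is an assumption read off the slice in the hunk (dim 1 being the time axis):

import torch

padded = torch.zeros(2, 500, 80)    # assumed (num_audios, T_max, feat_dim)
feature_sizes = [320, 500]          # true frame counts per audio in the request
trimmed = [padded[idx, :size] for idx, size in enumerate(feature_sizes)]
assert [t.shape[0] for t in trimmed] == feature_sizes
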
From 660cfd79982bd8c93700581754ba13d4e2c9bc4a Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 3 Apr 2025 01:25:34 +0800
Subject: [PATCH 21/36] minor fix
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index eac8c3fb57f6..eccc456d9bae 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -395,7 +395,7 @@ def forward(self, pixel_values: torch.FloatTensor,
for _output_img in output_imgs:
img_feature_proj = self.img_projection(
_output_img.to(target_device).to(target_dtype))
- img_set_tensor.append(img_feature_proj)
+ img_set_tensor.append(img_feature_proj.squeeze(0))
return img_set_tensor
From 341a8f99b5ac48eac85b3f3efe4443385a766073 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 3 Apr 2025 01:38:30 +0800
Subject: [PATCH 22/36] code format
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 208 +++++++++++++++------------
1 file changed, 120 insertions(+), 88 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index eccc456d9bae..058255c583a3 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1,29 +1,21 @@
# SPDX-License-Identifier: Apache-2.0
import math
-import re
from collections.abc import Iterable, Mapping, Sequence
-from functools import lru_cache
-from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple,
- TypedDict, Union)
+from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
import numpy as np
import numpy.typing as npt
import scipy.signal
import torch
import torch.nn as nn
-import torchvision.transforms as T
-from PIL import Image
-from transformers import PretrainedConfig, SiglipVisionConfig, ProcessorMixin, BatchFeature, SequenceFeatureExtractor
-from transformers.utils import logging
+from transformers import (BatchFeature, PretrainedConfig,
+ SequenceFeatureExtractor, SiglipVisionConfig)
from vllm.config import VllmConfig
from vllm.distributed import get_pp_group
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
- InputContext)
-from vllm.inputs.data import TokenInputs, token_inputs
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.layers.sampler import Sampler, SamplerOutput, get_sampler
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead)
from vllm.model_executor.models.llama import LlamaModel
@@ -31,21 +23,22 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
- NestedTensors, MultiModalInputs)
-from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataParser,
- ImageSize, MultiModalDataItems, AudioEmbeddingItems, AudioProcessorItems)
+ NestedTensors)
+from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
+ ImageProcessorItems, ImageSize,
+ MultiModalDataItems, MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
- PromptUpdate, PromptUpdateDetails)
+ PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
-from vllm.sequence import IntermediateTensors, SequenceData
-from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of
from .idefics2_vision_model import Idefics2VisionTransformer
-from .interfaces import SupportsLoRA, SupportsMultiModal, MultiModalEmbeddings
+from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal
from .phi4mm_audio import AudioEmbedding
-from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix, merge_multimodal_embeddings, flatten_bn
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix,
+ merge_multimodal_embeddings)
# <|endoftext10|> (see vocab.json in hf model)
_IMAGE_PLACEHOLDER_TOKEN_ID = 200010
@@ -65,7 +58,8 @@
}
-def _get_padding_size(orig_width: int, orig_height: int, target_height: int, target_width: int):
+def _get_padding_size(orig_width: int, orig_height: int, target_height: int,
+ target_width: int):
ratio_width = target_width / orig_width
ratio_height = target_height / orig_height
@@ -485,12 +479,12 @@ def image_tokens(self) -> list[str]:
@property
def audio_tokens(self) -> list[str]:
return [f"<|audio_{i+1}|>" for i in range(100)]
-
+
@property
def dynamic_hd(self) -> int:
image_processor = self.get_hf_processor().image_processor
return image_processor.dynamic_hd
-
+
def get_feature_extractor(self) -> SequenceFeatureExtractor:
return self.get_hf_processor().audio_processor
@@ -511,13 +505,20 @@ def get_max_audio_tokens(self) -> int:
sr = self.get_feature_extractor().sampling_rate
num_frames = self.get_audio_num_frames(_AUDIO_MAX_SOUNDFILE_SIZE, sr)
return self._compute_audio_embed_size(num_frames)
-
+
def get_max_image_tokens(self) -> int:
target_width, target_height = self.get_image_size_with_most_features()
- return self.get_num_image_tokens(
- image_width=target_width, image_height=target_height)
-
- def _find_target_aspect_ratio(self, orig_width: int, orig_height: int, image_size: int, max_num: int, min_num: int,):
+ return self.get_num_image_tokens(image_width=target_width,
+ image_height=target_height)
+
+ def _find_target_aspect_ratio(
+ self,
+ orig_width: int,
+ orig_height: int,
+ image_size: int,
+ max_num: int,
+ min_num: int,
+ ):
w_crop_num = math.ceil(orig_width / float(image_size))
h_crop_num = math.ceil(orig_height / float(image_size))
if w_crop_num * h_crop_num > max_num:
@@ -532,7 +533,12 @@ def _find_target_aspect_ratio(self, orig_width: int, orig_height: int, image_siz
# find the closest aspect ratio to the target
image_processor = self.get_hf_processor().image_processor
target_aspect_ratio = image_processor.find_closest_aspect_ratio(
- aspect_ratio, target_ratios, orig_width, orig_height, image_size,)
+ aspect_ratio,
+ target_ratios,
+ orig_width,
+ orig_height,
+ image_size,
+ )
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
@@ -542,7 +548,7 @@ def _find_target_aspect_ratio(self, orig_width: int, orig_height: int, image_siz
target_height = image_size * h_crop_num
target_aspect_ratio = (w_crop_num, h_crop_num)
return target_aspect_ratio, target_height, target_width
-
+
def _compute_num_image_tokens(
self,
orig_width: int,
@@ -553,7 +559,7 @@ def _compute_num_image_tokens(
token_compression_factor: int = 2,
):
"""
- compute the number of tokens an image is expected to take up considering
+ compute the number of tokens an image is expected to take up considering
the image encoder architecture and exclude output features containing
only padding pixels
@@ -561,29 +567,28 @@ def _compute_num_image_tokens(
32x32 feature map
NOTE right now, Phi4MM uses hard-coded token_compression_factor=2
"""
- assert vit_image_size % vit_patch_size == 0, \
- "vit_image_size must be divisible by vit_patch_size"
- assert vit_image_size // vit_patch_size % token_compression_factor == 0, \
- "vit_image_size // vit_patch_size must be divisible by "\
- "token_compression_factor"
+ assert vit_image_size % vit_patch_size == 0, (
+ "vit_image_size must be divisible by vit_patch_size")
+ assert (vit_image_size // vit_patch_size %
+ token_compression_factor == 0), (
+ "vit_image_size // vit_patch_size must be divisible by "
+ "token_compression_factor")
target_aspect_ratio, target_height, target_width = (
self._find_target_aspect_ratio(orig_width,
- orig_height,
- vit_image_size,
- dynamic_hd_size,
- min_num=1))
- assert target_aspect_ratio[
- 0] * vit_image_size == target_width, \
- f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}"
- assert target_aspect_ratio[
- 1] * vit_image_size == target_height, \
- f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}"
+ orig_height,
+ vit_image_size,
+ dynamic_hd_size,
+ min_num=1))
+ assert target_aspect_ratio[0] * vit_image_size == target_width, (
+ f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}")
+ assert target_aspect_ratio[1] * vit_image_size == target_height, (
+ f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}")
assert (target_height % vit_image_size == 0
and target_width % vit_image_size == 0)
- padding_height, padding_width = _get_padding_size(orig_width, orig_height, target_height,
- target_width)
+ padding_height, padding_width = _get_padding_size(
+ orig_width, orig_height, target_height, target_width)
assert padding_width == 0 or padding_height == 0, \
"padding_width or padding_height must be 0"
@@ -614,13 +619,15 @@ def _compute_num_image_tokens(
num_hd_patch_tokens = feat_width * feat_height
num_hd_newline_tokens = feat_height
vit_feature_size = vit_image_size // vit_patch_size
- num_global_image_tokens = (vit_feature_size // token_compression_factor)**2
+ num_global_image_tokens = (vit_feature_size //
+ token_compression_factor)**2
num_sep_tokens = 1
num_global_image_newline_tokens = \
vit_feature_size // token_compression_factor
- return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens +
- num_hd_newline_tokens + num_global_image_newline_tokens)
+ return (num_global_image_tokens + num_sep_tokens +
+ num_hd_patch_tokens + num_hd_newline_tokens +
+ num_global_image_newline_tokens)
def get_num_image_tokens(
self,
@@ -632,7 +639,8 @@ def get_num_image_tokens(
vision_encoder_name = hf_config.img_processor
if vision_encoder_name is None:
vision_encoder_name = SIGLIP_NAME
- prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
+ prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[
+ vision_encoder_name]
vit_image_size = prepro_config['vit_image_size']
vit_patch_size = prepro_config['vit_patch_size']
token_compression_factor = prepro_config['token_compression_factor']
@@ -640,7 +648,8 @@ def get_num_image_tokens(
dynamic_hd_size = self.dynamic_hd
image_num_tokens = self._compute_num_image_tokens(
- image_width, image_height,
+ image_width,
+ image_height,
dynamic_hd_size=dynamic_hd_size,
vit_image_size=vit_image_size,
vit_patch_size=vit_patch_size,
@@ -654,12 +663,13 @@ def get_image_size_with_most_features(self) -> ImageSize:
vision_encoder_name = hf_config.img_processor
if vision_encoder_name is None:
vision_encoder_name = SIGLIP_NAME
- prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name]
+ prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[
+ vision_encoder_name]
vit_image_size = prepro_config['vit_image_size']
max_side = vit_image_size * self.dynamic_hd
return ImageSize(height=max_side, width=vit_image_size)
-
+
def get_audio_num_frames(self, audio_len: int, sr: float) -> int:
"""
Compute the output size of the `extract_features` method.
@@ -694,7 +704,7 @@ def get_audio_num_frames(self, audio_len: int, sr: float) -> int:
# Return time frames (T)
return num_frames
-
+
def _compute_audio_embed_size(self, audio_frames: int) -> int:
"""
Compute the audio embedding size based on the audio frames and
@@ -703,7 +713,7 @@ def _compute_audio_embed_size(self, audio_frames: int) -> int:
hf_config = self.get_hf_config()
compression_rate = hf_config.embd_layer['audio_embd_layer'][
'compression_rate']
- # NOTE: this is a hard-coded value but might be configurable
+ # NOTE: this is a hard-coded value but might be configurable
# in the future
qformer_compression_rate = 1
integer = audio_frames // compression_rate
@@ -713,7 +723,8 @@ def _compute_audio_embed_size(self, audio_frames: int) -> int:
integer = result // qformer_compression_rate
remainder = result % qformer_compression_rate
- result = integer if remainder == 0 else integer + 1 # qformer compression
+ # qformer compression
+ result = integer if remainder == 0 else integer + 1
return result
@@ -736,15 +747,16 @@ def get_dummy_processor_inputs(
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images),
- "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
- num_audios=num_audios),
+ "audio":
+ self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE,
+ num_audios=num_audios),
}
image_tokens: list[str] = self.info.image_tokens[:num_images]
audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
return ProcessorInputs(
- prompt_text="".join(image_tokens+audio_tokens),
+ prompt_text="".join(image_tokens + audio_tokens),
mm_data=mm_data,
)
@@ -760,13 +772,16 @@ def scipy_resample_audio(
target_sr: float,
):
if orig_sr > target_sr:
- return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
+ return scipy.signal.resample_poly(audio, 1,
+ orig_sr // target_sr)
elif orig_sr < target_sr:
- return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1)
+ return scipy.signal.resample_poly(audio, target_sr // orig_sr,
+ 1)
return audio
feature_extractor = self.info.get_feature_extractor()
- return MultiModalDataParser(target_sr=feature_extractor.sampling_rate, resample_func=scipy_resample_audio)
+ return MultiModalDataParser(target_sr=feature_extractor.sampling_rate,
+ resample_func=scipy_resample_audio)
def _call_hf_processor(
self,
@@ -783,18 +798,26 @@ def _call_hf_processor(
if (audio_data := mm_data.get("audios", [])):
mm_data['audios'] = [(data, sr) for data in audio_data]
- processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs)
+ processed_outputs = super()._call_hf_processor(prompt, mm_data,
+ mm_kwargs)
num_img_tokens = [
- self.info.get_num_image_tokens(image_width=img_size[0], image_height=img_size[1])
+ self.info.get_num_image_tokens(image_width=img_size[0],
+ image_height=img_size[1])
for img_size in processed_outputs["image_sizes"]
]
processed_outputs["num_img_tokens"] = num_img_tokens
audio_features = processed_outputs['input_audio_embeds']
- feature_sizes = [self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data]
- processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)]
-
+ feature_sizes = [
+ self.info.get_audio_num_frames(len(audio), sr)
+ for audio in audio_data
+ ]
+ processed_outputs['input_audio_embeds'] = [
+ audio_features[idx, :size]
+ for idx, size in enumerate(feature_sizes)
+ ]
+
return processed_outputs
def _get_mm_fields_config(
@@ -843,8 +866,10 @@ def get_audio_replacement_phi4mm(item_idx: int):
audios = mm_items.get_items("audio", AudioProcessorItems)
# TODO(Isotr0py): support embedding inputs
audio_len = audios.get_audio_length(item_idx)
- audio_frames = self.info.get_audio_num_frames(audio_len, feature_extractor.sampling_rate)
- audio_embed_size = self.info._compute_audio_embed_size(audio_frames)
+ audio_frames = self.info.get_audio_num_frames(
+ audio_len, feature_extractor.sampling_rate)
+ audio_embed_size = self.info._compute_audio_embed_size(
+ audio_frames)
audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size
@@ -1026,10 +1051,12 @@ def _parse_and_validate_audio_input(
assert isinstance(audio_embed_sizes, torch.Tensor)
if isinstance(audio_features, torch.Tensor):
assert audio_features.size(0) == len(audio_embed_sizes), (
- "audio_features and audio_embed_sizes must have the same length")
+ "audio_features and audio_embed_sizes "
+ "must have the same length")
elif is_list_of(audio_features, (torch.Tensor, list)):
assert len(audio_features) == len(audio_embed_sizes), (
- "audio_features and audio_embed_sizes must have the same length")
+ "audio_features and audio_embed_sizes "
+ "must have the same length")
else:
raise ValueError("Incorrect type of audio features. "
f"Got type: {type(audio_features)}")
@@ -1047,8 +1074,7 @@ def _parse_and_validate_audio_input(
raise AssertionError("This line should be unreachable.")
- def _process_audio_input(self,
- audio_input: Phi4MMAudioInputs,
+ def _process_audio_input(self, audio_input: Phi4MMAudioInputs,
audio_projection_mode: str) -> NestedTensors:
"""
Create the audio embeddings from the audio input, where the audio input
@@ -1069,10 +1095,12 @@ def _process_audio_input(self,
# (e.g. multiple audios in the same example)
dtype = next(self.embed_tokens_extend.parameters()).dtype
- audio_embeds = [self.embed_tokens_extend.get_audio_features(
- features.unsqueeze(0).to(dtype),
- audio_projection_mode=audio_projection_mode,
- ).squeeze(0) for features in audio_features]
+ audio_embeds = [
+ self.embed_tokens_extend.get_audio_features(
+ features.unsqueeze(0).to(dtype),
+ audio_projection_mode=audio_projection_mode,
+ ).squeeze(0) for features in audio_features
+ ]
return audio_embeds
def _parse_and_validate_image_input(self,
@@ -1088,7 +1116,8 @@ def _parse_and_validate_image_input(self,
and num_img_tokens is not None, "Missing image inputs"
if is_list_of(input_image_embeds, torch.Tensor):
- assert all(p.dim() == 5 for p in input_image_embeds), "Incorrect image inputs"
+ assert all(p.dim() == 5
+ for p in input_image_embeds), "Incorrect image inputs"
# list len is batch_size.
# each tensor has dimension: num_img_per_example, num_hd_patches,
# channels, height, width.
@@ -1153,8 +1182,9 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
**kwargs)
return modalities
-
- def _process_image_input(self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
+
+ def _process_image_input(
+ self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
if image_input["type"] == "image_embeds":
image_embeds = image_input["image_embeds"].type(self.visual.dtype)
else:
@@ -1162,10 +1192,10 @@ def _process_image_input(self, image_input: Phi4MMImagePixelInputs) -> list[torc
pixel_values = image_input['data'].to(dtype)
image_sizes = image_input['image_sizes']
image_attention_mask = image_input['image_attention_mask']
- image_embeds = self.vision_encoder(
- pixel_values, image_sizes, image_attention_mask)
+ image_embeds = self.vision_encoder(pixel_values, image_sizes,
+ image_attention_mask)
return image_embeds
-
+
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
@@ -1189,7 +1219,8 @@ def get_multimodal_embeddings(
multimodal_embeddings += tuple(vision_embeddings)
if modality == "audios":
audio_input = modalities["audios"]
- audio_embeddings = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode)
+ audio_embeddings = self._process_audio_input(
+ audio_input, audio_projection_mode=audio_projection_mode)
multimodal_embeddings += tuple(audio_embeddings)
return multimodal_embeddings
@@ -1205,7 +1236,7 @@ def get_input_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
[_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
return inputs_embeds
-
+
def get_input_embeddings_v0(
self,
input_ids: torch.Tensor,
@@ -1225,7 +1256,8 @@ def get_input_embeddings_v0(
audio_projection_mode = 'vision'
if audio_input is not None:
- audio_embeds = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode)
+ audio_embeds = self._process_audio_input(
+ audio_input, audio_projection_mode=audio_projection_mode)
inputs_embeds = merge_multimodal_embeddings(
input_ids,
inputs_embeds,
From 335a29ee53ee361d9bc077bda494f56095e912e8 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 3 Apr 2025 02:02:43 +0800
Subject: [PATCH 23/36] refactor audio resample
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 19 +----------
vllm/multimodal/audio.py | 51 ++++++++++++++++++++++++++--
vllm/multimodal/parse.py | 30 ++++++++--------
3 files changed, 65 insertions(+), 35 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 058255c583a3..4c619c13aaca 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -4,8 +4,6 @@
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
import numpy as np
-import numpy.typing as npt
-import scipy.signal
import torch
import torch.nn as nn
from transformers import (BatchFeature, PretrainedConfig,
@@ -764,24 +762,9 @@ def get_dummy_processor_inputs(
class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
def _get_data_parser(self) -> MultiModalDataParser:
-
- def scipy_resample_audio(
- audio: npt.NDArray[np.floating],
- *,
- orig_sr: float,
- target_sr: float,
- ):
- if orig_sr > target_sr:
- return scipy.signal.resample_poly(audio, 1,
- orig_sr // target_sr)
- elif orig_sr < target_sr:
- return scipy.signal.resample_poly(audio, target_sr // orig_sr,
- 1)
- return audio
-
feature_extractor = self.info.get_feature_extractor()
return MultiModalDataParser(target_sr=feature_extractor.sampling_rate,
- resample_func=scipy_resample_audio)
+ audio_resample_method="scipy")
def _call_hf_processor(
self,
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index f379ec1682a3..0359a1324ef1 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,11 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
-
import base64
from io import BytesIO
from pathlib import Path
+from typing import Literal, Optional
import numpy as np
import numpy.typing as npt
+import scipy.signal
from vllm.inputs.registry import InputContext
from vllm.utils import PlaceholderModule
@@ -43,7 +44,7 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
"There is no default maximum multimodal tokens")
-def resample_audio(
+def resample_audio_librosa(
audio: npt.NDArray[np.floating],
*,
orig_sr: float,
@@ -52,6 +53,52 @@ def resample_audio(
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+def resample_audio_scipy(
+ audio: npt.NDArray[np.floating],
+ *,
+ orig_sr: float,
+ target_sr: float,
+):
+ if orig_sr > target_sr:
+ return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
+ elif orig_sr < target_sr:
+ return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1)
+ return audio
+
+
+class AudioResampler:
+ """Resample audio data to a target sample rate."""
+
+ def __init__(
+ self,
+ target_sr: Optional[float] = None,
+ method: Literal["librosa", "scipy"] = "librosa",
+ ):
+ self.target_sr = target_sr
+ self.method = method
+
+ def resample(
+ self,
+ audio: npt.NDArray[np.floating],
+ *,
+ orig_sr: float,
+ ) -> npt.NDArray[np.floating]:
+ if self.target_sr is None:
+ raise RuntimeError("Audio resampling is not supported when "
+ "`target_sr` is not provided")
+ if self.method == "librosa":
+ return resample_audio_librosa(audio,
+ orig_sr=orig_sr,
+ target_sr=self.target_sr)
+ elif self.method == "scipy":
+ return resample_audio_scipy(audio,
+ orig_sr=orig_sr,
+ target_sr=self.target_sr)
+ else:
+ raise ValueError(f"Invalid resampling method: {self.method}. "
+ "Supported methods are 'librosa' and 'scipy'.")
+
+
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index e6963bd8e158..9707b9cfcf8b 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -3,8 +3,8 @@
from abc import ABC, abstractmethod
from collections import UserDict
from collections.abc import Callable, Iterator, Mapping, Sequence
-from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar,
- Union)
+from typing import (TYPE_CHECKING, Any, Generic, Literal, NamedTuple, Optional,
+ TypeVar, Union)
import numpy as np
import torch
@@ -14,7 +14,7 @@
from vllm.utils import is_list_of
-from .audio import resample_audio
+from .audio import AudioResampler
from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
ImageItem, ModalityData, MultiModalDataDict,
MultiModalFieldConfig, MultiModalKwargs, VideoItem)
@@ -308,11 +308,18 @@ class MultiModalDataParser:
items to the model's expected sampling rate.
"""
- def __init__(self, *, target_sr: Optional[float] = None, resample_func: Optional[Callable] = None,) -> None:
+ def __init__(
+ self,
+ *,
+ target_sr: Optional[float] = None,
+ audio_resample_method: Literal["librosa", "scipy"] = "librosa",
+ ) -> None:
super().__init__()
- self.target_sr = target_sr
- self.audio_resampler = resample_audio if resample_func is None else resample_func
+ self.audio_resampler = AudioResampler(
+ target_sr=target_sr,
+ method=audio_resample_method,
+ )
def _is_embeddings(
self, data: object
@@ -375,15 +382,8 @@ def _parse_audio_data(
if orig_sr is None:
new_audio = audio
else:
- target_sr = self.target_sr
- if target_sr is None:
- raise RuntimeError(
- "Audio resampling is not supported when "
- "`target_sr` is not provided")
-
- new_audio = self.audio_resampler(audio,
- orig_sr=orig_sr,
- target_sr=target_sr)
+ new_audio = self.audio_resampler.resample(audio,
+ orig_sr=orig_sr)
new_audios.append(new_audio)
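
The new scipy path relies on `scipy.signal.resample_poly`, which only matches librosa's behaviour here when one sample rate is an integer multiple of the other (the `//` division silently truncates otherwise). A quick sketch of the common 48 kHz to 16 kHz case:

import numpy as np
import scipy.signal

orig_sr, target_sr = 48_000, 16_000
audio = np.random.randn(orig_sr).astype(np.float32)             # 1 s of noise
resampled = scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
assert resampled.shape[0] == target_sr                          # 1 s at 16 kHz

Model code opts into this path via `MultiModalDataParser(target_sr=..., audio_resample_method="scipy")`, as in the phi4mm hunk above.
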
From e755e6b9d40f230c8bb1103e90de12a456ad938d Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 3 Apr 2025 02:20:50 +0800
Subject: [PATCH 24/36] minor refactor audio encoder
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 51 ++-------------
vllm/model_executor/models/phi4mm_audio.py | 75 ++++++----------------
2 files changed, 23 insertions(+), 103 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 4c619c13aaca..759474e6b3ee 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -428,8 +428,8 @@ class Phi4MMImageEmbeddingInputs(TypedDict):
class Phi4MMAudioFeatureInputs(TypedDict):
type: Literal["audio_features"]
- data: Tuple[NestedTensors]
- """Shape: `((batch_size * num_audios, 80, M), )"""
+ data: Union[torch.Tensor, List[torch.Tensor]]
+ """Shape: `(batch_size * num_audios, 80, M)"""
class Phi4MMAudioEmbeddingInputs(TypedDict):
@@ -969,47 +969,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config.vocab_size, logit_scale)
self.sampler = get_sampler()
- def _audio_features_to_embeddings(
- self,
- input_ids: torch.Tensor,
- input_features: List[torch.Tensor],
- audio_input_sizes: torch.Tensor,
- audio_projection_mode: str,
- ) -> torch.Tensor:
- """
- Convert audio features to embeddings, which are used as input to the
- model (via `inputs_embeds`).
-
- Args:
- input_ids (torch.Tensor): Input IDs (the prompt in this case).
- input_features (list[torch.Tensor]): Input features (the audio
- embeddings).
- audio_input_sizes (list[torch.Tensor]): Audio input sizes (the
- audio embed lengths to use for padding the audio placeholder token
- in the input prompt IDs).
- """
- # The audio projection can either be a single linear or Sequential,
- # so handle both cases
- if isinstance(self.embed_tokens_extend.audio_projection,
- nn.Sequential):
- target_dtype = self.embed_tokens_extend.audio_projection[
- 0].bias.dtype
- else:
- target_dtype = self.embed_tokens_extend.audio_projection.bias.dtype
-
- audio_input = [
- input.unsqueeze(0).to(target_dtype) for input in input_features
- ]
- kwargs = {
- "wte": self.model.embed_tokens,
- 'audio_projection_mode': audio_projection_mode
- }
- audio_embeddings = self.embed_tokens_extend(input_ids, audio_input,
- audio_input_sizes,
- **kwargs)
- audio_embeddings = audio_embeddings.to(target_dtype)
- return audio_embeddings
-
def _parse_and_validate_audio_input(
self, **kwargs: object) -> Optional[Phi4MMAudioInputs]:
"""
@@ -1079,10 +1038,10 @@ def _process_audio_input(self, audio_input: Phi4MMAudioInputs,
dtype = next(self.embed_tokens_extend.parameters()).dtype
audio_embeds = [
- self.embed_tokens_extend.get_audio_features(
- features.unsqueeze(0).to(dtype),
+ self.embed_tokens_extend(
+ features.to(dtype),
audio_projection_mode=audio_projection_mode,
- ).squeeze(0) for features in audio_features
+ ) for features in audio_features
]
return audio_embeds
diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py
index db90848f9809..34a7a73d057a 100644
--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
@@ -1159,8 +1159,11 @@ def get_audio_features(
input_embeds: torch.FloatTensor,
audio_attention_mask: torch.Tensor = None,
audio_projection_mode: str = "speech",
- ):
-
+ ) -> torch.FloatTensor:
+ """
+ arguments:
+ input_embeds: audio features (B, T, D) B: num audios in a sequence
+ """
if self.freeze_audio_processor:
with torch.no_grad():
audio_features, masks = self.encoder(input_embeds,
@@ -1210,62 +1213,20 @@ def get_audio_features(
def forward(
self,
- input_ids: torch.LongTensor,
- input_embeds: torch.FloatTensor,
- audio_embed_sizes,
- **kwargs,
+ audio_features: torch.FloatTensor,
+ audio_attention_mask: torch.Tensor = None,
+ audio_projection_mode: str = "speech",
) -> torch.FloatTensor:
"""
arguments:
- input_ids: input text ids (B, U)
- input_embeds: audio features (B, T, D) B: num audios in a sequence
+ audio_features: audio features (T, D)
+
+ returns:
+ audio_embeds: audio embeddings (num_audio_tokens, hidden_dim)
"""
- assert input_embeds is not None and len(input_embeds) == len(
- audio_embed_sizes)
-
- input_shape = input_ids.size()
- input_ids = input_ids.view(-1, input_shape[-1])
-
- with torch.no_grad():
- positions = (input_ids == _AUDIO_PLACEHOLDER_TOKEN_ID).nonzero(
- as_tuple=False)
-
- if not isinstance(input_embeds, list):
- input_embeds = [input_embeds]
-
- audio_projection_mode = kwargs.get("audio_projection_mode", "speech")
- audio_set_tensor = [
- self.get_audio_features(
- input_embed, audio_projection_mode=audio_projection_mode)
- for input_embed in input_embeds
- ]
-
- with torch.no_grad():
- input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
-
- if "wte" in kwargs:
- # we use the token embedding layer from the huggingface model, this
- # is REQUIRED to make sure we are using the loaded weights.
- hidden_states = kwargs["wte"](input_ids)
- else:
- # otherwise, we use token embedding in pretrained mixformer from
- # phi team
- hidden_states = self.wte(input_ids)
-
- if len(positions.tolist()) > 0:
- assert sum(audio_embed_sizes) == len(
- positions
- ), "please ensure the encoder outputs have the same length as"\
- " defined in input_ids!"
- idx = 0
- for i in range(len(audio_embed_sizes)):
- cnt = audio_embed_sizes[i]
- assert audio_set_tensor[i].shape[0] == 1
- hidden_states[
- positions[idx, 0],
- positions[idx, 1]:positions[idx, 1] + cnt,
- ] = (audio_set_tensor[i][0, :audio_embed_sizes[i], :].to(
- hidden_states.dtype).to(hidden_states.device))
- idx += cnt
-
- return hidden_states
+ audio_embeds = self.get_audio_features(
+ audio_features.unsqueeze(0),
+ audio_attention_mask=audio_attention_mask,
+ audio_projection_mode=audio_projection_mode,
+ )
+ return audio_embeds.squeeze(0)
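
With this refactor the audio embedding module's `forward` handles one audio clip at a time: it takes `(T, D)` features, adds a batch dim, runs the existing `get_audio_features`, and strips the batch dim again. A toy stand-in (not the real encoder, which also compresses the time axis) to show the contract:

import torch
import torch.nn as nn


class ToyAudioEmbedding(nn.Module):

    def __init__(self, feat_dim: int = 80, hidden_dim: int = 16):
        super().__init__()
        self.proj = nn.Linear(feat_dim, hidden_dim)

    def get_audio_features(self, x: torch.Tensor) -> torch.Tensor:
        # Batched path: (B, T, D) -> (B, T, hidden)
        return self.proj(x)

    def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
        # Same unsqueeze/squeeze wrapping as the new AudioEmbedding.forward.
        return self.get_audio_features(audio_features.unsqueeze(0)).squeeze(0)


embeds = ToyAudioEmbedding()(torch.randn(25, 80))
assert embeds.shape == (25, 16)
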
From 5714c181d542cc6242ddff82701c4c0c33d527bb Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Thu, 3 Apr 2025 16:15:41 +0800
Subject: [PATCH 25/36] increase test max_model_len
Signed-off-by: Isotr0py <2037008807@qq.com>
---
tests/models/decoder_only/vision_language/test_phi4mm.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py b/tests/models/decoder_only/vision_language/test_phi4mm.py
index c3e88b60978a..94ddb734b8f2 100644
--- a/tests/models/decoder_only/vision_language/test_phi4mm.py
+++ b/tests/models/decoder_only/vision_language/test_phi4mm.py
@@ -155,7 +155,7 @@ def run_test(
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [4096])
+@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@@ -198,7 +198,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.xfail(
From d3dd9e0e1d633834a8c34f5184734f4138886a96 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 4 Apr 2025 00:15:54 +0800
Subject: [PATCH 26/36] add processor tests
Signed-off-by: Isotr0py <2037008807@qq.com>
---
tests/models/multimodal/processing/test_common.py | 1 +
vllm/model_executor/models/phi4mm.py | 1 -
2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index fdcd7a9e1738..51b961785c3a 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -271,6 +271,7 @@ def _test_processing_correctness_mistral(
"nvidia/NVLM-D-72B",
"google/paligemma-3b-mix-224",
"google/paligemma2-3b-ft-docci-448",
+ "microsoft/Phi-4-multimodal-instruct",
"mistralai/Pixtral-12B-2409",
"mistral-community/pixtral-12b",
"Qwen/Qwen-VL-Chat",
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 759474e6b3ee..71b7cba56fdb 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -815,7 +815,6 @@ def _get_mm_fields_config(
num_img_tokens=MultiModalFieldConfig.batched("image"),
input_audio_embeds=MultiModalFieldConfig.batched("audio"),
audio_embed_sizes=MultiModalFieldConfig.batched("audio"),
- audio_attention_mask=MultiModalFieldConfig.batched("audio"),
)
def _get_prompt_updates(
From 5a505b8634847abba6aef2b192d42c1b0dc341dc Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Fri, 4 Apr 2025 00:30:55 +0800
Subject: [PATCH 27/36] revert unnecessary changes
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/model_executor/models/phi4mm.py | 13 +------------
1 file changed, 1 insertion(+), 12 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 71b7cba56fdb..435e30e3166b 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -814,7 +814,6 @@ def _get_mm_fields_config(
image_sizes=MultiModalFieldConfig.batched("image"),
num_img_tokens=MultiModalFieldConfig.batched("image"),
input_audio_embeds=MultiModalFieldConfig.batched("audio"),
- audio_embed_sizes=MultiModalFieldConfig.batched("audio"),
)
def _get_prompt_updates(
@@ -982,23 +981,13 @@ def _parse_and_validate_audio_input(
Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs.
"""
audio_features = kwargs.pop("input_audio_embeds", None)
- audio_embed_sizes = kwargs.pop("audio_embed_sizes", None)
audio_embeds = kwargs.pop("audio_embeds", None)
if audio_features is None and audio_embeds is None:
return None
if audio_features is not None:
- assert isinstance(audio_embed_sizes, torch.Tensor)
- if isinstance(audio_features, torch.Tensor):
- assert audio_features.size(0) == len(audio_embed_sizes), (
- "audio_features and audio_embed_sizes "
- "must have the same length")
- elif is_list_of(audio_features, (torch.Tensor, list)):
- assert len(audio_features) == len(audio_embed_sizes), (
- "audio_features and audio_embed_sizes "
- "must have the same length")
- else:
+ if not isinstance(audio_features, (torch.Tensor, list)):
raise ValueError("Incorrect type of audio features. "
f"Got type: {type(audio_features)}")
From b40b458f0e052a03b74f9578eeef35ef35d3d166 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 6 Apr 2025 12:52:40 +0800
Subject: [PATCH 28/36] add scipy to doc requirement
Signed-off-by: Isotr0py <2037008807@qq.com>
---
requirements/docs.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/requirements/docs.txt b/requirements/docs.txt
index 416ca503b36c..99fb87def6dd 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -18,6 +18,7 @@ transformers
mistral_common >= 1.5.4
aiohttp
starlette
+scipy
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
From 4f3049d859ce04dc1c5f2f3e8aadef16ea23de02 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 6 Apr 2025 18:08:29 +0800
Subject: [PATCH 29/36] fix doc build
Signed-off-by: Isotr0py <2037008807@qq.com>
---
vllm/multimodal/audio.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 0359a1324ef1..70a912c9c9ef 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -6,7 +6,6 @@
import numpy as np
import numpy.typing as npt
-import scipy.signal
from vllm.inputs.registry import InputContext
from vllm.utils import PlaceholderModule
@@ -59,6 +58,9 @@ def resample_audio_scipy(
orig_sr: float,
target_sr: float,
):
+ # lazy import scipy.signal, otherwise it will crash the doc build.
+ import scipy.signal
+
if orig_sr > target_sr:
return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr)
elif orig_sr < target_sr:
From 6cce3fe2dee0ce901df746baa29f5f5aa541760a Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Tue, 8 Apr 2025 22:35:53 +0800
Subject: [PATCH 30/36] init vision speech test
Signed-off-by: Isotr0py <2037008807@qq.com>
---
.../vision_language/test_phi4mm.py | 88 +++++++++++++++++--
1 file changed, 79 insertions(+), 9 deletions(-)
diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py b/tests/models/decoder_only/vision_language/test_phi4mm.py
index 94ddb734b8f2..e1d88c24acb2 100644
--- a/tests/models/decoder_only/vision_language/test_phi4mm.py
+++ b/tests/models/decoder_only/vision_language/test_phi4mm.py
@@ -2,18 +2,22 @@
import os
import re
+from collections.abc import Sequence
from typing import Optional
import pytest
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
-from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
+ PromptImageInput, VllmRunner)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
@@ -29,6 +33,7 @@
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
+speech_lora_path = os.path.join(model_path, "speech-lora")
models = [model_path]
@@ -64,7 +69,8 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str,
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
- inputs: list[tuple[list[str], PromptImageInput]],
+ inputs: Sequence[tuple[list[str], PromptImageInput,
+ Optional[PromptAudioInput]]],
model: str,
*,
max_model_len: int,
@@ -105,27 +111,52 @@ def run_test(
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_model.model.llm_engine.add_lora(lora_request=lora_request)
+ if any(audios is not None for _, _, audios in inputs):
+ lora_request = LoRARequest("speech", 2, speech_lora_path)
+ vllm_model.model.llm_engine.add_lora(lora_request=lora_request)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
- images=images)
- for prompts, images in inputs
+ images=images,
+ audios=audios)
+ for prompts, images, audios in inputs
]
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
hf_model_kwargs = {"_attn_implementation": "eager"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
- eos_token_id = hf_model.processor.tokenizer.eos_token_id
+
+ hf_processor = hf_model.processor
+ eos_token_id = hf_processor.tokenizer.eos_token_id
+
+ def patch_hf_processor(*args,
+ text="",
+ images=None,
+ audio=None,
+ sampling_rate=None,
+ **kwargs):
+ audios = None
+ if audio is not None and sampling_rate is not None:
+ audios = [(audio, sampling_rate)]
+ return hf_processor(*args,
+ text=text,
+ images=images,
+ audios=audios,
+ **kwargs)
+
+ hf_model.processor = patch_hf_processor
+
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
+ audios=audios,
eos_token_id=eos_token_id,
num_logits_to_keep=0)
- for prompts, images in inputs
+ for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
@@ -166,6 +197,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
+ None,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
run_test(
@@ -209,9 +241,12 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
- ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
- [[rescale_image_size(image, factor) for image in images]
- for factor in size_factors])
+ (
+ [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+ [[rescale_image_size(image, factor) for image in images]
+ for factor in size_factors],
+ None,
+ ),
]
run_test(
@@ -226,3 +261,38 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
mm_limit=2,
tensor_parallel_size=1,
)
+
+
+# FIXME(Isotr0py): This test can't pass yet.
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [12800])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
+ max_model_len: int, max_tokens: int,
+ num_logprobs: int) -> None:
+
+ audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
+ image = ImageAsset("stop_sign").pil_image.convert("RGB")
+
+ inputs_vision_speech = [
+ (
+ ["<|user|><|image_1|><|audio_1|><|end|><|assistant|>"],
+ [image],
+ [audio],
+ ),
+ ]
+
+ run_test(
+ hf_runner,
+ vllm_runner,
+ inputs_vision_speech,
+ model,
+ dtype=dtype,
+ max_model_len=max_model_len,
+ max_tokens=max_tokens,
+ num_logprobs=num_logprobs,
+ mm_limit=1,
+ tensor_parallel_size=1,
+ )
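
Outside the test harness, the same vision-plus-speech request can be sanity-checked against the HF processor alone. The sketch below is illustrative rather than authoritative: it uses dummy inputs, the model id from the processor tests, and the `audios=[(audio, sr)]` convention visible in the patched test.

import numpy as np
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)

image = Image.new("RGB", (448, 448))                      # dummy image
sr = 16_000
audio = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)      # 1 s, 440 Hz tone

prompt = "<|user|><|image_1|><|audio_1|><|end|><|assistant|>"
inputs = processor(text=prompt, images=[image], audios=[(audio, sr)],
                   return_tensors="pt")
print(sorted(inputs.keys()))   # expect input_ids, input_image_embeds, input_audio_embeds, ...
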
From a54dae3811624d37b11ee45025f6b504006450cb Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Tue, 8 Apr 2025 23:53:40 +0800
Subject: [PATCH 31/36] make vision speech test pass
Signed-off-by: Isotr0py <2037008807@qq.com>
---
.../models/decoder_only/vision_language/test_phi4mm.py | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py b/tests/models/decoder_only/vision_language/test_phi4mm.py
index e1d88c24acb2..b6bb01e002ac 100644
--- a/tests/models/decoder_only/vision_language/test_phi4mm.py
+++ b/tests/models/decoder_only/vision_language/test_phi4mm.py
@@ -5,11 +5,11 @@
from collections.abc import Sequence
from typing import Optional
+import librosa
import pytest
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
-from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
@@ -34,6 +34,8 @@
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_lora_path = os.path.join(model_path, "speech-lora")
+speech_question = os.path.join(model_path, "examples",
+ "what_is_shown_in_this_image.wav")
models = [model_path]
@@ -263,7 +265,6 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
)
-# FIXME(Isotr0py): This test can't pass yet.
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_model_len", [12800])
@@ -273,8 +274,9 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
- audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
- image = ImageAsset("stop_sign").pil_image.convert("RGB")
+ # use the example speech question so that the model outputs are reasonable
+ audio = librosa.load(speech_question, sr=None)
+ image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
inputs_vision_speech = [
(
From 516d9da20bdfb712b135525cb68789a6424e98df Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Wed, 9 Apr 2025 17:55:42 +0800
Subject: [PATCH 32/36] fix ultravox test import
Signed-off-by: Isotr0py <2037008807@qq.com>
---
tests/models/decoder_only/audio_language/test_ultravox.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index a843e41aa26e..449b93b6fdcc 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -7,7 +7,7 @@
import pytest_asyncio
from transformers import AutoModel, AutoTokenizer
-from vllm.multimodal.audio import resample_audio
+from vllm.multimodal.audio import resample_audio_librosa
from vllm.sequence import SampleLogprobs
from ....conftest import HfRunner, VllmRunner
@@ -135,9 +135,9 @@ def run_test(
[hf_prompt],
max_tokens,
num_logprobs=num_logprobs,
- audios=[(resample_audio(audio[0],
- orig_sr=audio[1],
- target_sr=16000), 16000)])
+ audios=[(resample_audio_librosa(audio[0],
+ orig_sr=audio[1],
+ target_sr=16000), 16000)])
for _, hf_prompt, audio in prompts_and_audios
]
From faa14d5ec7da15d2f08296a577af44ee3fa6fb6e Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 9 Apr 2025 14:30:44 +0000
Subject: [PATCH 33/36] Fix online inference
Signed-off-by: DarkLight1337
---
vllm/entrypoints/chat_utils.py | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 11c759a6174e..9ddc6d8d9346 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -480,11 +480,8 @@ def _placeholder_str(self, modality: ModalityStr,
if modality in ("image", "image_embeds"):
if model_type == "chatglm":
return "<|begin_of_image|><|endoftext|><|end_of_image|>"
- if model_type == "phi3_v":
- # Workaround since this token is not defined in the tokenizer
+ if model_type in ("phi3_v", "phi4mm"):
return f"<|image_{current_count}|>"
- if model_type == "phi4mm":
- return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
if model_type in ("minicpmo", "minicpmv"):
return "(./)"
if model_type in ("blip-2", "florence2", "fuyu", "paligemma",
@@ -516,7 +513,7 @@ def _placeholder_str(self, modality: ModalityStr,
if model_type == "ultravox":
return "<|audio|>"
if model_type == "phi4mm":
- return "<|endoftext11|>" # 200011 (see vocab.json in hf model)
+ return f"<|audio_{current_count}|>"
if model_type == "qwen2_audio":
return (f"Audio {current_count}: "
f"<|audio_bos|><|AUDIO|><|audio_eos|>")
From 5ddf5746cab538721d920ed490d528a1338a439d Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Mon, 14 Apr 2025 22:52:20 +0800
Subject: [PATCH 34/36] expose dynamic_hd
Signed-off-by: Isotr0py <2037008807@qq.com>
---
examples/offline_inference/vision_language.py | 2 +
.../vision_language_multi_image.py | 2 +
.../multimodal/processing/test_phi4mm.py | 59 +++++++++++++++++++
vllm/model_executor/models/phi3v.py | 2 +-
vllm/model_executor/models/phi4mm.py | 36 ++++++++---
5 files changed, 92 insertions(+), 9 deletions(-)
create mode 100644 tests/models/multimodal/processing/test_phi4mm.py
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 20b243fc3ccb..51a44fb21cda 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -793,6 +793,8 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
+ # Note - mm_processor_kwargs can also be passed to generate/chat calls
+ mm_processor_kwargs={"dynamic_hd": 16},
)
return ModelRequestData(
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 3547afd3019a..981d15d1415e 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -470,6 +470,8 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
max_lora_rank=320,
+ # Note - mm_processor_kwargs can also be passed to generate/chat calls
+ mm_processor_kwargs={"dynamic_hd": 4},
)
placeholders = "".join(f"<|image_{i}|>"
diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py
new file mode 100644
index 000000000000..797986adba4a
--- /dev/null
+++ b/tests/models/multimodal/processing/test_phi4mm.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for phi4mm's multimodal preprocessing kwargs."""
+import pytest
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"])
+# yapf: disable
+@pytest.mark.parametrize(
+ ("mm_processor_kwargs", "expected_toks_per_img"),
+ [
+ ({"dynamic_hd": 4}, 1329),
+ ({"dynamic_hd": 16}, 4433),
+ # the default dynamic_hd of phi-4-multimodal is 36
+ ({}, 9585),
+ ])
+# yapf: enable
+@pytest.mark.parametrize("num_imgs", [1, 2])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
+def test_processor_override(
+ image_assets: _ImageAssets,
+ model_id: str,
+ mm_processor_kwargs: dict[str, int],
+ expected_toks_per_img: int,
+ num_imgs: int,
+ kwargs_on_init: bool,
+):
+ """Ensure Phi4MMMultiModalProcessor handles dynamic_hd properly."""
+ # Avoid initializing CUDA early
+ from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID
+
+ ctx = build_model_context(
+ model_id,
+ mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
+ limit_mm_per_prompt={"image": num_imgs},
+ )
+ processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+ hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
+
+ # Build the image str / prompt based on the number of images we pass
+ img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
+ prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
+
+ image_size = ctx.get_hf_config(
+ ).embd_layer["image_embd_layer"]["crop_size"]
+ dummy_image_size = (image_size * 7, image_size * 7)
+ dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
+ mm_data = {"image": [dummy_image] * num_imgs}
+
+ processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
+
+ # Ensure we have the right number of placeholders for each dynamic_hd setting
+ img_tok_count = processed_inputs["prompt_token_ids"].count(
+ _IMAGE_PLACEHOLDER_TOKEN_ID)
+ assert img_tok_count == expected_toks_per_img * num_imgs
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7f41ad2359df..5b43871b7591 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -327,7 +327,7 @@ def get_num_image_tokens(
*,
image_width: int,
image_height: int,
- processor: Optional[ProcessorMixin],
+ processor: Optional[ProcessorMixin] = None,
) -> int:
if processor is None:
processor = self.get_hf_processor()
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 2da55f577b8f..1a56e6826a24 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -6,7 +6,7 @@
import numpy as np
import torch
import torch.nn as nn
-from transformers import (BatchFeature, PretrainedConfig,
+from transformers import (BatchFeature, PretrainedConfig, ProcessorMixin,
SequenceFeatureExtractor, SiglipVisionConfig)
from vllm.config import VllmConfig
@@ -48,7 +48,6 @@
SIGLIP_NAME = "siglip-so400m-patch14-448"
VISION_ENCODER_TO_PROCESSING_CONFIG = {
'siglip-so400m-patch14-448': {
- 'dynamic_hd': 16,
'vit_image_size': 448,
'vit_patch_size': 14,
'token_compression_factor': 2,
@@ -470,6 +469,17 @@ def cat_with_pad(tensors, dim, padding_value=0):
class Phi4MMProcessingInfo(BaseProcessingInfo):
+ def get_hf_processor(
+ self,
+ *,
+ dynamic_hd: Optional[int] = None,
+ **kwargs: object,
+ ) -> ProcessorMixin:
+ if dynamic_hd is not None:
+ kwargs["dynamic_hd"] = dynamic_hd
+
+ return self.ctx.get_hf_processor(**kwargs)
+
@property
def image_tokens(self) -> list[str]:
return [f"<|image_{i+1}|>" for i in range(100)]
@@ -478,9 +488,13 @@ def image_tokens(self) -> list[str]:
def audio_tokens(self) -> list[str]:
return [f"<|audio_{i+1}|>" for i in range(100)]
- @property
- def dynamic_hd(self) -> int:
- image_processor = self.get_hf_processor().image_processor
+ def get_dynamic_hd(
+ self,
+ processor: Optional[ProcessorMixin] = None,
+ ) -> int:
+ if processor is None:
+ processor = self.get_hf_processor()
+ image_processor = processor.image_processor
return image_processor.dynamic_hd
def get_feature_extractor(self) -> SequenceFeatureExtractor:
@@ -632,6 +646,7 @@ def get_num_image_tokens(
*,
image_width: int,
image_height: int,
+ processor: Optional[ProcessorMixin] = None,
) -> int:
hf_config = self.get_hf_config()
vision_encoder_name = hf_config.img_processor
@@ -643,7 +658,7 @@ def get_num_image_tokens(
vit_patch_size = prepro_config['vit_patch_size']
token_compression_factor = prepro_config['token_compression_factor']
- dynamic_hd_size = self.dynamic_hd
+ dynamic_hd_size = self.get_dynamic_hd(processor=processor)
image_num_tokens = self._compute_num_image_tokens(
image_width,
@@ -656,7 +671,10 @@ def get_num_image_tokens(
return image_num_tokens
- def get_image_size_with_most_features(self) -> ImageSize:
+ def get_image_size_with_most_features(
+ self,
+ processor: Optional[ProcessorMixin] = None,
+ ) -> ImageSize:
hf_config = self.get_hf_config()
vision_encoder_name = hf_config.img_processor
if vision_encoder_name is None:
@@ -665,7 +683,7 @@ def get_image_size_with_most_features(self) -> ImageSize:
vision_encoder_name]
vit_image_size = prepro_config['vit_image_size']
- max_side = vit_image_size * self.dynamic_hd
+ max_side = vit_image_size * self.get_dynamic_hd(processor=processor)
return ImageSize(height=max_side, width=vit_image_size)
def get_audio_num_frames(self, audio_len: int, sr: float) -> int:
@@ -825,6 +843,7 @@ def _get_prompt_updates(
image_tokens: list[str] = self.info.image_tokens # type: ignore
audio_tokens: list[str] = self.info.audio_tokens # type: ignore
feature_extractor = self.info.get_feature_extractor()
+ hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
def get_image_replacement_phi4mm(item_idx: int):
images = mm_items.get_items(
@@ -837,6 +856,7 @@ def get_image_replacement_phi4mm(item_idx: int):
num_image_tokens = self.info.get_num_image_tokens(
image_width=image_size.width,
image_height=image_size.height,
+ processor=hf_processor,
)
image_tokens = [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens
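With patch 34 exposing dynamic_hd as a processor kwarg, the crop budget can be tuned either when the engine is built or per request. A rough usage sketch, assuming the standard vLLM offline API; the image path and prompt are illustrative, and the LoRA flags from the example files are omitted for brevity:

    from PIL import Image

    from vllm import LLM, SamplingParams

    llm = LLM(
        model="microsoft/Phi-4-multimodal-instruct",
        trust_remote_code=True,
        max_model_len=5120,
        max_num_seqs=2,
        # engine-level default, as in the example scripts above
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={"image": 1},
    )

    image = Image.open("example.jpg")  # illustrative path
    outputs = llm.generate(
        {
            "prompt": "<|user|>\n<|image_1|>\nDescribe the image.<|end|>\n<|assistant|>\n",
            "multi_modal_data": {"image": image},
            # per-request override, per the note in the example comments
            "mm_processor_kwargs": {"dynamic_hd": 4},
        },
        SamplingParams(max_tokens=64),
    )
    print(outputs[0].outputs[0].text)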
From e9724c89deef3e70e8d1634fe3b7e73949f410d5 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Tue, 15 Apr 2025 13:56:03 +0800
Subject: [PATCH 35/36] reduce max_model_len in example to fit single gpu
Signed-off-by: Isotr0py <2037008807@qq.com>
---
examples/offline_inference/vision_language.py | 2 +-
examples/offline_inference/vision_language_multi_image.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 1a1118b386a6..80bf5255f32a 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -814,7 +814,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=12800,
+ max_model_len=5120,
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index a2b57e52eb84..976943dff7a9 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -504,7 +504,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
- max_model_len=25600,
+ max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enable_lora=True,
From 11be4863af3b8deeac2dbf70f130655047d4d131 Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sat, 19 Apr 2025 15:08:30 +0800
Subject: [PATCH 36/36] update profiler and fix ultravox tests
Signed-off-by: Isotr0py <2037008807@qq.com>
---
examples/offline_inference/vision_language.py | 1 +
.../audio_language/test_ultravox.py | 19 ++++++++----
vllm/model_executor/models/phi4mm.py | 30 +++++++++++--------
3 files changed, 33 insertions(+), 17 deletions(-)
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 6b31eee95af4..bd7035b7615a 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -816,6 +816,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True,
max_model_len=5120,
max_num_seqs=2,
+ max_num_batched_tokens=12800,
enable_lora=True,
max_lora_rank=320,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index 9adf51015817..e9dcba8ec089 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
import json
-from typing import Optional
+from typing import Any, Optional
import numpy as np
import pytest
@@ -43,6 +43,18 @@ def audio(request):
return AudioAsset(request.param)
+def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
+ """Convert kwargs to CLI args."""
+ args = []
+ for key, value in params_kwargs.items():
+ if isinstance(value, bool):
+ if value:
+ args.append(f"--{key.replace('_','-')}")
+ else:
+ args.append(f"--{key.replace('_','-')}={value}")
+ return args
+
+
@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
@@ -52,10 +64,7 @@ def server(request, audio_assets):
"--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
- ] + [
- f"--{key.replace('_','-')}={value}"
- for key, value in request.param.items()
- ]
+ ] + params_kwargs_to_cli_args(request.param)
with RemoteOpenAIServer(MODEL_NAME,
args,
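The params_kwargs_to_cli_args helper above is what fixes the parametrized server fixture: boolean kwargs such as enable_chunked_prefill have to reach the server as bare flags, since passing --enable-chunked-prefill=True presumably trips the store_true-style option. A standalone illustration, with the helper repeated so the snippet runs on its own and example kwargs that are assumed rather than the test's exact CHUNKED_PREFILL_KWARGS:

    from typing import Any


    def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
        """Same logic as the helper added above, repeated here for a runnable snippet."""
        args: list[str] = []
        for key, value in params_kwargs.items():
            if isinstance(value, bool):
                if value:
                    args.append(f"--{key.replace('_', '-')}")
            else:
                args.append(f"--{key.replace('_', '-')}={value}")
        return args


    # Booleans become bare flags; False booleans are dropped entirely.
    print(params_kwargs_to_cli_args(
        {"enable_chunked_prefill": True, "max_num_batched_tokens": 16384}))
    # -> ['--enable-chunked-prefill', '--max-num-batched-tokens=16384']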
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 1a56e6826a24..cdd762f5fec3 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -20,15 +20,15 @@
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
- NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+ MultiModalKwargs, NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
ImageProcessorItems, ImageSize,
MultiModalDataItems, MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement,
PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of
@@ -747,14 +747,26 @@ def _compute_audio_embed_size(self, audio_frames: int) -> int:
class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
- def get_dummy_processor_inputs(
+ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+ num_audios = mm_counts.get("audio", 0)
+ num_images = mm_counts.get("image", 0)
+
+ image_tokens: list[str] = self.info.image_tokens[:num_images]
+ audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
+
+ return "".join(image_tokens + audio_tokens)
+
+ def get_dummy_mm_data(
self,
seq_len: int,
mm_counts: Mapping[str, int],
- ) -> ProcessorInputs:
+ ) -> MultiModalDataDict:
num_audios = mm_counts.get("audio", 0)
num_images = mm_counts.get("image", 0)
target_width, target_height = \
self.info.get_image_size_with_most_features()
@@ -768,13 +780,7 @@ def get_dummy_processor_inputs(
num_audios=num_audios),
}
- image_tokens: list[str] = self.info.image_tokens[:num_images]
- audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
-
- return ProcessorInputs(
- prompt_text="".join(image_tokens + audio_tokens),
- mm_data=mm_data,
- )
+ return mm_data
class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):