From ae475194d39104248c081a6ba0cbd7e5af5d2202 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 25 Mar 2025 23:24:30 +0800 Subject: [PATCH 01/36] init phi4mm multimodal processor Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 314 +++++++++++++++++++++++++-- 1 file changed, 290 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 3d4505d556e2..37a0c30c9bbc 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import math import re +from collections.abc import Iterable, Mapping, Sequence from functools import lru_cache -from typing import (Dict, Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union) import numpy as np @@ -11,7 +12,7 @@ import torch.nn as nn import torchvision.transforms as T from PIL import Image -from transformers import PretrainedConfig, SiglipVisionConfig +from transformers import PretrainedConfig, SiglipVisionConfig, ProcessorMixin, BatchFeature from transformers.utils import logging from vllm.config import VllmConfig @@ -28,7 +29,14 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalInputs, NestedTensors +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors, MultiModalInputs) +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems, AudioEmbeddingItems, AudioProcessorItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config @@ -121,8 +129,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, return best_ratio -def _find_target_aspect_ratio(image, image_size, max_num, min_num): - orig_width, orig_height = image.size +def _find_target_aspect_ratio(orig_width: int, orig_height: int, image_size: int, max_num: int, min_num: int,): w_crop_num = math.ceil(orig_width / float(image_size)) h_crop_num = math.ceil(orig_height / float(image_size)) @@ -150,8 +157,7 @@ def _find_target_aspect_ratio(image, image_size, max_num, min_num): return target_aspect_ratio, target_height, target_width -def _get_padding_size(image, target_height, target_width): - orig_width, orig_height = image.size +def _get_padding_size(orig_width: int, orig_height: int, target_height: int, target_width: int): ratio_width = target_width / orig_width ratio_height = target_height / orig_height @@ -169,14 +175,14 @@ def dynamic_preprocess(image, max_num=12, image_size=384, mask_size=27): + orig_width, orig_height = image.size target_aspect_ratio, target_height, target_width =\ _find_target_aspect_ratio( - image, image_size, max_num, min_num) + orig_width, orig_height, image_size, max_num, min_num) padding_height, padding_width = _get_padding_size(image, target_height, target_width) # Calculate the ratio - orig_width, orig_height = image.size ratio_width = target_width / orig_width ratio_height = 
target_height / orig_height if ratio_width < ratio_height: @@ -858,8 +864,14 @@ def audio_feature_extractor() -> LogFbankProcessor: return LogFbankProcessor() -def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size, - vit_patch_size, token_compression_factor): +def _compute_num_image_tokens( + orig_width: int, + orig_height: int, + dynamic_hd_size: int, + vit_image_size: int, + vit_patch_size: int, + token_compression_factor: int = 2, +): """ compute the number of tokens an image is expected to take up considering the image encoder architecture and exclude output features containing @@ -876,7 +888,8 @@ def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size, "token_compression_factor" target_aspect_ratio, target_height, target_width = ( - _find_target_aspect_ratio(image, + _find_target_aspect_ratio(orig_width, + orig_height, vit_image_size, dynamic_hd_size, min_num=1)) @@ -889,7 +902,7 @@ def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size, assert (target_height % vit_image_size == 0 and target_width % vit_image_size == 0) - padding_height, padding_width = _get_padding_size(image, target_height, + padding_height, padding_width = _get_padding_size(orig_width, orig_height, target_height, target_width) assert padding_width == 0 or padding_height == 0, \ "padding_width or padding_height must be 0" @@ -1218,7 +1231,7 @@ def input_processor_for_phi4mm(ctx: InputContext, ) -def _compute_audio_embed_size(hf_config, audio_frames): +def _compute_audio_embed_size(hf_config: PretrainedConfig, audio_frames: int) -> int: """ Compute the audio embedding size based on the audio frames and compression rate. @@ -1423,16 +1436,269 @@ def cat_with_pad(tensors, dim, padding_value=0): return output -@MULTIMODAL_REGISTRY.register_input_mapper("audio", - input_mapper_for_phi4mm_audio) -@MULTIMODAL_REGISTRY.register_input_mapper("image", - input_mapper_for_phi4mm_image) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "audio", get_max_phi4mm_audio_tokens) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "image", get_max_phi4mm_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm) -@INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm) +class Phi4MMProcessingInfo(BaseProcessingInfo): + + @property + def image_tokens(self) -> list[str]: + return [f"<|image_{i+1}|>" for i in range(100)] + + @property + def audio_tokens(self) -> list[str]: + return [f"<|audio_{i+1}|>" for i in range(100)] + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "audio": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "audio": self.get_max_audio_tokens(), + } + + def get_max_audio_tokens(self) -> int: + return 10000 + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + return self.get_num_image_tokens( + image_width=target_width, image_height=target_height) + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + vision_encoder_name = hf_config.img_processor + if vision_encoder_name is None: + vision_encoder_name = SIGLIP_NAME + prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] + dynamic_hd_size = prepro_config['dynamic_hd'] + vit_image_size = prepro_config['vit_image_size'] + vit_patch_size = 
prepro_config['vit_patch_size'] + token_compression_factor = prepro_config['token_compression_factor'] + + image_num_tokens = _compute_num_image_tokens( + image_width, image_height, + dynamic_hd_size=dynamic_hd_size, + vit_image_size=vit_image_size, + vit_patch_size=vit_patch_size, + token_compression_factor=token_compression_factor, + ) + + return image_num_tokens + + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + vision_encoder_name = hf_config.img_processor + if vision_encoder_name is None: + vision_encoder_name = SIGLIP_NAME + prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] + dynamic_hd_size = prepro_config['dynamic_hd'] + vit_image_size = prepro_config['vit_image_size'] + + max_side = vit_image_size * dynamic_hd_size + return ImageSize(height=max_side, width=vit_image_size) + + def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size, + vit_patch_size, token_compression_factor): + """ + compute the number of tokens an image is expected to take up considering + the image encoder architecture and exclude output features containing + only padding pixels + + for siglip, vit_image_size=448, vit_patch_size=14, so output will be + 32x32 feature map + NOTE right now, Phi4MM uses hard-coded token_compression_factor=2 + """ + assert vit_image_size % vit_patch_size == 0, \ + "vit_image_size must be divisible by vit_patch_size" + assert vit_image_size // vit_patch_size % token_compression_factor == 0, \ + "vit_image_size // vit_patch_size must be divisible by "\ + "token_compression_factor" + + target_aspect_ratio, target_height, target_width = ( + _find_target_aspect_ratio(image, + vit_image_size, + dynamic_hd_size, + min_num=1)) + assert target_aspect_ratio[ + 0] * vit_image_size == target_width, \ + f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}" + assert target_aspect_ratio[ + 1] * vit_image_size == target_height, \ + f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}" + assert (target_height % vit_image_size == 0 + and target_width % vit_image_size == 0) + + padding_height, padding_width = _get_padding_size(image, target_height, + target_width) + assert padding_width == 0 or padding_height == 0, \ + "padding_width or padding_height must be 0" + + target_feat_width = target_width // vit_patch_size + target_feat_height = target_height // vit_patch_size + if padding_width >= vit_patch_size: + assert padding_height == 0, "padding_height not 0" + non_pad_feat_width = target_feat_width - math.floor( + padding_width / vit_patch_size) + non_pad_feat_height = target_feat_height + elif padding_height >= vit_patch_size: + assert padding_width == 0, "padding_width not 0" + non_pad_feat_height = target_feat_height - math.floor( + padding_height / vit_patch_size) + non_pad_feat_width = target_feat_width + else: + # small padding shorter than a vit patch + non_pad_feat_width = target_feat_width + non_pad_feat_height = target_feat_height + + feat_width = non_pad_feat_width // token_compression_factor + feat_height = non_pad_feat_height // token_compression_factor + # NOTE it's possible that the non-padding feature is not divisible + if non_pad_feat_width % token_compression_factor != 0: + feat_width += 1 + if non_pad_feat_height % token_compression_factor != 0: + feat_height += 1 + num_hd_patch_tokens = feat_width * feat_height + num_hd_newline_tokens = feat_height + vit_feature_size = vit_image_size // vit_patch_size + num_global_image_tokens = (vit_feature_size // 
token_compression_factor)**2 + num_sep_tokens = 1 + num_global_image_newline_tokens = \ + vit_feature_size // token_compression_factor + + return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens + + num_hd_newline_tokens + num_global_image_newline_tokens) + + +class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_audios = mm_counts.get("audio", 0) + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, + num_audios=num_audios), + } + + image_tokens: list[str] = self.info.image_tokens[:num_images] + audio_tokens: list[str] = self.info.audio_tokens[:num_audios] + + return ProcessorInputs( + prompt_text="".join(image_tokens + audio_tokens), + mm_data=mm_data, + ) + + +class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_tokens: list[str] = self.info.image_tokens # type: ignore + audio_tokens: list[str] = self.info.audio_tokens # type: ignore + + tokenizer = self.info.get_tokenizer() + bos_token_id = tokenizer.bos_token_id + assert isinstance(bos_token_id, int) + + def get_image_replacement_phi4mm(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + processor=hf_processor, + ) + + image_tokens = [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens + + return image_tokens + + def get_audio_replacement_phi4mm(item_idx: int): + audios = mm_items.get_items("audio", AudioProcessorItems) + # TODO(Isotr0py): support embedding inputs + audio_len = audios.get_audio_length(item_idx) + audio_frames, _ = compute_logfbank_output_size(audio_len, DUMMY_SAMPLING_FREQUENCY) + audio_embed_size = _compute_audio_embed_size(self.info.get_hf_config(), audio_frames) + + audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size + + return audio_tokens + + num_images = mm_items.get_count("image", strict=False) + num_audios = mm_items.get_count("audio", strict=False) + + image_repl = [ + PromptReplacement( + modality="image", + target=image_token, + replacement=get_image_replacement_phi4mm, + ) for image_token in image_tokens[:num_images] + ] + audio_repl = [ + PromptReplacement( + modality="image", + target=audio_token, + replacement=get_audio_replacement_phi4mm, + ) for audio_token in audio_tokens[:num_audios] + ] + return image_repl + audio_repl + + +# 
@MULTIMODAL_REGISTRY.register_input_mapper("audio", +# input_mapper_for_phi4mm_audio) +# @MULTIMODAL_REGISTRY.register_input_mapper("image", +# input_mapper_for_phi4mm_image) +# @MULTIMODAL_REGISTRY.register_max_multimodal_tokens( +# "audio", get_max_phi4mm_audio_tokens) +# @MULTIMODAL_REGISTRY.register_max_multimodal_tokens( +# "image", get_max_phi4mm_image_tokens) +# @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm) +# @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm) +@MULTIMODAL_REGISTRY.register_processor( + Phi4MMMultiModalProcessor, + info=Phi4MMProcessingInfo, + dummy_inputs=Phi4MMDummyInputsBuilder, +) class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal, SupportsV0Only): """ From 1a3e9c5825d0065bde08f7c0f5cc22bb1d60eea3 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 25 Mar 2025 23:28:07 +0800 Subject: [PATCH 02/36] remove unused func Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 71 ---------------------------- 1 file changed, 71 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 37a0c30c9bbc..36f4e16a5412 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1505,77 +1505,6 @@ def get_image_size_with_most_features(self) -> ImageSize: max_side = vit_image_size * dynamic_hd_size return ImageSize(height=max_side, width=vit_image_size) - def _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size, - vit_patch_size, token_compression_factor): - """ - compute the number of tokens an image is expected to take up considering - the image encoder architecture and exclude output features containing - only padding pixels - - for siglip, vit_image_size=448, vit_patch_size=14, so output will be - 32x32 feature map - NOTE right now, Phi4MM uses hard-coded token_compression_factor=2 - """ - assert vit_image_size % vit_patch_size == 0, \ - "vit_image_size must be divisible by vit_patch_size" - assert vit_image_size // vit_patch_size % token_compression_factor == 0, \ - "vit_image_size // vit_patch_size must be divisible by "\ - "token_compression_factor" - - target_aspect_ratio, target_height, target_width = ( - _find_target_aspect_ratio(image, - vit_image_size, - dynamic_hd_size, - min_num=1)) - assert target_aspect_ratio[ - 0] * vit_image_size == target_width, \ - f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}" - assert target_aspect_ratio[ - 1] * vit_image_size == target_height, \ - f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}" - assert (target_height % vit_image_size == 0 - and target_width % vit_image_size == 0) - - padding_height, padding_width = _get_padding_size(image, target_height, - target_width) - assert padding_width == 0 or padding_height == 0, \ - "padding_width or padding_height must be 0" - - target_feat_width = target_width // vit_patch_size - target_feat_height = target_height // vit_patch_size - if padding_width >= vit_patch_size: - assert padding_height == 0, "padding_height not 0" - non_pad_feat_width = target_feat_width - math.floor( - padding_width / vit_patch_size) - non_pad_feat_height = target_feat_height - elif padding_height >= vit_patch_size: - assert padding_width == 0, "padding_width not 0" - non_pad_feat_height = target_feat_height - math.floor( - padding_height / vit_patch_size) - non_pad_feat_width = target_feat_width - else: - # small padding shorter than a vit patch - non_pad_feat_width = target_feat_width - 
non_pad_feat_height = target_feat_height - - feat_width = non_pad_feat_width // token_compression_factor - feat_height = non_pad_feat_height // token_compression_factor - # NOTE it's possible that the non-padding feature is not divisible - if non_pad_feat_width % token_compression_factor != 0: - feat_width += 1 - if non_pad_feat_height % token_compression_factor != 0: - feat_height += 1 - num_hd_patch_tokens = feat_width * feat_height - num_hd_newline_tokens = feat_height - vit_feature_size = vit_image_size // vit_patch_size - num_global_image_tokens = (vit_feature_size // token_compression_factor)**2 - num_sep_tokens = 1 - num_global_image_newline_tokens = \ - vit_feature_size // token_compression_factor - - return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens + - num_hd_newline_tokens + num_global_image_newline_tokens) - class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): From 41e00f6240cad5c317c77b01c00ee51bd2a2bd79 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 27 Mar 2025 00:35:09 +0800 Subject: [PATCH 03/36] make image inference work Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 34 ++++++++++++++++++---------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 36f4e16a5412..8588750e144a 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1447,7 +1447,7 @@ def audio_tokens(self) -> list[str]: return [f"<|audio_{i+1}|>" for i in range(100)] def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "audio": None} + return {"image": None} def get_mm_max_tokens_per_item( self, @@ -1456,7 +1456,7 @@ def get_mm_max_tokens_per_item( ) -> Mapping[str, int]: return { "image": self.get_max_image_tokens(), - "audio": self.get_max_audio_tokens(), + # "audio": self.get_max_audio_tokens(), } def get_max_audio_tokens(self) -> int: @@ -1513,7 +1513,7 @@ def get_dummy_processor_inputs( seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_audios = mm_counts.get("audio", 0) + # num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) target_width, target_height = \ @@ -1524,21 +1524,36 @@ def get_dummy_processor_inputs( self._get_dummy_images(width=target_width, height=target_height, num_images=num_images), - "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, - num_audios=num_audios), + # "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, + # num_audios=num_audios), } image_tokens: list[str] = self.info.image_tokens[:num_images] - audio_tokens: list[str] = self.info.audio_tokens[:num_audios] + # audio_tokens: list[str] = self.info.audio_tokens[:num_audios] return ProcessorInputs( - prompt_text="".join(image_tokens + audio_tokens), + prompt_text="".join(image_tokens), mm_data=mm_data, ) class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + if mm_data: + processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs) + else: + tokenizer = self.info.get_tokenizer() + processed_outputs = tokenizer(prompt, + add_special_tokens=True, + return_tensors="pt") + return processed_outputs + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -1560,10 +1575,6 @@ def _get_prompt_updates( 
image_tokens: list[str] = self.info.image_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore - tokenizer = self.info.get_tokenizer() - bos_token_id = tokenizer.bos_token_id - assert isinstance(bos_token_id, int) - def get_image_replacement_phi4mm(item_idx: int): images = mm_items.get_items( "image", (ImageEmbeddingItems, ImageProcessorItems)) @@ -1575,7 +1586,6 @@ def get_image_replacement_phi4mm(item_idx: int): num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, - processor=hf_processor, ) image_tokens = [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens From 373d0a82aeedda788411857827e10f2b896c103d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 27 Mar 2025 15:37:08 +0800 Subject: [PATCH 04/36] image work Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 8588750e144a..fc9770de3320 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1547,6 +1547,12 @@ def _call_hf_processor( ) -> BatchFeature: if mm_data: processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs) + num_img_tokens = [ + self.info.get_num_image_tokens(image_width=img_size[0], image_height=img_size[1]) + for img_size in processed_outputs["image_sizes"] + ] + processed_outputs["num_img_tokens"] = num_img_tokens + processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds') else: tokenizer = self.info.get_tokenizer() processed_outputs = tokenizer(prompt, @@ -1561,8 +1567,9 @@ def _get_mm_fields_config( ) -> Mapping[str, MultiModalFieldConfig]: return dict( pixel_values=MultiModalFieldConfig.batched("image"), + image_attention_mask=MultiModalFieldConfig.batched("image"), image_sizes=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), + num_img_tokens=MultiModalFieldConfig.batched("image"), ) def _get_prompt_updates( From a3f972596e0f185d1fab4216fbe7fdd85118f7e6 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 27 Mar 2025 23:46:21 +0800 Subject: [PATCH 05/36] fix multi images Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index fc9770de3320..02b03fb1db89 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1445,6 +1445,11 @@ def image_tokens(self) -> list[str]: @property def audio_tokens(self) -> list[str]: return [f"<|audio_{i+1}|>" for i in range(100)] + + @property + def dynamic_hd(self) -> int: + image_processor = self.get_hf_processor().image_processor + return image_processor.dynamic_hd def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -1478,11 +1483,12 @@ def get_num_image_tokens( if vision_encoder_name is None: vision_encoder_name = SIGLIP_NAME prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] - dynamic_hd_size = prepro_config['dynamic_hd'] vit_image_size = prepro_config['vit_image_size'] vit_patch_size = prepro_config['vit_patch_size'] token_compression_factor = prepro_config['token_compression_factor'] + dynamic_hd_size = self.dynamic_hd + image_num_tokens = 
_compute_num_image_tokens( image_width, image_height, dynamic_hd_size=dynamic_hd_size, @@ -1499,10 +1505,9 @@ def get_image_size_with_most_features(self) -> ImageSize: if vision_encoder_name is None: vision_encoder_name = SIGLIP_NAME prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] - dynamic_hd_size = prepro_config['dynamic_hd'] vit_image_size = prepro_config['vit_image_size'] - max_side = vit_image_size * dynamic_hd_size + max_side = vit_image_size * self.dynamic_hd return ImageSize(height=max_side, width=vit_image_size) @@ -1578,7 +1583,6 @@ def _get_prompt_updates( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: - hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_tokens: list[str] = self.info.image_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore From 83ce87c2fb97a6998a696b531a8d9147443b3303 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 28 Mar 2025 00:10:48 +0800 Subject: [PATCH 06/36] init v1 Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 64 ++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 02b03fb1db89..d1df03f2d774 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -41,9 +41,9 @@ from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config from .idefics2_vision_model import Idefics2VisionTransformer -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsV0Only +from .interfaces import SupportsLoRA, SupportsMultiModal, MultiModalEmbeddings from .phi4mm_audio import AudioEmbedding -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix +from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix, merge_multimodal_embeddings # <|endoftext10|> (see vocab.json in hf model) _IMAGE_PLACEHOLDER_TOKEN_ID = 200010 @@ -1649,8 +1649,7 @@ def get_audio_replacement_phi4mm(item_idx: int): info=Phi4MMProcessingInfo, dummy_inputs=Phi4MMDummyInputsBuilder, ) -class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal, - SupportsV0Only): +class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): """ Implements the Phi-4-multimodal-instruct model in vLLM. """ @@ -1930,6 +1929,63 @@ def merge_image_features_to_inputs_embeds( ) return merged_embeds + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("audio_features", + "audio_embeds") and "audios" not in modalities: + modalities["audios"] = self._parse_and_validate_audio_input( + **kwargs) + + return modalities + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] 
= () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + audio_projection_mode = 'speech' + for modality in modalities: + # make sure process images first + if modality == "images": + audio_projection_mode = "vision" + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + # if modality == "audios": + # audio_input = modalities["audios"] + # audio_embeddings = self._process_audio_input(audio_input) + # multimodal_embeddings += audio_embeddings + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.embed_tokens(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID]) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, From 20fa915b1bd7682a22deb2288e647946067bb45a Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 28 Mar 2025 01:07:19 +0800 Subject: [PATCH 07/36] v1 image work Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 150 ++++++++++++++++++--------- 1 file changed, 103 insertions(+), 47 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index d1df03f2d774..d509b5003131 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -22,7 +22,7 @@ from vllm.inputs.data import TokenInputs, token_inputs from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.models.llama import LlamaModel @@ -39,6 +39,7 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.utils import is_list_of from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, MultiModalEmbeddings @@ -498,7 +499,7 @@ def get_img_features(self, def forward(self, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - image_attention_mask: torch.Tensor) -> torch.FloatTensor: + image_attention_mask: torch.Tensor) -> list[torch.FloatTensor]: """ process image and return vision embeddings. @@ -667,6 +668,40 @@ def forward(self, pixel_values: torch.FloatTensor, return img_set_tensor +class Phi4MMImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: Union[torch.Tensor, List[torch.Tensor]] + """ + Shape: + `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` + + Note that `num_patches` may be different per batch and image, + in which case the data is passed as a list instead of a batched tensor. + """ + + image_sizes: torch.Tensor + """ + Shape: `(batch_size * num_images, 2)` + + This should be in `(height, width)` format. 
+ """ + + num_img_tokens: list[int] + """Shape: `(batch_size * num_images)`""" + + image_attention_mask: torch.Tensor + """Shape: `(batch_size * num_images, H_mask, W_mask)`""" + + +class Phi4MMImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: Union[torch.Tensor, List[torch.Tensor]] + """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ + + class Phi4MMAudioFeatureInputs(TypedDict): type: Literal["audio_features"] data: Tuple[NestedTensors] @@ -679,6 +714,7 @@ class Phi4MMAudioEmbeddingInputs(TypedDict): """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)""" +Phi4MMImageInput = Union[Phi4MMImagePixelInputs, Phi4MMImageEmbeddingInputs] Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs] @@ -1733,7 +1769,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() def _audio_features_to_embeddings( self, @@ -1848,7 +1884,7 @@ def _process_audio_input(self, input_ids: torch.Tensor, def _parse_and_validate_image_input(self, **kwargs: object) -> Optional[Dict]: - pixel_values: Optional[Dict] = kwargs.get("pixel_values") + pixel_values: NestedTensors = kwargs.get("pixel_values") if pixel_values is None: return None @@ -1858,8 +1894,8 @@ def _parse_and_validate_image_input(self, assert image_sizes is not None and image_attention_mask is not None\ and num_img_tokens is not None, "Missing image inputs" - if isinstance(pixel_values, list): - assert pixel_values[0].dim() == 5, "Incorrect image inputs" + if is_list_of(pixel_values, torch.Tensor): + assert all(p.dim() == 5 for p in pixel_values), "Incorrect image inputs" # list len is batch_size. # each tensor has dimension: num_img_per_example, num_hd_patches, # channels, height, width. 
@@ -1900,12 +1936,13 @@ def _parse_and_validate_image_input(self, else: raise ValueError("Incorrect image_attention_mask inputs") - return { - 'pixel_values': pixel_values, - 'image_sizes': image_sizes, - 'image_attention_mask': image_attention_mask, - 'num_img_tokens': num_img_tokens, - } + return Phi4MMImagePixelInputs( + type="pixel_values_videos", + data=pixel_values, + image_sizes=image_sizes, + image_attention_mask=image_attention_mask, + num_img_tokens=num_img_tokens, + ) def merge_image_features_to_inputs_embeds( self, @@ -1946,6 +1983,18 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: return modalities + def _process_image_input(self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]: + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + dtype = next(self.vision_encoder.parameters()).dtype + pixel_values = image_input['data'].to(dtype) + image_sizes = image_input['image_sizes'] + image_attention_mask = image_input['image_attention_mask'] + image_embeds = self.vision_encoder( + pixel_values, image_sizes, image_attention_mask) + return image_embeds + def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: @@ -1966,7 +2015,7 @@ def get_multimodal_embeddings( audio_projection_mode = "vision" image_input = modalities["images"] vision_embeddings = self._process_image_input(image_input) - multimodal_embeddings += vision_embeddings + multimodal_embeddings += tuple(vision_embeddings) # if modality == "audios": # audio_input = modalities["audios"] # audio_embeddings = self._process_audio_input(audio_input) @@ -1985,52 +2034,59 @@ def get_input_embeddings( input_ids, inputs_embeds, multimodal_embeddings, [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID]) return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[Phi4MMImagePixelInputs] = None, + audio_input: Optional[Phi4MMAudioFeatureInputs] = None, + ) -> torch.Tensor: + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID, + ) + + # if audio_input is not None: + # audio_embeds = self._process_audio_input(audio_input) + # inputs_embeds = merge_multimodal_embeddings( + # input_ids, + # inputs_embeds, + # audio_embeds, + # placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID, + # ) + return inputs_embeds def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> torch.Tensor: if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - # Each entry in this is a pair of audio_features and audio_embed - # lengths + + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. 
+ elif inputs_embeds is None: + image_input = self._parse_and_validate_image_input(**kwargs) audio_input = self._parse_and_validate_audio_input(**kwargs) - image_inputs = self._parse_and_validate_image_input(**kwargs) - - has_audio = audio_input is not None - has_image = image_inputs is not None - - if has_audio: - audio_projection_mode = 'vision' if has_image else 'speech' - inputs_embeds = self._process_audio_input( - input_ids, audio_input, audio_projection_mode) - - if has_image: - dtype = self.vision_encoder.img_processor.embeddings.\ - patch_embedding.weight.dtype - pixel_values = image_inputs['pixel_values'].to(dtype) - image_sizes = image_inputs['image_sizes'] - image_attention_mask = image_inputs['image_attention_mask'] - image_set_tensors = self.vision_encoder( - pixel_values, image_sizes, image_attention_mask) - if not has_audio: - inputs_embeds = self.model.embed_tokens(input_ids) - - inputs_embeds = self.merge_image_features_to_inputs_embeds( - input_ids, inputs_embeds, image_set_tensors) - - if has_image or has_audio: - # multi-modal input, we have set inputs_embeds properly in - # previous steps - input_ids = None - else: - # text-only, we keep using original input_ids + + if image_input is None and audio_input is None: inputs_embeds = None + else: + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + audio_input=audio_input) + input_ids = None hidden_states = self.model( input_ids, From 6feca07a9dd0fdb2a9c452d15625d496de1bb608 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 30 Mar 2025 00:55:02 +0800 Subject: [PATCH 08/36] make audio run Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 85 ++++++++++++++-------------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index d509b5003131..d4c06f98174c 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -12,7 +12,7 @@ import torch.nn as nn import torchvision.transforms as T from PIL import Image -from transformers import PretrainedConfig, SiglipVisionConfig, ProcessorMixin, BatchFeature +from transformers import PretrainedConfig, SiglipVisionConfig, ProcessorMixin, BatchFeature, SequenceFeatureExtractor from transformers.utils import logging from vllm.config import VllmConfig @@ -31,7 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors, MultiModalInputs) -from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataParser, ImageSize, MultiModalDataItems, AudioEmbeddingItems, AudioProcessorItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, @@ -1486,9 +1486,12 @@ def audio_tokens(self) -> list[str]: def dynamic_hd(self) -> int: image_processor = self.get_hf_processor().image_processor return image_processor.dynamic_hd + + def get_feature_extractor(self) -> SequenceFeatureExtractor: + return self.get_hf_processor().audio_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} + return {"audio": None, "image": None} def get_mm_max_tokens_per_item( self, @@ -1497,11 +1500,11 @@ def get_mm_max_tokens_per_item( ) -> Mapping[str, int]: return { "image": self.get_max_image_tokens(), - # "audio": 
self.get_max_audio_tokens(), + "audio": self.get_max_audio_tokens(), } def get_max_audio_tokens(self) -> int: - return 10000 + return 188 def get_max_image_tokens(self) -> int: target_width, target_height = self.get_image_size_with_most_features() @@ -1554,7 +1557,7 @@ def get_dummy_processor_inputs( seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - # num_audios = mm_counts.get("audio", 0) + num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) target_width, target_height = \ @@ -1565,21 +1568,25 @@ def get_dummy_processor_inputs( self._get_dummy_images(width=target_width, height=target_height, num_images=num_images), - # "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, - # num_audios=num_audios), + "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, + num_audios=num_audios), } image_tokens: list[str] = self.info.image_tokens[:num_images] - # audio_tokens: list[str] = self.info.audio_tokens[:num_audios] + audio_tokens: list[str] = self.info.audio_tokens[:num_audios] return ProcessorInputs( - prompt_text="".join(image_tokens), + prompt_text="".join(image_tokens+audio_tokens), mm_data=mm_data, ) class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): + def _get_data_parser(self) -> MultiModalDataParser: + feature_extractor = self.info.get_feature_extractor() + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) + def _call_hf_processor( self, prompt: str, @@ -1587,6 +1594,9 @@ def _call_hf_processor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: + if "audios" in mm_data: + sr = self.info.get_feature_extractor().sampling_rate + mm_data['audios'] = [(data, sr) for data in mm_data['audios']] processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs) num_img_tokens = [ self.info.get_num_image_tokens(image_width=img_size[0], image_height=img_size[1]) @@ -1611,6 +1621,9 @@ def _get_mm_fields_config( image_attention_mask=MultiModalFieldConfig.batched("image"), image_sizes=MultiModalFieldConfig.batched("image"), num_img_tokens=MultiModalFieldConfig.batched("image"), + input_audio_embeds=MultiModalFieldConfig.batched("audio"), + audio_embed_sizes=MultiModalFieldConfig.batched("audio"), + audio_attention_mask=MultiModalFieldConfig.batched("audio"), ) def _get_prompt_updates( @@ -1662,7 +1675,7 @@ def get_audio_replacement_phi4mm(item_idx: int): ] audio_repl = [ PromptReplacement( - modality="image", + modality="audio", target=audio_token, replacement=get_audio_replacement_phi4mm, ) for audio_token in audio_tokens[:num_audios] @@ -1872,15 +1885,21 @@ def _process_audio_input(self, input_ids: torch.Tensor, # (e.g. multiple examples) and the second dim is the multi-audio dim # (e.g. 
multiple audios in the same example) audio_feature = [i[0] for j in audio_features for i in j] - audio_feature_len = [i[1].item() for j in audio_features for i in j] + # audio_feature_len = [i[1].item() for j in audio_features for i in j] # Add the batch dim via `squeeze` - return self._audio_features_to_embeddings( - input_ids.unsqueeze(0), - audio_feature, - audio_feature_len, - audio_projection_mode, - ).squeeze(0) + # return self._audio_features_to_embeddings( + # input_ids.unsqueeze(0), + # audio_feature, + # audio_feature_len, + # audio_projection_mode, + # ).squeeze(0) + audio_set_tensor = [ + self.embed_tokens_extend.get_audio_features( + audio_feature, audio_projection_mode=audio_projection_mode) + for audio_feature in audio_feature + ] + return audio_set_tensor def _parse_and_validate_image_input(self, **kwargs: object) -> Optional[Dict]: @@ -1944,28 +1963,6 @@ def _parse_and_validate_image_input(self, num_img_tokens=num_img_tokens, ) - def merge_image_features_to_inputs_embeds( - self, - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - image_set_tensors: List[torch.Tensor], - ): - position_tuple = (input_ids == _IMAGE_PLACEHOLDER_TOKEN_ID).nonzero( - as_tuple=True) - - assert all([t.shape[0] == 1 for t in image_set_tensors - ]), 'img_set_tensor should have shape (1, N_tokens, C)' - # Shape: (merged_N_tokens, C) - image_set_tensor = torch.cat(image_set_tensors, dim=1).squeeze(0) - image_set_tensor = image_set_tensor.to(inputs_embeds.dtype).to( - inputs_embeds.device) - merged_embeds = inputs_embeds.index_put( - indices=position_tuple, - values=image_set_tensor, - accumulate=False, - ) - return merged_embeds - def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: modalities = {} @@ -2016,10 +2013,10 @@ def get_multimodal_embeddings( image_input = modalities["images"] vision_embeddings = self._process_image_input(image_input) multimodal_embeddings += tuple(vision_embeddings) - # if modality == "audios": - # audio_input = modalities["audios"] - # audio_embeddings = self._process_audio_input(audio_input) - # multimodal_embeddings += audio_embeddings + if modality == "audios": + audio_input = modalities["audios"] + audio_embeddings = self._process_audio_input(audio_input) + multimodal_embeddings += audio_embeddings return multimodal_embeddings From 70478c8815af5ef38f604d15a06cff11f52fb878 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 30 Mar 2025 15:21:41 +0800 Subject: [PATCH 09/36] fix Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index d4c06f98174c..a9770debcdf5 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1838,7 +1838,7 @@ def _parse_and_validate_audio_input( Returns: Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs. 
""" - audio_features = kwargs.pop("audio_features", None) + audio_features = kwargs.pop("input_audio_embeds", None) audio_embeds = kwargs.pop("audio_embeds", None) if audio_features is None and audio_embeds is None: @@ -1862,7 +1862,7 @@ def _parse_and_validate_audio_input( raise AssertionError("This line should be unreachable.") - def _process_audio_input(self, input_ids: torch.Tensor, + def _process_audio_input(self, audio_input: Phi4MMAudioInputs, audio_projection_mode: str) -> NestedTensors: """ @@ -2015,7 +2015,7 @@ def get_multimodal_embeddings( multimodal_embeddings += tuple(vision_embeddings) if modality == "audios": audio_input = modalities["audios"] - audio_embeddings = self._process_audio_input(audio_input) + audio_embeddings = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode) multimodal_embeddings += audio_embeddings return multimodal_embeddings @@ -2038,6 +2038,7 @@ def get_input_embeddings_v0( image_input: Optional[Phi4MMImagePixelInputs] = None, audio_input: Optional[Phi4MMAudioFeatureInputs] = None, ) -> torch.Tensor: + audio_projection_mode = 'speech' inputs_embeds = self.get_input_embeddings(input_ids) if image_input is not None: image_embeds = self._process_image_input(image_input) @@ -2047,15 +2048,16 @@ def get_input_embeddings_v0( image_embeds, placeholder_token_id=_IMAGE_PLACEHOLDER_TOKEN_ID, ) + audio_projection_mode = 'vision' - # if audio_input is not None: - # audio_embeds = self._process_audio_input(audio_input) - # inputs_embeds = merge_multimodal_embeddings( - # input_ids, - # inputs_embeds, - # audio_embeds, - # placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID, - # ) + if audio_input is not None: + audio_embeds = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + audio_embeds, + placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN_ID, + ) return inputs_embeds def forward( From fbe07ff775091736422097c716b0c6780aac0924 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 30 Mar 2025 16:30:24 +0800 Subject: [PATCH 10/36] fix audio correctness Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index a9770debcdf5..513bb959d8a4 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1871,8 +1871,6 @@ def _process_audio_input(self, created by `input_mapper_for_phi4mm_audio`. Args: - input_ids (torch.Tensor): Input IDs (the prompt in this case, - before the audio token replication). audio_input (Phi4MMAudioInputs): Audio input. Returns: @@ -1884,20 +1882,11 @@ def _process_audio_input(self, audio_features = audio_input["data"] # (e.g. multiple examples) and the second dim is the multi-audio dim # (e.g. 
multiple audios in the same example) - audio_feature = [i[0] for j in audio_features for i in j] - # audio_feature_len = [i[1].item() for j in audio_features for i in j] - # Add the batch dim via `squeeze` - - # return self._audio_features_to_embeddings( - # input_ids.unsqueeze(0), - # audio_feature, - # audio_feature_len, - # audio_projection_mode, - # ).squeeze(0) - audio_set_tensor = [ - self.embed_tokens_extend.get_audio_features( - audio_feature, audio_projection_mode=audio_projection_mode) - for audio_feature in audio_feature + + dtype = next(self.embed_tokens_extend.parameters()).dtype + audio_set_tensor = [self.embed_tokens_extend.get_audio_features( + feature.to(dtype), audio_projection_mode=audio_projection_mode) + for feature in audio_features ] return audio_set_tensor From 49fb233fab3f0fe2130a2b666692b3ebdfef6216 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 30 Mar 2025 19:51:59 +0800 Subject: [PATCH 11/36] fix multi audios Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 34 ++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 513bb959d8a4..240422137bf4 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -44,7 +44,7 @@ from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, MultiModalEmbeddings from .phi4mm_audio import AudioEmbedding -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix, merge_multimodal_embeddings +from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix, merge_multimodal_embeddings, flatten_bn # <|endoftext10|> (see vocab.json in hf model) _IMAGE_PLACEHOLDER_TOKEN_ID = 200010 @@ -705,7 +705,10 @@ class Phi4MMImageEmbeddingInputs(TypedDict): class Phi4MMAudioFeatureInputs(TypedDict): type: Literal["audio_features"] data: Tuple[NestedTensors] - """Shape: `((batch_size, num_audios, 80, M), )""" + """Shape: `((batch_size * num_audios, 80, M), )""" + + audio_embed_sizes: torch.Tensor + """Shape: `(batch_size * num_audios)`""" class Phi4MMAudioEmbeddingInputs(TypedDict): @@ -1839,18 +1842,28 @@ def _parse_and_validate_audio_input( Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs. """ audio_features = kwargs.pop("input_audio_embeds", None) + audio_embed_sizes = kwargs.pop("audio_embed_sizes", None) audio_embeds = kwargs.pop("audio_embeds", None) if audio_features is None and audio_embeds is None: return None if audio_features is not None: - if not isinstance(audio_features, (torch.Tensor, list)): + assert isinstance(audio_embed_sizes, torch.Tensor) + if isinstance(audio_features, torch.Tensor): + assert audio_features.size(0) == len(audio_embed_sizes), ( + "audio_features and audio_embed_sizes must have the same length") + elif is_list_of(audio_features, list): + assert len(audio_features) == len(audio_embed_sizes), ( + "audio_features and audio_embed_sizes must have the same length") + else: raise ValueError("Incorrect type of audio features. 
" f"Got type: {type(audio_features)}") + return Phi4MMAudioFeatureInputs(type="audio_features", - data=audio_features) + data=flatten_bn(audio_features, concat=True), + audio_embed_sizes=flatten_bn(audio_embed_sizes, concat=True)) if audio_embeds is not None: if not isinstance(audio_embeds, (torch.Tensor, list)): @@ -1880,15 +1893,18 @@ def _process_audio_input(self, return audio_input["data"] audio_features = audio_input["data"] + audio_sizes = audio_input["audio_embed_sizes"] # (e.g. multiple examples) and the second dim is the multi-audio dim # (e.g. multiple audios in the same example) dtype = next(self.embed_tokens_extend.parameters()).dtype - audio_set_tensor = [self.embed_tokens_extend.get_audio_features( - feature.to(dtype), audio_projection_mode=audio_projection_mode) - for feature in audio_features - ] - return audio_set_tensor + audio_padded_embeds = self.embed_tokens_extend.get_audio_features( + audio_features.to(dtype), + audio_projection_mode=audio_projection_mode, + ) + audio_embeds = [audio_padded_embeds[idx, :size] + for idx, size in enumerate(audio_sizes)] + return audio_embeds def _parse_and_validate_image_input(self, **kwargs: object) -> Optional[Dict]: From 51dde9c4b6d8b11cc7b271342b36a0db40f8703f Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 30 Mar 2025 22:37:52 +0800 Subject: [PATCH 12/36] fix resampling Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 16 +++++++++++++++- vllm/multimodal/parse.py | 3 ++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 240422137bf4..495d7738f22c 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -7,6 +7,7 @@ TypedDict, Union) import numpy as np +import numpy.typing as npt import scipy.signal import torch import torch.nn as nn @@ -1587,8 +1588,21 @@ def get_dummy_processor_inputs( class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: + + def scipy_resample_audio( + audio: npt.NDArray[np.floating], + *, + orig_sr: float, + target_sr: float, + ): + if orig_sr > target_sr: + return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr) + elif orig_sr < target_sr: + return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1) + return audio + feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate, resample_func=scipy_resample_audio) def _call_hf_processor( self, diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 772b1609a9fb..8d723b6bef8f 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -308,10 +308,11 @@ class MultiModalDataParser: items to the model's expected sampling rate. 
""" - def __init__(self, *, target_sr: Optional[float] = None) -> None: + def __init__(self, *, target_sr: Optional[float] = None, resample_func: Optional[Callable] = None,) -> None: super().__init__() self.target_sr = target_sr + self.audio_resampler = resample_audio if resample_func is None else resample_func def _is_embeddings( self, data: object From f0715817fb917ab57cd4f785ec947de338876166 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 30 Mar 2025 23:01:43 +0800 Subject: [PATCH 13/36] fix resampling Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/multimodal/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 8d723b6bef8f..3f30ec646af0 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -368,7 +368,7 @@ def _parse_audio_data( "Audio resampling is not supported when " "`target_sr` is not provided") - new_audio = resample_audio(audio, + new_audio = self.audio_resampler(audio, orig_sr=orig_sr, target_sr=target_sr) From d665855affd73aad2832757d4ad007b50b519287 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 31 Mar 2025 01:48:39 +0800 Subject: [PATCH 14/36] fix audio diff Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 495d7738f22c..a906247ad9f9 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -708,9 +708,6 @@ class Phi4MMAudioFeatureInputs(TypedDict): data: Tuple[NestedTensors] """Shape: `((batch_size * num_audios, 80, M), )""" - audio_embed_sizes: torch.Tensor - """Shape: `(batch_size * num_audios)`""" - class Phi4MMAudioEmbeddingInputs(TypedDict): type: Literal["audio_embeds"] @@ -1621,6 +1618,10 @@ def _call_hf_processor( ] processed_outputs["num_img_tokens"] = num_img_tokens processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds') + if "audios" in mm_data: + feature_sizes = [size.item() * 8 for size in processed_outputs['audio_embed_sizes']] + audio_features = processed_outputs['input_audio_embeds'] + processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)] else: tokenizer = self.info.get_tokenizer() processed_outputs = tokenizer(prompt, @@ -1874,10 +1875,8 @@ def _parse_and_validate_audio_input( raise ValueError("Incorrect type of audio features. " f"Got type: {type(audio_features)}") - return Phi4MMAudioFeatureInputs(type="audio_features", - data=flatten_bn(audio_features, concat=True), - audio_embed_sizes=flatten_bn(audio_embed_sizes, concat=True)) + data=flatten_bn(audio_features)) if audio_embeds is not None: if not isinstance(audio_embeds, (torch.Tensor, list)): @@ -1907,17 +1906,14 @@ def _process_audio_input(self, return audio_input["data"] audio_features = audio_input["data"] - audio_sizes = audio_input["audio_embed_sizes"] # (e.g. multiple examples) and the second dim is the multi-audio dim # (e.g. 
multiple audios in the same example) dtype = next(self.embed_tokens_extend.parameters()).dtype - audio_padded_embeds = self.embed_tokens_extend.get_audio_features( - audio_features.to(dtype), + audio_embeds = [self.embed_tokens_extend.get_audio_features( + features.unsqueeze(0).to(dtype), audio_projection_mode=audio_projection_mode, - ) - audio_embeds = [audio_padded_embeds[idx, :size] - for idx, size in enumerate(audio_sizes)] + ).squeeze(0) for features in audio_features] return audio_embeds def _parse_and_validate_image_input(self, From f63d7c285fb9cde1e29e75a435dda8b90b244c62 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 2 Apr 2025 00:35:33 +0800 Subject: [PATCH 15/36] unpad audio features Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index a906247ad9f9..98f2aedbee7a 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1549,6 +1549,19 @@ def get_image_size_with_most_features(self) -> ImageSize: max_side = vit_image_size * self.dynamic_hd return ImageSize(height=max_side, width=vit_image_size) + + def get_audio_feature_nums(self, audio_len: int, sr: float): + if sr >= 16000: + win_length = 400 + hop_length = 160 + elif 8000 <= sr < 16000: + win_length = 200 + hop_length = 80 + else: + raise RuntimeError(f"Input data using an unsupported sample rate: {sr}") + + # Spec 1: SpeechLib cut remaining sample insufficient for a hop + return (audio_len - win_length) // hop_length + 1 class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): @@ -1619,8 +1632,8 @@ def _call_hf_processor( processed_outputs["num_img_tokens"] = num_img_tokens processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds') if "audios" in mm_data: - feature_sizes = [size.item() * 8 for size in processed_outputs['audio_embed_sizes']] audio_features = processed_outputs['input_audio_embeds'] + feature_sizes = [self.info.get_audio_feature_nums(len(audio), sr) for audio, sr in mm_data['audios']] processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)] else: tokenizer = self.info.get_tokenizer() From 83c08fca038d38491e97e6128b1b3642ec930fd6 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 2 Apr 2025 00:59:10 +0800 Subject: [PATCH 16/36] fix v1 audio Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 98f2aedbee7a..7b2651b626b0 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1881,6 +1881,9 @@ def _parse_and_validate_audio_input( if isinstance(audio_features, torch.Tensor): assert audio_features.size(0) == len(audio_embed_sizes), ( "audio_features and audio_embed_sizes must have the same length") + elif is_list_of(audio_features, torch.Tensor): + assert len(audio_features) == len(audio_embed_sizes), ( + "audio_features and audio_embed_sizes must have the same length") elif is_list_of(audio_features, list): assert len(audio_features) == len(audio_embed_sizes), ( "audio_features and audio_embed_sizes must have the same length") @@ -2001,7 +2004,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: 
"image_embeds") and "images" not in modalities: modalities["images"] = self._parse_and_validate_image_input( **kwargs) - if input_key in ("audio_features", + if input_key in ("input_audio_embeds", "audio_embeds") and "audios" not in modalities: modalities["audios"] = self._parse_and_validate_audio_input( **kwargs) @@ -2044,7 +2047,7 @@ def get_multimodal_embeddings( if modality == "audios": audio_input = modalities["audios"] audio_embeddings = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode) - multimodal_embeddings += audio_embeddings + multimodal_embeddings += tuple(audio_embeddings) return multimodal_embeddings From 1b9f027477373fb20dcc014cfc9f94638c256587 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 2 Apr 2025 01:11:39 +0800 Subject: [PATCH 17/36] clean legacy code Signed-off-by: Isotr0py <2037008807@qq.com> --- .../vision_language_multi_image.py | 2 +- vllm/model_executor/models/phi4mm.py | 795 ------------------ 2 files changed, 1 insertion(+), 796 deletions(-) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 98a739169d70..6736d7d72299 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -335,7 +335,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=10000, + max_model_len=12800, max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, enable_lora=True, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 7b2651b626b0..a4b95c743782 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -79,41 +79,6 @@ } -def get_max_dummy_image(ctx: InputContext): - hf_config = ctx.get_hf_config() - vision_encoder_name = hf_config.img_processor - if vision_encoder_name is None: - vision_encoder_name = SIGLIP_NAME - prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] - dynamic_hd_size = prepro_config['dynamic_hd'] - vit_image_size = prepro_config['vit_image_size'] - - max_side = vit_image_size * dynamic_hd_size - dummy_image = dummy_image_for_phi4mm(vit_image_size, max_side) - return dummy_image - - -# image token length -def get_max_phi4mm_image_tokens(ctx: InputContext): - dummy_image = get_max_dummy_image(ctx) - - hf_config = ctx.get_hf_config() - vision_encoder_name = hf_config.img_processor - if vision_encoder_name is None: - vision_encoder_name = SIGLIP_NAME - prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] - dynamic_hd_size = prepro_config['dynamic_hd'] - vit_image_size = prepro_config['vit_image_size'] - vit_patch_size = prepro_config['vit_patch_size'] - token_compression_factor = prepro_config['token_compression_factor'] - - image_num_tokens = _compute_num_image_tokens(dummy_image, dynamic_hd_size, - vit_image_size, - vit_patch_size, - token_compression_factor) - return image_num_tokens - - def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): best_ratio_diff = float('inf') @@ -172,181 +137,6 @@ def _get_padding_size(orig_width: int, orig_height: int, target_height: int, tar return padding_height, padding_width -def dynamic_preprocess(image, - min_num=1, - max_num=12, - image_size=384, - mask_size=27): - orig_width, orig_height = image.size - target_aspect_ratio, target_height, 
target_width =\ - _find_target_aspect_ratio( - orig_width, orig_height, image_size, max_num, min_num) - padding_height, padding_width = _get_padding_size(image, target_height, - target_width) - - # Calculate the ratio - ratio_width = target_width / orig_width - ratio_height = target_height / orig_height - if ratio_width < ratio_height: - new_size = (target_width, int(orig_height * ratio_width)) - else: - new_size = (int(orig_width * ratio_height), target_height) - - attention_mask = torch.ones((int(mask_size * target_aspect_ratio[1]), - int(mask_size * target_aspect_ratio[0]))) - if padding_width >= 14: - attention_mask[:, -math.floor(padding_width / 14):] = 0 - if padding_height >= 14: - attention_mask[-math.floor(padding_height / 14):, :] = 0 - assert attention_mask.sum( - ) > 0, f'attention mask is empty {attention_mask}' - - if min(new_size[1], target_height) < 10 or min(new_size[0], - target_width) < 10: - raise ValueError(f'the aspect ratio is very extreme {new_size}') - - image = T.functional.resize( - image, - [new_size[1], new_size[0]], - ) - - resized_img = T.functional.pad(image, - [0, 0, padding_width, padding_height], - fill=[255, 255, 255]) - - return resized_img, attention_mask - - -def pad_to_max_num_crops(images, max_crops=5): - """ - images: B x 3 x H x W, B<=max_crops - """ - B, _, H, W = images.shape - if max_crops > B: - pad = torch.zeros(max_crops - B, - 3, - H, - W, - dtype=images.dtype, - device=images.device) - images = torch.cat([images, pad], dim=0) - return images - - -def pad_mask_to_max_num_crops(masks, max_crops=5): - B, H, W = masks.shape - if max_crops > B: - pad = torch.ones(max_crops - B, - H, - W, - dtype=masks.dtype, - device=masks.device) - masks = torch.cat([masks, pad], dim=0) - return masks - - -def preprocess(images, dynamic_hd_size, vit_resolution, vit_patch_size): - - # Basic settings. 
- img_processor = T.Compose([ - T.ToTensor(), - T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), - ]) - # Dynamic HD - base_resolution = vit_resolution - images = [image.convert('RGB') for image in images] - # cover 384 and 448 resolution - mask_resolution = base_resolution // vit_patch_size - elems, image_attention_masks = [], [] - for im in images: - elem, attention_mask = dynamic_preprocess(im, - max_num=dynamic_hd_size, - image_size=base_resolution, - mask_size=mask_resolution) - elems.append(elem) - image_attention_masks.append(attention_mask) - hd_images = [img_processor(im) for im in elems] - global_image = [ - torch.nn.functional.interpolate( - im.unsqueeze(0).float(), - size=(base_resolution, base_resolution), - mode='bicubic', - ).to(im.dtype) for im in hd_images - ] - shapes = [[im.size(1), im.size(2)] for im in hd_images] - mask_shapes = [[mask.size(0), mask.size(1)] - for mask in image_attention_masks] - global_attention_mask = [ - torch.ones((1, mask_resolution, mask_resolution)) for _ in hd_images - ] - hd_images_reshape = [ - im.reshape(1, 3, h // base_resolution, base_resolution, - w // base_resolution, base_resolution).permute( - 0, 2, 4, 1, 3, 5).reshape(-1, 3, base_resolution, - base_resolution).contiguous() - for im, (h, w) in zip(hd_images, shapes) - ] - attention_masks_reshape = [ - mask.reshape(1, h // mask_resolution, mask_resolution, - w // mask_resolution, mask_resolution).permute( - 0, 1, 3, 2, 4).reshape(-1, mask_resolution, - mask_resolution).contiguous() - for mask, (h, w) in zip(image_attention_masks, mask_shapes) - ] - # NOTE token compression is hard coded here, and odd numbers seems to fail - downsample_attention_masks = [ - mask[:, 0::2, - 0::2].reshape(1, h // mask_resolution, w // mask_resolution, - mask_resolution // 2 + mask_resolution % 2, - mask_resolution // 2 + mask_resolution % 2).permute( - 0, 1, 3, 2, 4) - for mask, (h, w) in zip(attention_masks_reshape, mask_shapes) - ] - downsample_attention_masks = [ - mask.reshape(mask.size(1) * mask.size(2), - mask.size(3) * mask.size(4)) - for mask in downsample_attention_masks - ] - # NOTE hard coded number of tokens - num_img_tokens = [ - 256 + 1 + int(mask.sum().item()) + int(mask[:, 0].sum().item()) + 16 - for mask in downsample_attention_masks - ] - - hd_images_reshape = [ - torch.cat([_global_image] + [_im], dim=0) - for _global_image, _im in zip(global_image, hd_images_reshape) - ] - hd_masks_reshape = [ - torch.cat([_global_mask] + [_mask], - dim=0) for _global_mask, _mask in zip( - global_attention_mask, attention_masks_reshape) - ] - max_crops = max([img.size(0) for img in hd_images_reshape]) - image_transformed = [ - pad_to_max_num_crops(im, max_crops) for im in hd_images_reshape - ] - image_transformed = torch.stack(image_transformed, dim=0) - mask_transformed = [ - pad_mask_to_max_num_crops(mask, max_crops) \ - for mask in hd_masks_reshape - ] - mask_transformed = torch.stack(mask_transformed, dim=0) - - returned_input_image_embeds = image_transformed - returned_image_sizes = torch.tensor(shapes, dtype=torch.long) - returned_image_attention_mask = mask_transformed - returned_num_img_tokens = num_img_tokens - - data = { - "pixel_values": returned_input_image_embeds, - "image_sizes": returned_image_sizes, - "image_attention_mask": returned_image_attention_mask, - "num_img_tokens": returned_num_img_tokens, - } - return data - - def get_navit_vision_model(layer_idx: int = -1, **kwargs): vision_config = { "hidden_size": 1152, @@ -719,188 +509,6 @@ class Phi4MMAudioEmbeddingInputs(TypedDict): 
Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs] -def speechlib_mel(sample_rate, n_fft, n_mels, fmin=None, fmax=None): - """Create a Mel filter-bank the same as SpeechLib FbankFC. - - Args: - sample_rate (int): Sample rate in Hz. number > 0 [scalar] - n_fft (int): FFT size. int > 0 [scalar] - n_mel (int): Mel filter size. int > 0 [scalar] - fmin (float): lowest frequency (in Hz). If None use 0.0. - float >= 0 [scalar] - fmax: highest frequency (in Hz). If None use sample_rate / 2. - float >= 0 [scalar] - - Returns - out (numpy.ndarray): Mel transform matrix - [shape=(n_mels, 1 + n_fft/2)] - """ - - bank_width = int(n_fft // 2 + 1) - if fmax is None: - fmax = sample_rate / 2 - if fmin is None: - fmin = 0 - assert fmin >= 0, "fmin cannot be negative" - assert (fmin < fmax <= - sample_rate / 2), "fmax must be between (fmin, samplerate / 2]" - - def mel(f): - return 1127.0 * np.log(1.0 + f / 700.0) - - def bin2mel(fft_bin): - return 1127.0 * np.log(1.0 + fft_bin * sample_rate / (n_fft * 700.0)) - - def f2bin(f): - return int((f * n_fft / sample_rate) + 0.5) - - # Spec 1: FFT bin range [f2bin(fmin) + 1, f2bin(fmax) - 1] - klo = f2bin(fmin) + 1 - khi = f2bin(fmax) - - khi = max(khi, klo) - - # Spec 2: SpeechLib uses triangles in Mel space - mlo = mel(fmin) - mhi = mel(fmax) - m_centers = np.linspace(mlo, mhi, n_mels + 2) - ms = (mhi - mlo) / (n_mels + 1) - - matrix = np.zeros((n_mels, bank_width), dtype=np.float32) - for m in range(0, n_mels): - left = m_centers[m] - center = m_centers[m + 1] - right = m_centers[m + 2] - for fft_bin in range(klo, khi): - mbin = bin2mel(fft_bin) - if left < mbin < right: - matrix[m, fft_bin] = 1.0 - abs(center - mbin) / ms - - return matrix - - -class LogFbankProcessor: - - def __init__(self): - - self._eightk_method = "fillzero" - self._mel = speechlib_mel(16000, 512, 80, fmin=None, fmax=7690).T - - self._hamming400 = np.hamming(400) # for 16k audio - self._hamming200 = np.hamming(200) # for 8k audio - - def extract_spectrogram(self, wav, fs): - """Extract spectrogram features from waveform. - Args: - wav (1D array): waveform of the input - fs (int): sampling rate of the waveform, 16000 or 8000. - If fs=8000, the waveform will be resampled to 16000Hz. - Output: - log_fbank (2D array): a TxD matrix of log Mel filterbank features. - D=80, and T is the number of frames. - """ - if wav.ndim > 1: - wav = np.squeeze(wav) - - # by default, we extract the mean if stereo - if len(wav.shape) == 2: - wav = wav.mean(1) - - # Resample to 16000 or 8000 if needed - if fs > 16000: - wav = scipy.signal.resample_poly(wav, 1, fs // 16000) - fs = 16000 - elif 8000 < fs < 16000: - wav = scipy.signal.resample_poly(wav, 1, fs // 8000) - fs = 8000 - elif fs < 8000: - raise RuntimeError(f"Unsupported sample rate {fs}") - - if fs == 8000: - if self._eightk_method == "resample": - # Input audio is 8 kHz. Convert to 16 kHz before feature - # extraction - wav = scipy.signal.resample_poly(wav, 2, 1) - fs = 16000 - # Do nothing here for fillzero method - elif fs != 16000: - # Input audio is not a supported sample rate. 
- raise RuntimeError( - f"Input data using an unsupported sample rate: {fs}") - - preemphasis = 0.97 - - if fs == 8000: - n_fft = 256 - win_length = 200 - hop_length = 80 - fft_window = self._hamming200 - elif fs == 16000: - n_fft = 512 - win_length = 400 - hop_length = 160 - fft_window = self._hamming400 - - # Spec 1: SpeechLib cut remaining sample insufficient for a hop - n_batch = (wav.shape[0] - win_length) // hop_length + 1 - # Here we don't use stride_tricks since the input array may not satisfy - # memory layout requirement and we need writeable output - # Here we only use list of views before copy to destination - # so it is more efficient than broadcasting - y_frames = np.array( - [ - wav[_stride:_stride + win_length] - for _stride in range(0, hop_length * n_batch, hop_length) - ], - dtype=np.float32, - ) - - # Spec 2: SpeechLib applies preemphasis within each batch - y_frames_prev = np.roll(y_frames, 1, axis=1) - y_frames_prev[:, 0] = y_frames_prev[:, 1] - y_frames = (y_frames - preemphasis * y_frames_prev) * 32768 - - S = np.fft.rfft(fft_window * y_frames, n=n_fft, - axis=1).astype(np.complex64) - - if fs == 8000: - # Need to pad the output to look like 16 kHz data but with zeros in - # the 4 to 8 kHz bins. - frames, bins = S.shape - padarray = np.zeros((frames, bins)) - S = np.concatenate((S[:, 0:-1], padarray), - axis=1) # Nyquist bin gets set to zero - - spec = np.abs(S).astype(np.float32) - return spec - - def extract_features(self, wav, fs): - """Extract log filterbank features from waveform. - Args: - wav (1D array): waveform of the input - fs (int): sampling rate of the waveform, 16000 or 8000. - If fs=8000, the waveform will be resampled to 16000Hz. - Output: - log_fbank (2D array): a TxD matrix of log Mel filterbank features. - D=80, and T is the number of frames. - """ - spec = self.extract_spectrogram(wav, fs) - spec_power = spec**2 - - fbank_power = np.clip(spec_power.dot(self._mel), 1.0, None) - log_fbank = np.log(fbank_power).astype(np.float32) - - return log_fbank - - -@lru_cache -def audio_feature_extractor() -> LogFbankProcessor: - # Creates an instance of the audio processor, needed to extract the - # the audio features from the sound file - # LRU cache ensures that we only make one copy - return LogFbankProcessor() - - def _compute_num_image_tokens( orig_width: int, orig_height: int, @@ -1019,255 +627,6 @@ def compute_logfbank_output_size(wav_length: int, fs: int) -> Tuple[int, int]: return T, mel_bins -def _get_audio_embed_sizes(audios, ctx: InputContext): - """ - Get the audio embedding sizes for each audio file. - - Args: - audios (List[Tuple[np.ndarray, int]]): List of audio files as tuples of - waveform and sample rate. - ctx (InputContext): Input context. - - Returns: - List[int]: List of audio embedding sizes. - """ - audio_embed_sizes = [] - for audio in audios: - audio_data, sf = audio - audio_frames, _ = compute_logfbank_output_size(len(audio_data), sf) - audio_embed_size = _compute_audio_embed_size(ctx.get_hf_config(), - audio_frames) - audio_embed_sizes.append(audio_embed_size) - return audio_embed_sizes - - -def _get_audio_id_to_input_ids(audios, ctx: InputContext, prompt_str=""): - """ - The following will search for `<|audio_{idx}|>` tokens and - return a mapping of audio placeholder tokens to audio placeholder token ids - based on the size of the audio embeddings. - - Args: - audios (List[Tuple[np.ndarray, int]]): List of audio files as tuples of - waveform and sample rate. - ctx (InputContext): Input context. 
- prompt_str (str): The prompt string. - - Returns: - Dict[str, List[int]]: Mapping of audio placeholder tokens to audio - placeholder token ids. - - """ - if len(audios) == 0: - return {} - - audio_embed_sizes = _get_audio_embed_sizes(audios, ctx) - audio_ids = re.findall(AUDIO_TOKEN_PATTERN, prompt_str) - audio_ids = [int(audio_id) for audio_id in audio_ids] - assert len(audio_ids) == len( - audio_embed_sizes - ), "Number of audio tokens and audio features do not match" - assert tuple(audio_ids) == tuple(range(1, - len(audio_ids) + - 1)), "Audio ids are not in order!" - audio_id_to_input_ids = { - f"<|audio_{audio_id}|>": - [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size - for audio_id, audio_embed_size in zip(audio_ids, audio_embed_sizes) - } - - return audio_id_to_input_ids - - -def _count_image_tokens(images, ctx: InputContext): - hf_config = ctx.get_hf_config() - vision_encoder_name = hf_config.img_processor - if vision_encoder_name is None: - vision_encoder_name = SIGLIP_NAME - prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] - dynamic_hd_size = prepro_config['dynamic_hd'] - vit_image_size = prepro_config['vit_image_size'] - vit_patch_size = prepro_config['vit_patch_size'] - token_compression_factor = prepro_config['token_compression_factor'] - - image_token_counts = [ - _compute_num_image_tokens(image, dynamic_hd_size, vit_image_size, - vit_patch_size, token_compression_factor) - for image in images - ] - return image_token_counts - - -def _get_image_id_to_input_ids(images, prompt, ctx: InputContext): - if len(images) == 0: - return {} - - image_ids = re.findall(IMAGE_TOKEN_PATTERN, prompt) - image_ids = [int(image_id) for image_id in image_ids] - assert len(image_ids) == len( - set(image_ids)), "Duplicate image tokens in prompt" - assert len(images) == len( - image_ids), "Number of images and image tokens in prompt do not match" - - # NOTE the following assertion is not strictly necessary - assert tuple(image_ids) == tuple(range(1, - len(image_ids) + - 1)), "Image ids are not in order" - - image_token_counts = _count_image_tokens(images, ctx) - image_id_to_input_ids = { - f"<|image_{image_id}|>": [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_tokens - for image_id, num_tokens in zip(image_ids, image_token_counts) - } - return image_id_to_input_ids - - -def input_processor_for_phi4mm(ctx: InputContext, - inputs: DecoderOnlyInputs) -> TokenInputs: - """ - Implements the input processor, which transforms the input prompt ids - to include the audio placeholder token. This will become the `input_ids` - in `forward` for the model. - - Args: - ctx (InputContext): Input context. - inputs (DecoderOnlyInputs): The inputs (e.g. prompt, prompt_token_ids) - to process. 
- - Returns: - TokenInputs: Processed inputs - """ - multi_modal_data = inputs.get("multi_modal_data") - if (multi_modal_data is None or - ("audio" not in multi_modal_data and "image" not in multi_modal_data)): - # pure text input, so no need to do pre-processing - return inputs - - prompt_str = inputs.get("prompt") - prompt_token_ids = inputs.get("prompt_token_ids") - # for offline_inference, we will get str input and we parse MM special - # tokens from it - # (ignore prompt_token_ids) - # for OAI server, we will get prompt_token_ids, where MM special tokens - # are already parsed - - if 'audio' in multi_modal_data: - audios = multi_modal_data["audio"] - - if not isinstance(audios, list): - audios = [audios] - if prompt_str is not None: - audio_id_to_input_ids = _get_audio_id_to_input_ids( - audios, ctx, prompt_str=prompt_str) - audio_embed_sizes = [] - elif prompt_token_ids is not None: - audio_id_to_input_ids = {} - audio_embed_sizes = _get_audio_embed_sizes(audios, ctx) - else: - audio_id_to_input_ids = {} - audio_embed_sizes = [] - - if 'image' in multi_modal_data: - # PIL Image or list of PIL Images - images = multi_modal_data["image"] - if not isinstance(images, list): - images = [images] - if prompt_str is not None: - image_id_to_input_ids = _get_image_id_to_input_ids( - images, prompt_str, ctx) - image_token_counts = [] - elif prompt_token_ids is not None: - image_id_to_input_ids = {} - image_token_counts = _count_image_tokens(images, ctx) - else: - image_id_to_input_ids = {} - image_token_counts = [] - - # Handle the case where the prompt is a string and we need to manually - # tokenize it. - # In this case, the `audio_id_to_input_ids` dict will be mapping from - # an audio placeholder - # string (e.g. `<|audio_1|>`) to the audio placeholder tokens for the - # given audio length. 
- if prompt_str: - pattern = r"(<\|image_\d+\|>|<\|audio_\d+\|>)" - prompt_chunk_strings = re.split(pattern, prompt_str) - prompt_chunk_strings = [s for s in prompt_chunk_strings if s != ""] - - # Create the new input_ids with the placeholder image and audio - # tokens inserted - tokenizer = cached_tokenizer_from_config(ctx.model_config) - input_ids = [] - has_imag, has_audio, has_user_text_input = False, False, False - for prompt_chunk_string in prompt_chunk_strings: - if re.match(IMAGE_TOKEN_PATTERN, prompt_chunk_string): - input_ids.extend(image_id_to_input_ids[prompt_chunk_string]) - has_imag = True - elif re.match(AUDIO_TOKEN_PATTERN, prompt_chunk_string): - input_ids.extend(audio_id_to_input_ids[prompt_chunk_string]) - has_audio = True - else: - curr_token_ids = tokenizer(prompt_chunk_string).input_ids - if not has_user_text_input: - for token_id in curr_token_ids: - if token_id not in NON_USER_INPUT_TOKENS: - has_user_text_input = True - break - input_ids.extend(curr_token_ids) - if has_audio and has_imag and has_user_text_input: - raise ValueError( - "Phi4MMForCausalLM does not support text + audio + image" + - " inputs in the same prompt") - # Handle the case where the prompt is already tokenized - else: - assert prompt_token_ids is not None, \ - "If string prompt isn't provided, prompt_token_ids must be" - - i = 0 - input_ids = prompt_token_ids - # only needed for later assertion - img_cnt, audio_cnt, user_text_input_cnt = 0, 0, 0 - image_token_count_iter = iter(image_token_counts) - audio_embed_size_iter = iter(audio_embed_sizes) - while i < len(input_ids): - token_id = input_ids[i] - if token_id == _AUDIO_PLACEHOLDER_TOKEN_ID: - token_count = next(audio_embed_size_iter) - audio_cnt += 1 - elif token_id == _IMAGE_PLACEHOLDER_TOKEN_ID: - token_count = next(image_token_count_iter) - img_cnt += 1 - else: - user_text_input_cnt += 1 if token_id not in \ - NON_USER_INPUT_TOKENS else 0 - i += 1 - continue - tokens = [token_id] * token_count - input_ids = input_ids[:i] + tokens + input_ids[i + 1:] - i += token_count - - if audio_cnt > 0 and img_cnt > 0 and user_text_input_cnt > 0: - raise ValueError( - "Phi4MMForCausalLM does not support text + audio + image" + - " inputs in the same prompt") - # If the below assertion fails, it might be that input pure-text - # messages contain image/audio special tokens literally - # (<|endoftext10|>, <|endoftext11|>). - assert (img_cnt == len(image_token_counts)), ( - f"Number of image tokens in prompt_token_ids ({img_cnt}) " - f"does not match number of images ({len(image_token_counts)})") - assert (audio_cnt == len(audio_embed_sizes)), ( - f"Number of audio tokens in prompt_token_ids ({audio_cnt}) " - f"does not match number of audios ({len(audio_embed_sizes)})") - - # NOTE: Create a defensive copy of the original inputs - return token_inputs( - prompt_token_ids=input_ids, - prompt=prompt_str, - multi_modal_data=multi_modal_data, - ) - - def _compute_audio_embed_size(hf_config: PretrainedConfig, audio_frames: int) -> int: """ Compute the audio embedding size based on the audio frames and @@ -1293,160 +652,6 @@ def get_max_phi4mm_audio_tokens(ctx: InputContext) -> int: return 10000 -def dummy_audio_for_phi4mm(audio_count: int) -> dict: - """ - Create dummy audio data for the Phi4MM model, which is used for profiling. - - Args: - audio_count (int): Number of audio samples. - - Returns: - dict: Dummy audio data. 
- """ - dummy_audio = np.full((_AUDIO_MAX_SOUNDFILE_SIZE, ), 0.0) - return [(dummy_audio, DUMMY_SAMPLING_FREQUENCY)] * audio_count - - -def dummy_image_for_phi4mm(width: int, height: int): - image = Image.new('RGB', (width, height), color='black') - return image - - -def dummy_data_for_phi4mm(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]) -> DummyData: - """ - Create dummy sequence (input_ids) and audio data for the Phi4MM model, - which is used for profiling. - - In this case, the sequence data is a bunch of 0s with a number of audio - tokens that correspond to the audio embed size of the - _AUDIO_MAX_SOUNDFILE_SIZE. - - Args: - ctx (InputContext): Input context. - seq_len (int): Length of the sequence. - mm_counts (Mapping[str, int]): Multi-modal counts. - - Returns: - Tuple: Dummy sequence data and dummy audio data. - """ - audio_count = mm_counts["audio"] - audio_frames, _ = compute_logfbank_output_size(_AUDIO_MAX_SOUNDFILE_SIZE, - DUMMY_SAMPLING_FREQUENCY) - audio_feature_size = _compute_audio_embed_size(ctx.get_hf_config(), - audio_frames) - - image_count = mm_counts["image"] - dummy_image = get_max_dummy_image(ctx) - max_image_tokens = get_max_phi4mm_image_tokens(ctx) - total_image_tokens = image_count * max_image_tokens - - if seq_len - audio_feature_size * audio_count - total_image_tokens < 0: - raise RuntimeError( - f"Phi4MM cannot process {audio_count} audios and {image_count}" - f"images in a prompt, please increase max_model_len to be at" - f" larger than " - f"{audio_feature_size * audio_count + total_image_tokens}" - " or reduce audio/image limit by --limit-mm-per-prompt.") - - if audio_feature_size * audio_count > total_image_tokens: - seq_data = SequenceData.from_prompt_token_counts( - (_AUDIO_PLACEHOLDER_TOKEN_ID, audio_feature_size * audio_count), - (0, seq_len - audio_feature_size * audio_count), - ) - mm_data = { - "audio": dummy_audio_for_phi4mm(audio_count), - } - else: - seq_data = SequenceData.from_prompt_token_counts( - (_IMAGE_PLACEHOLDER_TOKEN_ID, total_image_tokens), - (0, seq_len - total_image_tokens), - ) - mm_data = { - "image": [dummy_image] * image_count, - } - return DummyData(seq_data, mm_data) - - -def input_mapper_for_phi4mm_audio(ctx: InputContext, - data: object) -> MultiModalInputs: - """ - This function is used to create the MultiModalInputs for the Phi4MM - (audio) model. - Specifically, for audio, we extract the audio features from the sound - file and create pairs of audio features and audio embed lengths (the - latter of which is used to repeat the audio placeholder token in the - input prompt IDs). - These pairs are used, downstream, in `_audio_features_to_embeddings` - (via `_process_audio_input`). - - Note that the incoming audio data (each entry in `data`) is a tuple of - the audio data and the sampling frequency (e.g. from soundfile.read). - - Args: - ctx (InputContext): Input context. - data (object): Audio data. - - Returns: - MultiModalInputs: Multi-modal inputs. 
- """ - if not isinstance(data, list): - data = [data] - - if len(data) == 0: - return MultiModalInputs() - - audio_features = [] - for audio_input in data: - if not isinstance(audio_input, tuple): - raise NotImplementedError( - f"Unsupported data type: {type(audio_input)}") - - audio, sf = audio_input - feature_extractor = audio_feature_extractor() - single_audio_features = feature_extractor.extract_features(audio, sf) - feat_stride = (1 if not hasattr(feature_extractor, "stride") else - feature_extractor.stride) - audio_frames = len(single_audio_features) * feat_stride - single_audio_embed_size = _compute_audio_embed_size( - ctx.get_hf_config(), audio_frames) - single_audio_feature_audio_len_pair = ( - single_audio_features, - [single_audio_embed_size], - ) - audio_features.append(single_audio_feature_audio_len_pair) - return MultiModalInputs({"audio_features": audio_features}) - - -def input_mapper_for_phi4mm_image(ctx: InputContext, data: object): - if not isinstance(data, list): - data = [data] - # data: list of PIL images - if len(data) == 0: - return MultiModalInputs() - hf_config = ctx.get_hf_config() - vision_encoder_name = hf_config.img_processor - if vision_encoder_name is None: - vision_encoder_name = SIGLIP_NAME - prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] - dynamic_hd_size = prepro_config['dynamic_hd'] - vit_image_size = prepro_config['vit_image_size'] - vit_patch_size = prepro_config['vit_patch_size'] - - image_input_dict = preprocess(data, dynamic_hd_size, vit_image_size, - vit_patch_size) - return MultiModalInputs({ - "pixel_values": - image_input_dict["pixel_values"], - "image_sizes": - image_input_dict["image_sizes"], - "image_attention_mask": - image_input_dict["image_attention_mask"], - "num_img_tokens": - image_input_dict["num_img_tokens"], - }) - - def cat_with_pad(tensors, dim, padding_value=0): """ cat along dim, while pad to max for all other dims From 76f8b8e94b2458978110fc743ca2d8bfac11a967 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 2 Apr 2025 01:48:28 +0800 Subject: [PATCH 18/36] clean up Signed-off-by: Isotr0py <2037008807@qq.com> --- docs/source/models/supported_models.md | 2 +- vllm/model_executor/models/phi4mm.py | 99 +++++++++----------------- 2 files changed, 35 insertions(+), 66 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 1b742717885e..78c45db1025b 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -920,7 +920,7 @@ See [this page](#generative-models) for more information on how to use generativ * `microsoft/Phi-4-multimodal-instruct`, etc. * ✅︎ * - * + * ✅︎ - * `PixtralForConditionalGeneration` * Pixtral * T + I+ diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index a4b95c743782..8b145cb68cde 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -53,11 +53,6 @@ _AUDIO_PLACEHOLDER_TOKEN_ID = 200011 _AUDIO_MAX_SOUNDFILE_SIZE = 241_000 -DUMMY_SAMPLING_FREQUENCY = 16_000 # kHz - -DYNAMIC_HD = 16 -AUDIO_TOKEN_PATTERN = r"<\|audio_(\d+)\|>" -IMAGE_TOKEN_PATTERN = r"<\|image_(\d+)\|>" SIGLIP_NAME = "siglip-so400m-patch14-448" VISION_ENCODER_TO_PROCESSING_CONFIG = { @@ -68,15 +63,6 @@ 'token_compression_factor': 2, }, } -logger = logging.get_logger(__name__) -# This is a workaround to prevent text (user input) + audio + image -# from being used in the same prompt. 
-# It includes token ids for "/n" and tokens in added_tokens_decoder -# from the tokenizer_confg.json file. -NON_USER_INPUT_TOKENS = { - 198, 200010, 200011, 199999, 200018, 200019, 200020, 200021, 200022, - 200023, 200024, 200025, 200026, 200027, 200028 -} def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, @@ -116,7 +102,6 @@ def _find_target_aspect_ratio(orig_width: int, orig_height: int, image_size: int # calculate the target width and height target_width = image_size * target_aspect_ratio[0] target_height = image_size * target_aspect_ratio[1] - logger.debug("target_aspect_ratio: %s", target_aspect_ratio) else: target_width = image_size * w_crop_num target_height = image_size * h_crop_num @@ -588,45 +573,6 @@ def _compute_num_image_tokens( num_hd_newline_tokens + num_global_image_newline_tokens) -def compute_logfbank_output_size(wav_length: int, fs: int) -> Tuple[int, int]: - """ - Compute the output size of the `extract_features` method. - - Args: - wav_length (int): Length of the input waveform in samples. - fs (int): Sampling rate of the waveform, either 16000 or 8000. - - Returns: - tuple (int, int): Output size as (T, D), where: - T: Number of time frames. - D: Number of Mel filterbank bins (80). - """ - - # Resample to 16000 or 8000 if needed - if fs > 16000: - wav_length //= fs // 16000 - fs = 16000 - elif 8000 <= fs < 16000: - # We'll resample to 16K from 8K - wav_length *= 2 - fs = 16000 - elif fs < 8000: - raise RuntimeError(f"Unsupported sample rate {fs}") - - # Spectrogram parameters for 16 kHz - win_length = 400 # Frame length in samples - hop_length = 160 # Frame shift in samples - mel_bins = 80 # Number of mel filterbank bins - - # Calculate number of frames (T) - T = (wav_length - win_length) // hop_length + 1 - if T < 1: - raise ValueError("Waveform too short for given parameters.") - - # Return time frames (T) and mel bins (D) - return T, mel_bins - - def _compute_audio_embed_size(hf_config: PretrainedConfig, audio_frames: int) -> int: """ Compute the audio embedding size based on the audio frames and @@ -755,18 +701,40 @@ def get_image_size_with_most_features(self) -> ImageSize: max_side = vit_image_size * self.dynamic_hd return ImageSize(height=max_side, width=vit_image_size) - def get_audio_feature_nums(self, audio_len: int, sr: float): - if sr >= 16000: - win_length = 400 - hop_length = 160 + def get_audio_feature_nums(self, audio_len: int, sr: float) -> int: + """ + Compute the output size of the `extract_features` method. + + Args: + audio_len (int): Length of the input waveform in samples. + sr (float): Sampling rate of the waveform, either 16000 or 8000. + + Returns: + tuple (int, int): Output size as (T, D), where: + T: Number of time frames. + D: Number of Mel filterbank bins (80). 
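+
+            Note: only T (the number of time frames) is returned, as an
+            int; the mel filterbank dimension D is fixed at 80.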
+ """ + + # Resample to 16000 or 8000 if needed + if sr > 16000: + audio_len //= sr // 16000 elif 8000 <= sr < 16000: - win_length = 200 - hop_length = 80 - else: - raise RuntimeError(f"Input data using an unsupported sample rate: {sr}") + # We'll resample to 16K from 8K + audio_len *= 2 + elif sr < 8000: + raise RuntimeError(f"Unsupported sample rate {sr}") + + # Spectrogram parameters for 16 kHz + win_length = 400 # Frame length in samples + hop_length = 160 # Frame shift in samples - # Spec 1: SpeechLib cut remaining sample insufficient for a hop - return (audio_len - win_length) // hop_length + 1 + # Calculate number of frames (T) + T = (audio_len - win_length) // hop_length + 1 + if T < 1: + raise ValueError("Waveform too short for given parameters.") + + # Return time frames (T) + return T class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): @@ -870,6 +838,7 @@ def _get_prompt_updates( ) -> Sequence[PromptUpdate]: image_tokens: list[str] = self.info.image_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore + feature_extractor = self.info.get_feature_extractor() def get_image_replacement_phi4mm(item_idx: int): images = mm_items.get_items( @@ -892,7 +861,7 @@ def get_audio_replacement_phi4mm(item_idx: int): audios = mm_items.get_items("audio", AudioProcessorItems) # TODO(Isotr0py): support embedding inputs audio_len = audios.get_audio_length(item_idx) - audio_frames, _ = compute_logfbank_output_size(audio_len, DUMMY_SAMPLING_FREQUENCY) + audio_frames = self.info.get_audio_feature_nums(audio_len, feature_extractor.sampling_rate) audio_embed_size = _compute_audio_embed_size(self.info.get_hf_config(), audio_frames) audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size From fb6b659d9a5d5b74c7765fb71f89f8510d2fb18f Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 3 Apr 2025 00:17:26 +0800 Subject: [PATCH 19/36] clean up Signed-off-by: Isotr0py <2037008807@qq.com> --- examples/offline_inference/vision_language.py | 2 +- .../vision_language_multi_image.py | 2 +- vllm/model_executor/models/phi4mm.py | 310 ++++++++---------- 3 files changed, 141 insertions(+), 173 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index c1115708505a..754e63a68428 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -724,7 +724,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=4096, + max_model_len=12800, max_num_seqs=2, enable_lora=True, max_lora_rank=320, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 2fb85c597974..87a988ee345b 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -392,7 +392,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=12800, + max_model_len=25600, max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, enable_lora=True, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 8b145cb68cde..194e82448746 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -65,50 +65,6 @@ } -def 
find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, - image_size): - best_ratio_diff = float('inf') - best_ratio = (1, 1) - area = width * height - for ratio in target_ratios: - target_aspect_ratio = ratio[0] / ratio[1] - ratio_diff = abs(aspect_ratio - target_aspect_ratio) - if ratio_diff < best_ratio_diff: - best_ratio_diff = ratio_diff - best_ratio = ratio - elif ratio_diff == best_ratio_diff: - if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: - best_ratio = ratio - return best_ratio - - -def _find_target_aspect_ratio(orig_width: int, orig_height: int, image_size: int, max_num: int, min_num: int,): - - w_crop_num = math.ceil(orig_width / float(image_size)) - h_crop_num = math.ceil(orig_height / float(image_size)) - if w_crop_num * h_crop_num > max_num: - aspect_ratio = orig_width / orig_height - - # calculate the existing image aspect ratio - target_ratios = set((i, j) for i in range(1, max_num + 1) - for j in range(1, max_num + 1) - if i * j <= max_num and i * j >= min_num) - target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) - - # find the closest aspect ratio to the target - target_aspect_ratio = find_closest_aspect_ratio( - aspect_ratio, target_ratios, orig_width, orig_height, image_size) - - # calculate the target width and height - target_width = image_size * target_aspect_ratio[0] - target_height = image_size * target_aspect_ratio[1] - else: - target_width = image_size * w_crop_num - target_height = image_size * h_crop_num - target_aspect_ratio = (w_crop_num, h_crop_num) - return target_aspect_ratio, target_height, target_width - - def _get_padding_size(orig_width: int, orig_height: int, target_height: int, target_width: int): ratio_width = target_width / orig_width ratio_height = target_height / orig_height @@ -494,110 +450,6 @@ class Phi4MMAudioEmbeddingInputs(TypedDict): Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs] -def _compute_num_image_tokens( - orig_width: int, - orig_height: int, - dynamic_hd_size: int, - vit_image_size: int, - vit_patch_size: int, - token_compression_factor: int = 2, -): - """ - compute the number of tokens an image is expected to take up considering - the image encoder architecture and exclude output features containing - only padding pixels - - for siglip, vit_image_size=448, vit_patch_size=14, so output will be - 32x32 feature map - NOTE right now, Phi4MM uses hard-coded token_compression_factor=2 - """ - assert vit_image_size % vit_patch_size == 0, \ - "vit_image_size must be divisible by vit_patch_size" - assert vit_image_size // vit_patch_size % token_compression_factor == 0, \ - "vit_image_size // vit_patch_size must be divisible by "\ - "token_compression_factor" - - target_aspect_ratio, target_height, target_width = ( - _find_target_aspect_ratio(orig_width, - orig_height, - vit_image_size, - dynamic_hd_size, - min_num=1)) - assert target_aspect_ratio[ - 0] * vit_image_size == target_width, \ - f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}" - assert target_aspect_ratio[ - 1] * vit_image_size == target_height, \ - f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}" - assert (target_height % vit_image_size == 0 - and target_width % vit_image_size == 0) - - padding_height, padding_width = _get_padding_size(orig_width, orig_height, target_height, - target_width) - assert padding_width == 0 or padding_height == 0, \ - "padding_width or padding_height must be 0" - - target_feat_width = target_width // vit_patch_size - target_feat_height = 
target_height // vit_patch_size - if padding_width >= vit_patch_size: - assert padding_height == 0, "padding_height not 0" - non_pad_feat_width = target_feat_width - math.floor( - padding_width / vit_patch_size) - non_pad_feat_height = target_feat_height - elif padding_height >= vit_patch_size: - assert padding_width == 0, "padding_width not 0" - non_pad_feat_height = target_feat_height - math.floor( - padding_height / vit_patch_size) - non_pad_feat_width = target_feat_width - else: - # small padding shorter than a vit patch - non_pad_feat_width = target_feat_width - non_pad_feat_height = target_feat_height - - feat_width = non_pad_feat_width // token_compression_factor - feat_height = non_pad_feat_height // token_compression_factor - # NOTE it's possible that the non-padding feature is not divisible - if non_pad_feat_width % token_compression_factor != 0: - feat_width += 1 - if non_pad_feat_height % token_compression_factor != 0: - feat_height += 1 - num_hd_patch_tokens = feat_width * feat_height - num_hd_newline_tokens = feat_height - vit_feature_size = vit_image_size // vit_patch_size - num_global_image_tokens = (vit_feature_size // token_compression_factor)**2 - num_sep_tokens = 1 - num_global_image_newline_tokens = \ - vit_feature_size // token_compression_factor - - return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens + - num_hd_newline_tokens + num_global_image_newline_tokens) - - -def _compute_audio_embed_size(hf_config: PretrainedConfig, audio_frames: int) -> int: - """ - Compute the audio embedding size based on the audio frames and - compression rate. - """ - compression_rate = hf_config.embd_layer['audio_embd_layer'][ - 'compression_rate'] - # NOTE: this is a hard-coded value but might be configurable in the future - qformer_compression_rate = 1 - integer = audio_frames // compression_rate - remainder = audio_frames % compression_rate - - result = integer if remainder == 0 else integer + 1 - - integer = result // qformer_compression_rate - remainder = result % qformer_compression_rate - result = integer if remainder == 0 else integer + 1 # qformer compression - - return result - - -def get_max_phi4mm_audio_tokens(ctx: InputContext) -> int: - return 10000 - - def cat_with_pad(tensors, dim, padding_value=0): """ cat along dim, while pad to max for all other dims @@ -656,12 +508,119 @@ def get_mm_max_tokens_per_item( } def get_max_audio_tokens(self) -> int: - return 188 + sr = self.get_feature_extractor().sampling_rate + num_frames = self.get_audio_num_frames(_AUDIO_MAX_SOUNDFILE_SIZE, sr) + return self._compute_audio_embed_size(num_frames) def get_max_image_tokens(self) -> int: target_width, target_height = self.get_image_size_with_most_features() return self.get_num_image_tokens( image_width=target_width, image_height=target_height) + + def _find_target_aspect_ratio(self, orig_width: int, orig_height: int, image_size: int, max_num: int, min_num: int,): + w_crop_num = math.ceil(orig_width / float(image_size)) + h_crop_num = math.ceil(orig_height / float(image_size)) + if w_crop_num * h_crop_num > max_num: + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set((i, j) for i in range(1, max_num + 1) + for j in range(1, max_num + 1) + if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + image_processor = self.get_hf_processor().image_processor + target_aspect_ratio = 
image_processor.find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size,) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + else: + target_width = image_size * w_crop_num + target_height = image_size * h_crop_num + target_aspect_ratio = (w_crop_num, h_crop_num) + return target_aspect_ratio, target_height, target_width + + def _compute_num_image_tokens( + self, + orig_width: int, + orig_height: int, + dynamic_hd_size: int, + vit_image_size: int, + vit_patch_size: int, + token_compression_factor: int = 2, + ): + """ + compute the number of tokens an image is expected to take up considering + the image encoder architecture and exclude output features containing + only padding pixels + + for siglip, vit_image_size=448, vit_patch_size=14, so output will be + 32x32 feature map + NOTE right now, Phi4MM uses hard-coded token_compression_factor=2 + """ + assert vit_image_size % vit_patch_size == 0, \ + "vit_image_size must be divisible by vit_patch_size" + assert vit_image_size // vit_patch_size % token_compression_factor == 0, \ + "vit_image_size // vit_patch_size must be divisible by "\ + "token_compression_factor" + + target_aspect_ratio, target_height, target_width = ( + self._find_target_aspect_ratio(orig_width, + orig_height, + vit_image_size, + dynamic_hd_size, + min_num=1)) + assert target_aspect_ratio[ + 0] * vit_image_size == target_width, \ + f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}" + assert target_aspect_ratio[ + 1] * vit_image_size == target_height, \ + f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}" + assert (target_height % vit_image_size == 0 + and target_width % vit_image_size == 0) + + padding_height, padding_width = _get_padding_size(orig_width, orig_height, target_height, + target_width) + assert padding_width == 0 or padding_height == 0, \ + "padding_width or padding_height must be 0" + + target_feat_width = target_width // vit_patch_size + target_feat_height = target_height // vit_patch_size + if padding_width >= vit_patch_size: + assert padding_height == 0, "padding_height not 0" + non_pad_feat_width = target_feat_width - math.floor( + padding_width / vit_patch_size) + non_pad_feat_height = target_feat_height + elif padding_height >= vit_patch_size: + assert padding_width == 0, "padding_width not 0" + non_pad_feat_height = target_feat_height - math.floor( + padding_height / vit_patch_size) + non_pad_feat_width = target_feat_width + else: + # small padding shorter than a vit patch + non_pad_feat_width = target_feat_width + non_pad_feat_height = target_feat_height + + feat_width = non_pad_feat_width // token_compression_factor + feat_height = non_pad_feat_height // token_compression_factor + # NOTE it's possible that the non-padding feature is not divisible + if non_pad_feat_width % token_compression_factor != 0: + feat_width += 1 + if non_pad_feat_height % token_compression_factor != 0: + feat_height += 1 + num_hd_patch_tokens = feat_width * feat_height + num_hd_newline_tokens = feat_height + vit_feature_size = vit_image_size // vit_patch_size + num_global_image_tokens = (vit_feature_size // token_compression_factor)**2 + num_sep_tokens = 1 + num_global_image_newline_tokens = \ + vit_feature_size // token_compression_factor + + return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens + + num_hd_newline_tokens + num_global_image_newline_tokens) def get_num_image_tokens( self, @@ 
-680,7 +639,7 @@ def get_num_image_tokens( dynamic_hd_size = self.dynamic_hd - image_num_tokens = _compute_num_image_tokens( + image_num_tokens = self._compute_num_image_tokens( image_width, image_height, dynamic_hd_size=dynamic_hd_size, vit_image_size=vit_image_size, @@ -701,7 +660,7 @@ def get_image_size_with_most_features(self) -> ImageSize: max_side = vit_image_size * self.dynamic_hd return ImageSize(height=max_side, width=vit_image_size) - def get_audio_feature_nums(self, audio_len: int, sr: float) -> int: + def get_audio_num_frames(self, audio_len: int, sr: float) -> int: """ Compute the output size of the `extract_features` method. @@ -729,12 +688,34 @@ def get_audio_feature_nums(self, audio_len: int, sr: float) -> int: hop_length = 160 # Frame shift in samples # Calculate number of frames (T) - T = (audio_len - win_length) // hop_length + 1 - if T < 1: + num_frames = (audio_len - win_length) // hop_length + 1 + if num_frames < 1: raise ValueError("Waveform too short for given parameters.") # Return time frames (T) - return T + return num_frames + + def _compute_audio_embed_size(self, audio_frames: int) -> int: + """ + Compute the audio embedding size based on the audio frames and + compression rate. + """ + hf_config = self.get_hf_config() + compression_rate = hf_config.embd_layer['audio_embd_layer'][ + 'compression_rate'] + # NOTE: this is a hard-coded value but might be configurable + # in the future + qformer_compression_rate = 1 + integer = audio_frames // compression_rate + remainder = audio_frames % compression_rate + + result = integer if remainder == 0 else integer + 1 + + integer = result // qformer_compression_rate + remainder = result % qformer_compression_rate + result = integer if remainder == 0 else integer + 1 # qformer compression + + return result class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): @@ -806,7 +787,7 @@ def _call_hf_processor( processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds') if "audios" in mm_data: audio_features = processed_outputs['input_audio_embeds'] - feature_sizes = [self.info.get_audio_feature_nums(len(audio), sr) for audio, sr in mm_data['audios']] + feature_sizes = [self.info.get_audio_num_frames(len(audio), sr) for audio, sr in mm_data['audios']] processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)] else: tokenizer = self.info.get_tokenizer() @@ -861,8 +842,8 @@ def get_audio_replacement_phi4mm(item_idx: int): audios = mm_items.get_items("audio", AudioProcessorItems) # TODO(Isotr0py): support embedding inputs audio_len = audios.get_audio_length(item_idx) - audio_frames = self.info.get_audio_feature_nums(audio_len, feature_extractor.sampling_rate) - audio_embed_size = _compute_audio_embed_size(self.info.get_hf_config(), audio_frames) + audio_frames = self.info.get_audio_num_frames(audio_len, feature_extractor.sampling_rate) + audio_embed_size = self.info._compute_audio_embed_size(audio_frames) audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size @@ -888,16 +869,6 @@ def get_audio_replacement_phi4mm(item_idx: int): return image_repl + audio_repl -# @MULTIMODAL_REGISTRY.register_input_mapper("audio", -# input_mapper_for_phi4mm_audio) -# @MULTIMODAL_REGISTRY.register_input_mapper("image", -# input_mapper_for_phi4mm_image) -# @MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -# "audio", get_max_phi4mm_audio_tokens) -# @MULTIMODAL_REGISTRY.register_max_multimodal_tokens( -# "image", get_max_phi4mm_image_tokens) -# 
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm) -# @INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm) @MULTIMODAL_REGISTRY.register_processor( Phi4MMMultiModalProcessor, info=Phi4MMProcessingInfo, @@ -1055,10 +1026,7 @@ def _parse_and_validate_audio_input( if isinstance(audio_features, torch.Tensor): assert audio_features.size(0) == len(audio_embed_sizes), ( "audio_features and audio_embed_sizes must have the same length") - elif is_list_of(audio_features, torch.Tensor): - assert len(audio_features) == len(audio_embed_sizes), ( - "audio_features and audio_embed_sizes must have the same length") - elif is_list_of(audio_features, list): + elif is_list_of(audio_features, (torch.Tensor, list)): assert len(audio_features) == len(audio_embed_sizes), ( "audio_features and audio_embed_sizes must have the same length") else: From 9fab0e45b978a6e88a1705ad803ad7f244ec446e Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 3 Apr 2025 00:58:57 +0800 Subject: [PATCH 20/36] minor refactor Signed-off-by: Isotr0py <2037008807@qq.com> --- examples/offline_inference/audio_language.py | 2 +- vllm/model_executor/models/phi4mm.py | 67 ++++++++++---------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 840892ea0701..fff06e466359 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -89,7 +89,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=4096, + max_model_len=12800, max_num_seqs=2, enable_lora=True, max_lora_rank=320, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 194e82448746..eac8c3fb57f6 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -774,26 +774,27 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: - if mm_data: - if "audios" in mm_data: - sr = self.info.get_feature_extractor().sampling_rate - mm_data['audios'] = [(data, sr) for data in mm_data['audios']] - processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs) - num_img_tokens = [ - self.info.get_num_image_tokens(image_width=img_size[0], image_height=img_size[1]) - for img_size in processed_outputs["image_sizes"] - ] - processed_outputs["num_img_tokens"] = num_img_tokens - processed_outputs["pixel_values"] = processed_outputs.pop('input_image_embeds') - if "audios" in mm_data: - audio_features = processed_outputs['input_audio_embeds'] - feature_sizes = [self.info.get_audio_num_frames(len(audio), sr) for audio, sr in mm_data['audios']] - processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)] - else: - tokenizer = self.info.get_tokenizer() - processed_outputs = tokenizer(prompt, - add_special_tokens=True, - return_tensors="pt") + if not mm_data: + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + sr = self.info.get_feature_extractor().sampling_rate + if (audio_data := mm_data.get("audios", [])): + mm_data['audios'] = [(data, sr) for data in audio_data] + + processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs) + + num_img_tokens = [ + 
self.info.get_num_image_tokens(image_width=img_size[0], image_height=img_size[1]) + for img_size in processed_outputs["image_sizes"] + ] + processed_outputs["num_img_tokens"] = num_img_tokens + + audio_features = processed_outputs['input_audio_embeds'] + feature_sizes = [self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data] + processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)] + return processed_outputs def _get_mm_fields_config( @@ -802,7 +803,7 @@ def _get_mm_fields_config( hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: return dict( - pixel_values=MultiModalFieldConfig.batched("image"), + input_image_embeds=MultiModalFieldConfig.batched("image"), image_attention_mask=MultiModalFieldConfig.batched("image"), image_sizes=MultiModalFieldConfig.batched("image"), num_img_tokens=MultiModalFieldConfig.batched("image"), @@ -1076,8 +1077,8 @@ def _process_audio_input(self, def _parse_and_validate_image_input(self, **kwargs: object) -> Optional[Dict]: - pixel_values: NestedTensors = kwargs.get("pixel_values") - if pixel_values is None: + input_image_embeds: NestedTensors = kwargs.get("input_image_embeds") + if input_image_embeds is None: return None image_sizes = kwargs.get("image_sizes") @@ -1086,23 +1087,23 @@ def _parse_and_validate_image_input(self, assert image_sizes is not None and image_attention_mask is not None\ and num_img_tokens is not None, "Missing image inputs" - if is_list_of(pixel_values, torch.Tensor): - assert all(p.dim() == 5 for p in pixel_values), "Incorrect image inputs" + if is_list_of(input_image_embeds, torch.Tensor): + assert all(p.dim() == 5 for p in input_image_embeds), "Incorrect image inputs" # list len is batch_size. # each tensor has dimension: num_img_per_example, num_hd_patches, # channels, height, width. # need to pad along num_hd_patches. # mask size num_img_per_prompt, num_hd_patches, feat_h, heat_w. - pixel_values = cat_with_pad(pixel_values, dim=0) - elif isinstance(pixel_values, torch.Tensor): + input_image_embeds = cat_with_pad(input_image_embeds, dim=0) + elif isinstance(input_image_embeds, torch.Tensor): # dimension: batch_size, num_img_per_example, num_hd_patches, # channels, height, width. # we flatten first 2 dims to make it a single large batch for # SigLIP Encoder. - assert pixel_values.dim() == 6, "Incorrect image inputs" - pixel_values = pixel_values.flatten(0, 1) + assert input_image_embeds.dim() == 6, "Incorrect image inputs" + input_image_embeds = input_image_embeds.flatten(0, 1) else: - raise ValueError("Incorrect pixel_values inputs") + raise ValueError("Incorrect input_image_embeds inputs") if isinstance(image_attention_mask, list): image_attention_mask = cat_with_pad(image_attention_mask, dim=0) @@ -1129,8 +1130,8 @@ def _parse_and_validate_image_input(self, raise ValueError("Incorrect image_attention_mask inputs") return Phi4MMImagePixelInputs( - type="pixel_values_videos", - data=pixel_values, + type="pixel_values", + data=input_image_embeds, image_sizes=image_sizes, image_attention_mask=image_attention_mask, num_img_tokens=num_img_tokens, @@ -1142,7 +1143,7 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: # Preserve the order of modalities if there are multiple of them # from the order of kwargs. 
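# NOTE: the branch below keys off the processor output field name, which this
# patch renames from "pixel_values" to "input_image_embeds" (see
# _get_mm_fields_config above); precomputed "image_embeds" inputs are still
# routed to the image branch.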
for input_key in kwargs: - if input_key in ("pixel_values", + if input_key in ("input_image_embeds", "image_embeds") and "images" not in modalities: modalities["images"] = self._parse_and_validate_image_input( **kwargs) From 660cfd79982bd8c93700581754ba13d4e2c9bc4a Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 3 Apr 2025 01:25:34 +0800 Subject: [PATCH 21/36] minor fix Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index eac8c3fb57f6..eccc456d9bae 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -395,7 +395,7 @@ def forward(self, pixel_values: torch.FloatTensor, for _output_img in output_imgs: img_feature_proj = self.img_projection( _output_img.to(target_device).to(target_dtype)) - img_set_tensor.append(img_feature_proj) + img_set_tensor.append(img_feature_proj.squeeze(0)) return img_set_tensor From 341a8f99b5ac48eac85b3f3efe4443385a766073 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 3 Apr 2025 01:38:30 +0800 Subject: [PATCH 22/36] code format Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 208 +++++++++++++++------------ 1 file changed, 120 insertions(+), 88 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index eccc456d9bae..058255c583a3 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1,29 +1,21 @@ # SPDX-License-Identifier: Apache-2.0 import math -import re from collections.abc import Iterable, Mapping, Sequence -from functools import lru_cache -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, - TypedDict, Union) +from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union import numpy as np import numpy.typing as npt import scipy.signal import torch import torch.nn as nn -import torchvision.transforms as T -from PIL import Image -from transformers import PretrainedConfig, SiglipVisionConfig, ProcessorMixin, BatchFeature, SequenceFeatureExtractor -from transformers.utils import logging +from transformers import (BatchFeature, PretrainedConfig, + SequenceFeatureExtractor, SiglipVisionConfig) from vllm.config import VllmConfig from vllm.distributed import get_pp_group -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext) -from vllm.inputs.data import TokenInputs, token_inputs from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput, get_sampler +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.models.llama import LlamaModel @@ -31,21 +23,22 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors, MultiModalInputs) -from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, MultiModalDataParser, - ImageSize, MultiModalDataItems, AudioEmbeddingItems, AudioProcessorItems) + NestedTensors) +from vllm.multimodal.parse 
import (AudioProcessorItems, ImageEmbeddingItems, + ImageProcessorItems, ImageSize, + MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails) + PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs -from vllm.sequence import IntermediateTensors, SequenceData -from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config +from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .idefics2_vision_model import Idefics2VisionTransformer -from .interfaces import SupportsLoRA, SupportsMultiModal, MultiModalEmbeddings +from .interfaces import MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal from .phi4mm_audio import AudioEmbedding -from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix, merge_multimodal_embeddings, flatten_bn +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix, + merge_multimodal_embeddings) # <|endoftext10|> (see vocab.json in hf model) _IMAGE_PLACEHOLDER_TOKEN_ID = 200010 @@ -65,7 +58,8 @@ } -def _get_padding_size(orig_width: int, orig_height: int, target_height: int, target_width: int): +def _get_padding_size(orig_width: int, orig_height: int, target_height: int, + target_width: int): ratio_width = target_width / orig_width ratio_height = target_height / orig_height @@ -485,12 +479,12 @@ def image_tokens(self) -> list[str]: @property def audio_tokens(self) -> list[str]: return [f"<|audio_{i+1}|>" for i in range(100)] - + @property def dynamic_hd(self) -> int: image_processor = self.get_hf_processor().image_processor return image_processor.dynamic_hd - + def get_feature_extractor(self) -> SequenceFeatureExtractor: return self.get_hf_processor().audio_processor @@ -511,13 +505,20 @@ def get_max_audio_tokens(self) -> int: sr = self.get_feature_extractor().sampling_rate num_frames = self.get_audio_num_frames(_AUDIO_MAX_SOUNDFILE_SIZE, sr) return self._compute_audio_embed_size(num_frames) - + def get_max_image_tokens(self) -> int: target_width, target_height = self.get_image_size_with_most_features() - return self.get_num_image_tokens( - image_width=target_width, image_height=target_height) - - def _find_target_aspect_ratio(self, orig_width: int, orig_height: int, image_size: int, max_num: int, min_num: int,): + return self.get_num_image_tokens(image_width=target_width, + image_height=target_height) + + def _find_target_aspect_ratio( + self, + orig_width: int, + orig_height: int, + image_size: int, + max_num: int, + min_num: int, + ): w_crop_num = math.ceil(orig_width / float(image_size)) h_crop_num = math.ceil(orig_height / float(image_size)) if w_crop_num * h_crop_num > max_num: @@ -532,7 +533,12 @@ def _find_target_aspect_ratio(self, orig_width: int, orig_height: int, image_siz # find the closest aspect ratio to the target image_processor = self.get_hf_processor().image_processor target_aspect_ratio = image_processor.find_closest_aspect_ratio( - aspect_ratio, target_ratios, orig_width, orig_height, image_size,) + aspect_ratio, + target_ratios, + orig_width, + orig_height, + image_size, + ) # calculate the target width and height target_width = image_size * target_aspect_ratio[0] @@ -542,7 +548,7 @@ def _find_target_aspect_ratio(self, orig_width: int, orig_height: int, image_siz target_height = image_size * h_crop_num target_aspect_ratio = (w_crop_num, h_crop_num) return target_aspect_ratio, target_height, target_width - + def 
_compute_num_image_tokens( self, orig_width: int, @@ -553,7 +559,7 @@ def _compute_num_image_tokens( token_compression_factor: int = 2, ): """ - compute the number of tokens an image is expected to take up considering + compute the number of tokens an image is expected to take up considering the image encoder architecture and exclude output features containing only padding pixels @@ -561,29 +567,28 @@ def _compute_num_image_tokens( 32x32 feature map NOTE right now, Phi4MM uses hard-coded token_compression_factor=2 """ - assert vit_image_size % vit_patch_size == 0, \ - "vit_image_size must be divisible by vit_patch_size" - assert vit_image_size // vit_patch_size % token_compression_factor == 0, \ - "vit_image_size // vit_patch_size must be divisible by "\ - "token_compression_factor" + assert vit_image_size % vit_patch_size == 0, ( + "vit_image_size must be divisible by vit_patch_size") + assert (vit_image_size // vit_patch_size % + token_compression_factor == 0), ( + "vit_image_size // vit_patch_size must be divisible by " + "token_compression_factor") target_aspect_ratio, target_height, target_width = ( self._find_target_aspect_ratio(orig_width, - orig_height, - vit_image_size, - dynamic_hd_size, - min_num=1)) - assert target_aspect_ratio[ - 0] * vit_image_size == target_width, \ - f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}" - assert target_aspect_ratio[ - 1] * vit_image_size == target_height, \ - f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}" + orig_height, + vit_image_size, + dynamic_hd_size, + min_num=1)) + assert target_aspect_ratio[0] * vit_image_size == target_width, ( + f"{target_aspect_ratio[0]} * {vit_image_size} != {target_width}") + assert target_aspect_ratio[1] * vit_image_size == target_height, ( + f"{target_aspect_ratio[1]} * {vit_image_size} != {target_height}") assert (target_height % vit_image_size == 0 and target_width % vit_image_size == 0) - padding_height, padding_width = _get_padding_size(orig_width, orig_height, target_height, - target_width) + padding_height, padding_width = _get_padding_size( + orig_width, orig_height, target_height, target_width) assert padding_width == 0 or padding_height == 0, \ "padding_width or padding_height must be 0" @@ -614,13 +619,15 @@ def _compute_num_image_tokens( num_hd_patch_tokens = feat_width * feat_height num_hd_newline_tokens = feat_height vit_feature_size = vit_image_size // vit_patch_size - num_global_image_tokens = (vit_feature_size // token_compression_factor)**2 + num_global_image_tokens = (vit_feature_size // + token_compression_factor)**2 num_sep_tokens = 1 num_global_image_newline_tokens = \ vit_feature_size // token_compression_factor - return (num_global_image_tokens + num_sep_tokens + num_hd_patch_tokens + - num_hd_newline_tokens + num_global_image_newline_tokens) + return (num_global_image_tokens + num_sep_tokens + + num_hd_patch_tokens + num_hd_newline_tokens + + num_global_image_newline_tokens) def get_num_image_tokens( self, @@ -632,7 +639,8 @@ def get_num_image_tokens( vision_encoder_name = hf_config.img_processor if vision_encoder_name is None: vision_encoder_name = SIGLIP_NAME - prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] + prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[ + vision_encoder_name] vit_image_size = prepro_config['vit_image_size'] vit_patch_size = prepro_config['vit_patch_size'] token_compression_factor = prepro_config['token_compression_factor'] @@ -640,7 +648,8 @@ def get_num_image_tokens( dynamic_hd_size = self.dynamic_hd 
image_num_tokens = self._compute_num_image_tokens( - image_width, image_height, + image_width, + image_height, dynamic_hd_size=dynamic_hd_size, vit_image_size=vit_image_size, vit_patch_size=vit_patch_size, @@ -654,12 +663,13 @@ def get_image_size_with_most_features(self) -> ImageSize: vision_encoder_name = hf_config.img_processor if vision_encoder_name is None: vision_encoder_name = SIGLIP_NAME - prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[vision_encoder_name] + prepro_config = VISION_ENCODER_TO_PROCESSING_CONFIG[ + vision_encoder_name] vit_image_size = prepro_config['vit_image_size'] max_side = vit_image_size * self.dynamic_hd return ImageSize(height=max_side, width=vit_image_size) - + def get_audio_num_frames(self, audio_len: int, sr: float) -> int: """ Compute the output size of the `extract_features` method. @@ -694,7 +704,7 @@ def get_audio_num_frames(self, audio_len: int, sr: float) -> int: # Return time frames (T) return num_frames - + def _compute_audio_embed_size(self, audio_frames: int) -> int: """ Compute the audio embedding size based on the audio frames and @@ -703,7 +713,7 @@ def _compute_audio_embed_size(self, audio_frames: int) -> int: hf_config = self.get_hf_config() compression_rate = hf_config.embd_layer['audio_embd_layer'][ 'compression_rate'] - # NOTE: this is a hard-coded value but might be configurable + # NOTE: this is a hard-coded value but might be configurable # in the future qformer_compression_rate = 1 integer = audio_frames // compression_rate @@ -713,7 +723,8 @@ def _compute_audio_embed_size(self, audio_frames: int) -> int: integer = result // qformer_compression_rate remainder = result % qformer_compression_rate - result = integer if remainder == 0 else integer + 1 # qformer compression + # qformer compression + result = integer if remainder == 0 else integer + 1 return result @@ -736,15 +747,16 @@ def get_dummy_processor_inputs( self._get_dummy_images(width=target_width, height=target_height, num_images=num_images), - "audio": self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, - num_audios=num_audios), + "audio": + self._get_dummy_audios(length=_AUDIO_MAX_SOUNDFILE_SIZE, + num_audios=num_audios), } image_tokens: list[str] = self.info.image_tokens[:num_images] audio_tokens: list[str] = self.info.audio_tokens[:num_audios] return ProcessorInputs( - prompt_text="".join(image_tokens+audio_tokens), + prompt_text="".join(image_tokens + audio_tokens), mm_data=mm_data, ) @@ -760,13 +772,16 @@ def scipy_resample_audio( target_sr: float, ): if orig_sr > target_sr: - return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr) + return scipy.signal.resample_poly(audio, 1, + orig_sr // target_sr) elif orig_sr < target_sr: - return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1) + return scipy.signal.resample_poly(audio, target_sr // orig_sr, + 1) return audio feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser(target_sr=feature_extractor.sampling_rate, resample_func=scipy_resample_audio) + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate, + resample_func=scipy_resample_audio) def _call_hf_processor( self, @@ -783,18 +798,26 @@ def _call_hf_processor( if (audio_data := mm_data.get("audios", [])): mm_data['audios'] = [(data, sr) for data in audio_data] - processed_outputs = super()._call_hf_processor(prompt, mm_data, mm_kwargs) + processed_outputs = super()._call_hf_processor(prompt, mm_data, + mm_kwargs) num_img_tokens = [ - self.info.get_num_image_tokens(image_width=img_size[0], 
image_height=img_size[1]) + self.info.get_num_image_tokens(image_width=img_size[0], + image_height=img_size[1]) for img_size in processed_outputs["image_sizes"] ] processed_outputs["num_img_tokens"] = num_img_tokens audio_features = processed_outputs['input_audio_embeds'] - feature_sizes = [self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data] - processed_outputs['input_audio_embeds'] = [audio_features[idx, :size] for idx, size in enumerate(feature_sizes)] - + feature_sizes = [ + self.info.get_audio_num_frames(len(audio), sr) + for audio in audio_data + ] + processed_outputs['input_audio_embeds'] = [ + audio_features[idx, :size] + for idx, size in enumerate(feature_sizes) + ] + return processed_outputs def _get_mm_fields_config( @@ -843,8 +866,10 @@ def get_audio_replacement_phi4mm(item_idx: int): audios = mm_items.get_items("audio", AudioProcessorItems) # TODO(Isotr0py): support embedding inputs audio_len = audios.get_audio_length(item_idx) - audio_frames = self.info.get_audio_num_frames(audio_len, feature_extractor.sampling_rate) - audio_embed_size = self.info._compute_audio_embed_size(audio_frames) + audio_frames = self.info.get_audio_num_frames( + audio_len, feature_extractor.sampling_rate) + audio_embed_size = self.info._compute_audio_embed_size( + audio_frames) audio_tokens = [_AUDIO_PLACEHOLDER_TOKEN_ID] * audio_embed_size @@ -1026,10 +1051,12 @@ def _parse_and_validate_audio_input( assert isinstance(audio_embed_sizes, torch.Tensor) if isinstance(audio_features, torch.Tensor): assert audio_features.size(0) == len(audio_embed_sizes), ( - "audio_features and audio_embed_sizes must have the same length") + "audio_features and audio_embed_sizes " + "must have the same length") elif is_list_of(audio_features, (torch.Tensor, list)): assert len(audio_features) == len(audio_embed_sizes), ( - "audio_features and audio_embed_sizes must have the same length") + "audio_features and audio_embed_sizes " + "must have the same length") else: raise ValueError("Incorrect type of audio features. " f"Got type: {type(audio_features)}") @@ -1047,8 +1074,7 @@ def _parse_and_validate_audio_input( raise AssertionError("This line should be unreachable.") - def _process_audio_input(self, - audio_input: Phi4MMAudioInputs, + def _process_audio_input(self, audio_input: Phi4MMAudioInputs, audio_projection_mode: str) -> NestedTensors: """ Create the audio embeddings from the audio input, where the audio input @@ -1069,10 +1095,12 @@ def _process_audio_input(self, # (e.g. multiple audios in the same example) dtype = next(self.embed_tokens_extend.parameters()).dtype - audio_embeds = [self.embed_tokens_extend.get_audio_features( - features.unsqueeze(0).to(dtype), - audio_projection_mode=audio_projection_mode, - ).squeeze(0) for features in audio_features] + audio_embeds = [ + self.embed_tokens_extend.get_audio_features( + features.unsqueeze(0).to(dtype), + audio_projection_mode=audio_projection_mode, + ).squeeze(0) for features in audio_features + ] return audio_embeds def _parse_and_validate_image_input(self, @@ -1088,7 +1116,8 @@ def _parse_and_validate_image_input(self, and num_img_tokens is not None, "Missing image inputs" if is_list_of(input_image_embeds, torch.Tensor): - assert all(p.dim() == 5 for p in input_image_embeds), "Incorrect image inputs" + assert all(p.dim() == 5 + for p in input_image_embeds), "Incorrect image inputs" # list len is batch_size. # each tensor has dimension: num_img_per_example, num_hd_patches, # channels, height, width. 
@@ -1153,8 +1182,9 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: **kwargs) return modalities - - def _process_image_input(self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]: + + def _process_image_input( + self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]: if image_input["type"] == "image_embeds": image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: @@ -1162,10 +1192,10 @@ def _process_image_input(self, image_input: Phi4MMImagePixelInputs) -> list[torc pixel_values = image_input['data'].to(dtype) image_sizes = image_input['image_sizes'] image_attention_mask = image_input['image_attention_mask'] - image_embeds = self.vision_encoder( - pixel_values, image_sizes, image_attention_mask) + image_embeds = self.vision_encoder(pixel_values, image_sizes, + image_attention_mask) return image_embeds - + def get_multimodal_embeddings( self, **kwargs: object) -> Optional[MultiModalEmbeddings]: @@ -1189,7 +1219,8 @@ def get_multimodal_embeddings( multimodal_embeddings += tuple(vision_embeddings) if modality == "audios": audio_input = modalities["audios"] - audio_embeddings = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode) + audio_embeddings = self._process_audio_input( + audio_input, audio_projection_mode=audio_projection_mode) multimodal_embeddings += tuple(audio_embeddings) return multimodal_embeddings @@ -1205,7 +1236,7 @@ def get_input_embeddings( input_ids, inputs_embeds, multimodal_embeddings, [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID]) return inputs_embeds - + def get_input_embeddings_v0( self, input_ids: torch.Tensor, @@ -1225,7 +1256,8 @@ def get_input_embeddings_v0( audio_projection_mode = 'vision' if audio_input is not None: - audio_embeds = self._process_audio_input(audio_input, audio_projection_mode=audio_projection_mode) + audio_embeds = self._process_audio_input( + audio_input, audio_projection_mode=audio_projection_mode) inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, From 335a29ee53ee361d9bc077bda494f56095e912e8 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 3 Apr 2025 02:02:43 +0800 Subject: [PATCH 23/36] refactor audio resample Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 19 +---------- vllm/multimodal/audio.py | 51 ++++++++++++++++++++++++++-- vllm/multimodal/parse.py | 30 ++++++++-------- 3 files changed, 65 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 058255c583a3..4c619c13aaca 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -4,8 +4,6 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union import numpy as np -import numpy.typing as npt -import scipy.signal import torch import torch.nn as nn from transformers import (BatchFeature, PretrainedConfig, @@ -764,24 +762,9 @@ def get_dummy_processor_inputs( class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: - - def scipy_resample_audio( - audio: npt.NDArray[np.floating], - *, - orig_sr: float, - target_sr: float, - ): - if orig_sr > target_sr: - return scipy.signal.resample_poly(audio, 1, - orig_sr // target_sr) - elif orig_sr < target_sr: - return scipy.signal.resample_poly(audio, target_sr // orig_sr, - 1) - return audio - feature_extractor = self.info.get_feature_extractor() return 
MultiModalDataParser(target_sr=feature_extractor.sampling_rate, - resample_func=scipy_resample_audio) + audio_resample_method="scipy") def _call_hf_processor( self, diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index f379ec1682a3..0359a1324ef1 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 - import base64 from io import BytesIO from pathlib import Path +from typing import Literal, Optional import numpy as np import numpy.typing as npt +import scipy.signal from vllm.inputs.registry import InputContext from vllm.utils import PlaceholderModule @@ -43,7 +44,7 @@ def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: "There is no default maximum multimodal tokens") -def resample_audio( +def resample_audio_librosa( audio: npt.NDArray[np.floating], *, orig_sr: float, @@ -52,6 +53,52 @@ def resample_audio( return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) +def resample_audio_scipy( + audio: npt.NDArray[np.floating], + *, + orig_sr: float, + target_sr: float, +): + if orig_sr > target_sr: + return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr) + elif orig_sr < target_sr: + return scipy.signal.resample_poly(audio, target_sr // orig_sr, 1) + return audio + + +class AudioResampler: + """Resample audio data to a target sample rate.""" + + def __init__( + self, + target_sr: Optional[float] = None, + method: Literal["librosa", "scipy"] = "librosa", + ): + self.target_sr = target_sr + self.method = method + + def resample( + self, + audio: npt.NDArray[np.floating], + *, + orig_sr: float, + ) -> npt.NDArray[np.floating]: + if self.target_sr is None: + raise RuntimeError("Audio resampling is not supported when " + "`target_sr` is not provided") + if self.method == "librosa": + return resample_audio_librosa(audio, + orig_sr=orig_sr, + target_sr=self.target_sr) + elif self.method == "scipy": + return resample_audio_scipy(audio, + orig_sr=orig_sr, + target_sr=self.target_sr) + else: + raise ValueError(f"Invalid resampling method: {self.method}. " + "Supported methods are 'librosa' and 'scipy'.") + + class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index e6963bd8e158..9707b9cfcf8b 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -3,8 +3,8 @@ from abc import ABC, abstractmethod from collections import UserDict from collections.abc import Callable, Iterator, Mapping, Sequence -from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar, - Union) +from typing import (TYPE_CHECKING, Any, Generic, Literal, NamedTuple, Optional, + TypeVar, Union) import numpy as np import torch @@ -14,7 +14,7 @@ from vllm.utils import is_list_of -from .audio import resample_audio +from .audio import AudioResampler from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, ImageItem, ModalityData, MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargs, VideoItem) @@ -308,11 +308,18 @@ class MultiModalDataParser: items to the model's expected sampling rate. 
""" - def __init__(self, *, target_sr: Optional[float] = None, resample_func: Optional[Callable] = None,) -> None: + def __init__( + self, + *, + target_sr: Optional[float] = None, + audio_resample_method: Literal["librosa", "scipy"] = "librosa", + ) -> None: super().__init__() - self.target_sr = target_sr - self.audio_resampler = resample_audio if resample_func is None else resample_func + self.audio_resampler = AudioResampler( + target_sr=target_sr, + method=audio_resample_method, + ) def _is_embeddings( self, data: object @@ -375,15 +382,8 @@ def _parse_audio_data( if orig_sr is None: new_audio = audio else: - target_sr = self.target_sr - if target_sr is None: - raise RuntimeError( - "Audio resampling is not supported when " - "`target_sr` is not provided") - - new_audio = self.audio_resampler(audio, - orig_sr=orig_sr, - target_sr=target_sr) + new_audio = self.audio_resampler.resample(audio, + orig_sr=orig_sr) new_audios.append(new_audio) From e755e6b9d40f230c8bb1103e90de12a456ad938d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 3 Apr 2025 02:20:50 +0800 Subject: [PATCH 24/36] minor refactor audio encoder Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 51 ++------------- vllm/model_executor/models/phi4mm_audio.py | 75 ++++++---------------- 2 files changed, 23 insertions(+), 103 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 4c619c13aaca..759474e6b3ee 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -428,8 +428,8 @@ class Phi4MMImageEmbeddingInputs(TypedDict): class Phi4MMAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - data: Tuple[NestedTensors] - """Shape: `((batch_size * num_audios, 80, M), )""" + data: Union[torch.Tensor, List[torch.Tensor]] + """Shape: `(batch_size * num_audios, 80, M)""" class Phi4MMAudioEmbeddingInputs(TypedDict): @@ -969,47 +969,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size, logit_scale) self.sampler = get_sampler() - def _audio_features_to_embeddings( - self, - input_ids: torch.Tensor, - input_features: List[torch.Tensor], - audio_input_sizes: torch.Tensor, - audio_projection_mode: str, - ) -> torch.Tensor: - """ - Convert audio features to embeddings, which are used as input to the - model (via `inputs_embeds`). - - Args: - input_ids (torch.Tensor): Input IDs (the prompt in this case). - input_features (list[torch.Tensor]): Input features (the audio - embeddings). - audio_input_sizes (list[torch.Tensor]): Audio input sizes (the - audio embed lengths to use for padding the audio placeholder token - in the input prompt IDs). 
- """ - # The audio projection can either be a single linear or Sequential, - # so handle both cases - if isinstance(self.embed_tokens_extend.audio_projection, - nn.Sequential): - target_dtype = self.embed_tokens_extend.audio_projection[ - 0].bias.dtype - else: - target_dtype = self.embed_tokens_extend.audio_projection.bias.dtype - - audio_input = [ - input.unsqueeze(0).to(target_dtype) for input in input_features - ] - kwargs = { - "wte": self.model.embed_tokens, - 'audio_projection_mode': audio_projection_mode - } - audio_embeddings = self.embed_tokens_extend(input_ids, audio_input, - audio_input_sizes, - **kwargs) - audio_embeddings = audio_embeddings.to(target_dtype) - return audio_embeddings - def _parse_and_validate_audio_input( self, **kwargs: object) -> Optional[Phi4MMAudioInputs]: """ @@ -1079,10 +1038,10 @@ def _process_audio_input(self, audio_input: Phi4MMAudioInputs, dtype = next(self.embed_tokens_extend.parameters()).dtype audio_embeds = [ - self.embed_tokens_extend.get_audio_features( - features.unsqueeze(0).to(dtype), + self.embed_tokens_extend( + features.to(dtype), audio_projection_mode=audio_projection_mode, - ).squeeze(0) for features in audio_features + ) for features in audio_features ] return audio_embeds diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index db90848f9809..34a7a73d057a 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -1159,8 +1159,11 @@ def get_audio_features( input_embeds: torch.FloatTensor, audio_attention_mask: torch.Tensor = None, audio_projection_mode: str = "speech", - ): - + ) -> torch.FloatTensor: + """ + arguments: + input_embeds: audio features (B, T, D) B: num audios in a sequence + """ if self.freeze_audio_processor: with torch.no_grad(): audio_features, masks = self.encoder(input_embeds, @@ -1210,62 +1213,20 @@ def get_audio_features( def forward( self, - input_ids: torch.LongTensor, - input_embeds: torch.FloatTensor, - audio_embed_sizes, - **kwargs, + audio_features: torch.FloatTensor, + audio_attention_mask: torch.Tensor = None, + audio_projection_mode: str = "speech", ) -> torch.FloatTensor: """ arguments: - input_ids: input text ids (B, U) - input_embeds: audio features (B, T, D) B: num audios in a sequence + audio_features: audio features (T, D) + + returns: + audio_embeds: audio embeddings (num_audio_tokens, hidden_dim) """ - assert input_embeds is not None and len(input_embeds) == len( - audio_embed_sizes) - - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - - with torch.no_grad(): - positions = (input_ids == _AUDIO_PLACEHOLDER_TOKEN_ID).nonzero( - as_tuple=False) - - if not isinstance(input_embeds, list): - input_embeds = [input_embeds] - - audio_projection_mode = kwargs.get("audio_projection_mode", "speech") - audio_set_tensor = [ - self.get_audio_features( - input_embed, audio_projection_mode=audio_projection_mode) - for input_embed in input_embeds - ] - - with torch.no_grad(): - input_ids.clamp_min_(0).clamp_max_(self.vocab_size) - - if "wte" in kwargs: - # we use the token embedding layer from the huggingface model, this - # is REQUIRED to make sure we are using the loaded weights. 
- hidden_states = kwargs["wte"](input_ids) - else: - # otherwise, we use token embedding in pretrained mixformer from - # phi team - hidden_states = self.wte(input_ids) - - if len(positions.tolist()) > 0: - assert sum(audio_embed_sizes) == len( - positions - ), "please ensure the encoder outputs have the same length as"\ - " defined in input_ids!" - idx = 0 - for i in range(len(audio_embed_sizes)): - cnt = audio_embed_sizes[i] - assert audio_set_tensor[i].shape[0] == 1 - hidden_states[ - positions[idx, 0], - positions[idx, 1]:positions[idx, 1] + cnt, - ] = (audio_set_tensor[i][0, :audio_embed_sizes[i], :].to( - hidden_states.dtype).to(hidden_states.device)) - idx += cnt - - return hidden_states + audio_embeds = self.get_audio_features( + audio_features.unsqueeze(0), + audio_attention_mask=audio_attention_mask, + audio_projection_mode=audio_projection_mode, + ) + return audio_embeds.squeeze(0) From 5714c181d542cc6242ddff82701c4c0c33d527bb Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 3 Apr 2025 16:15:41 +0800 Subject: [PATCH 25/36] increase test max_model_len Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/decoder_only/vision_language/test_phi4mm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py b/tests/models/decoder_only/vision_language/test_phi4mm.py index c3e88b60978a..94ddb734b8f2 100644 --- a/tests/models/decoder_only/vision_language/test_phi4mm.py +++ b/tests/models/decoder_only/vision_language/test_phi4mm.py @@ -155,7 +155,7 @@ def run_test( ], ) @pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_model_len", [4096]) +@pytest.mark.parametrize("max_model_len", [12800]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, @@ -198,7 +198,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, ], ) @pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_model_len", [10000]) +@pytest.mark.parametrize("max_model_len", [25600]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) @pytest.mark.xfail( From d3dd9e0e1d633834a8c34f5184734f4138886a96 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 4 Apr 2025 00:15:54 +0800 Subject: [PATCH 26/36] add processor tests Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/multimodal/processing/test_common.py | 1 + vllm/model_executor/models/phi4mm.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index fdcd7a9e1738..51b961785c3a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -271,6 +271,7 @@ def _test_processing_correctness_mistral( "nvidia/NVLM-D-72B", "google/paligemma-3b-mix-224", "google/paligemma2-3b-ft-docci-448", + "microsoft/Phi-4-multimodal-instruct", "mistralai/Pixtral-12B-2409", "mistral-community/pixtral-12b", "Qwen/Qwen-VL-Chat", diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 759474e6b3ee..71b7cba56fdb 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -815,7 +815,6 @@ def _get_mm_fields_config( num_img_tokens=MultiModalFieldConfig.batched("image"), 
input_audio_embeds=MultiModalFieldConfig.batched("audio"), audio_embed_sizes=MultiModalFieldConfig.batched("audio"), - audio_attention_mask=MultiModalFieldConfig.batched("audio"), ) def _get_prompt_updates( From 5a505b8634847abba6aef2b192d42c1b0dc341dc Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 4 Apr 2025 00:30:55 +0800 Subject: [PATCH 27/36] revert unnecessary changes Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/phi4mm.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 71b7cba56fdb..435e30e3166b 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -814,7 +814,6 @@ def _get_mm_fields_config( image_sizes=MultiModalFieldConfig.batched("image"), num_img_tokens=MultiModalFieldConfig.batched("image"), input_audio_embeds=MultiModalFieldConfig.batched("audio"), - audio_embed_sizes=MultiModalFieldConfig.batched("audio"), ) def _get_prompt_updates( @@ -982,23 +981,13 @@ def _parse_and_validate_audio_input( Optional[Phi4MMAudioInputs]: Parsed and validated audio inputs. """ audio_features = kwargs.pop("input_audio_embeds", None) - audio_embed_sizes = kwargs.pop("audio_embed_sizes", None) audio_embeds = kwargs.pop("audio_embeds", None) if audio_features is None and audio_embeds is None: return None if audio_features is not None: - assert isinstance(audio_embed_sizes, torch.Tensor) - if isinstance(audio_features, torch.Tensor): - assert audio_features.size(0) == len(audio_embed_sizes), ( - "audio_features and audio_embed_sizes " - "must have the same length") - elif is_list_of(audio_features, (torch.Tensor, list)): - assert len(audio_features) == len(audio_embed_sizes), ( - "audio_features and audio_embed_sizes " - "must have the same length") - else: + if not isinstance(audio_features, (torch.Tensor, list)): raise ValueError("Incorrect type of audio features. 
" f"Got type: {type(audio_features)}") From b40b458f0e052a03b74f9578eeef35ef35d3d166 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 6 Apr 2025 12:52:40 +0800 Subject: [PATCH 28/36] add scipy to doc requirement Signed-off-by: Isotr0py <2037008807@qq.com> --- requirements/docs.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/docs.txt b/requirements/docs.txt index 416ca503b36c..99fb87def6dd 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -18,6 +18,7 @@ transformers mistral_common >= 1.5.4 aiohttp starlette +scipy openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args From 4f3049d859ce04dc1c5f2f3e8aadef16ea23de02 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 6 Apr 2025 18:08:29 +0800 Subject: [PATCH 29/36] fix doc build Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/multimodal/audio.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 0359a1324ef1..70a912c9c9ef 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -6,7 +6,6 @@ import numpy as np import numpy.typing as npt -import scipy.signal from vllm.inputs.registry import InputContext from vllm.utils import PlaceholderModule @@ -59,6 +58,9 @@ def resample_audio_scipy( orig_sr: float, target_sr: float, ): + # lazy import scipy.signal, otherwise it will crash doc build. + import scipy.signal + if orig_sr > target_sr: return scipy.signal.resample_poly(audio, 1, orig_sr // target_sr) elif orig_sr < target_sr: From 6cce3fe2dee0ce901df746baa29f5f5aa541760a Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 8 Apr 2025 22:35:53 +0800 Subject: [PATCH 30/36] init vision speech test Signed-off-by: Isotr0py <2037008807@qq.com> --- .../vision_language/test_phi4mm.py | 88 +++++++++++++++++-- 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py b/tests/models/decoder_only/vision_language/test_phi4mm.py index 94ddb734b8f2..e1d88c24acb2 100644 --- a/tests/models/decoder_only/vision_language/test_phi4mm.py +++ b/tests/models/decoder_only/vision_language/test_phi4mm.py @@ -2,18 +2,22 @@ import os import re +from collections.abc import Sequence from typing import Optional import pytest from huggingface_hub import snapshot_download from transformers import AutoTokenizer +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest from vllm.multimodal.image import rescale_image_size from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput, + PromptImageInput, VllmRunner) from ....utils import large_gpu_test from ...utils import check_logprobs_close @@ -29,6 +33,7 @@ # Since the vision-lora and speech-lora co-exist with the base model, # we have to manually specify the path of the lora weights. 
vision_lora_path = os.path.join(model_path, "vision-lora") +speech_lora_path = os.path.join(model_path, "speech-lora") models = [model_path] @@ -64,7 +69,8 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str, def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - inputs: list[tuple[list[str], PromptImageInput]], + inputs: Sequence[tuple[list[str], PromptImageInput, + Optional[PromptAudioInput]]], model: str, *, max_model_len: int, @@ -105,27 +111,52 @@ def run_test( ) as vllm_model: lora_request = LoRARequest("vision", 1, vision_lora_path) vllm_model.model.llm_engine.add_lora(lora_request=lora_request) + if any(audios is not None for _, _, audios in inputs): + lora_request = LoRARequest("speech", 2, speech_lora_path) + vllm_model.model.llm_engine.add_lora(lora_request=lora_request) vllm_outputs_per_case = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs + images=images, + audios=audios) + for prompts, images, audios in inputs ] # use eager mode for hf runner, since phi3_v didn't work with flash_attn hf_model_kwargs = {"_attn_implementation": "eager"} with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model: - eos_token_id = hf_model.processor.tokenizer.eos_token_id + + hf_processor = hf_model.processor + eos_token_id = hf_processor.tokenizer.eos_token_id + + def patch_hf_processor(*args, + text="", + images=None, + audio=None, + sampling_rate=None, + **kwargs): + audios = None + if audio is not None and sampling_rate is not None: + audios = [(audio, sampling_rate)] + return hf_processor(*args, + text=text, + images=images, + audios=audios, + **kwargs) + + hf_model.processor = patch_hf_processor + hf_outputs_per_case = [ hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, images=images, + audios=audios, eos_token_id=eos_token_id, num_logits_to_keep=0) - for prompts, images in inputs + for prompts, images, audios in inputs ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, @@ -166,6 +197,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, inputs_per_image = [( [prompt for _ in size_factors], [rescale_image_size(image, factor) for factor in size_factors], + None, ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] run_test( @@ -209,9 +241,12 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, images = [asset.pil_image for asset in image_assets] inputs_per_case = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) + ( + [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], + [[rescale_image_size(image, factor) for image in images] + for factor in size_factors], + None, + ), ] run_test( @@ -226,3 +261,38 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, mm_limit=2, tensor_parallel_size=1, ) + + +# FIXME(Isotr0py): This test can't stll pass yet. 
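# NOTE: the vision+speech test below exercises both adapters at once.
# run_test (above) registers them under separate LoRA ids via
# llm_engine.add_lora(): "vision" as id 1 and, whenever any audio input is
# present, "speech" as id 2, so one engine serves the mixed image+audio prompt.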
+@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_model_len", [12800]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str, + max_model_len: int, max_tokens: int, + num_logprobs: int) -> None: + + audio = AudioAsset("mary_had_lamb").audio_and_sample_rate + image = ImageAsset("stop_sign").pil_image.convert("RGB") + + inputs_vision_speech = [ + ( + ["<|user|><|image_1|><|audio_1|><|end|><|assistant|>"], + [image], + [audio], + ), + ] + + run_test( + hf_runner, + vllm_runner, + inputs_vision_speech, + model, + dtype=dtype, + max_model_len=max_model_len, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) From a54dae3811624d37b11ee45025f6b504006450cb Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 8 Apr 2025 23:53:40 +0800 Subject: [PATCH 31/36] make vision speech test passed Signed-off-by: Isotr0py <2037008807@qq.com> --- .../models/decoder_only/vision_language/test_phi4mm.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py b/tests/models/decoder_only/vision_language/test_phi4mm.py index e1d88c24acb2..b6bb01e002ac 100644 --- a/tests/models/decoder_only/vision_language/test_phi4mm.py +++ b/tests/models/decoder_only/vision_language/test_phi4mm.py @@ -5,11 +5,11 @@ from collections.abc import Sequence from typing import Optional +import librosa import pytest from huggingface_hub import snapshot_download from transformers import AutoTokenizer -from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest from vllm.multimodal.image import rescale_image_size @@ -34,6 +34,8 @@ # we have to manually specify the path of the lora weights. vision_lora_path = os.path.join(model_path, "vision-lora") speech_lora_path = os.path.join(model_path, "speech-lora") +speech_question = os.path.join(model_path, "examples", + "what_is_shown_in_this_image.wav") models = [model_path] @@ -263,7 +265,6 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, ) -# FIXME(Isotr0py): This test can't stll pass yet. 
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_model_len", [12800]) @@ -273,8 +274,9 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str, max_model_len: int, max_tokens: int, num_logprobs: int) -> None: - audio = AudioAsset("mary_had_lamb").audio_and_sample_rate - image = ImageAsset("stop_sign").pil_image.convert("RGB") + # use the example speech question so that the model outputs are reasonable + audio = librosa.load(speech_question, sr=None) + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") inputs_vision_speech = [ ( From 516d9da20bdfb712b135525cb68789a6424e98df Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 9 Apr 2025 17:55:42 +0800 Subject: [PATCH 32/36] fix ultravox test import Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/decoder_only/audio_language/test_ultravox.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index a843e41aa26e..449b93b6fdcc 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -7,7 +7,7 @@ import pytest_asyncio from transformers import AutoModel, AutoTokenizer -from vllm.multimodal.audio import resample_audio +from vllm.multimodal.audio import resample_audio_librosa from vllm.sequence import SampleLogprobs from ....conftest import HfRunner, VllmRunner @@ -135,9 +135,9 @@ def run_test( [hf_prompt], max_tokens, num_logprobs=num_logprobs, - audios=[(resample_audio(audio[0], - orig_sr=audio[1], - target_sr=16000), 16000)]) + audios=[(resample_audio_librosa(audio[0], + orig_sr=audio[1], + target_sr=16000), 16000)]) for _, hf_prompt, audio in prompts_and_audios ] From faa14d5ec7da15d2f08296a577af44ee3fa6fb6e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 9 Apr 2025 14:30:44 +0000 Subject: [PATCH 33/36] Fix online inference Signed-off-by: DarkLight1337 --- vllm/entrypoints/chat_utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 11c759a6174e..9ddc6d8d9346 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -480,11 +480,8 @@ def _placeholder_str(self, modality: ModalityStr, if modality in ("image", "image_embeds"): if model_type == "chatglm": return "<|begin_of_image|><|endoftext|><|end_of_image|>" - if model_type == "phi3_v": - # Workaround since this token is not defined in the tokenizer + if model_type in ("phi3_v", "phi4mm"): return f"<|image_{current_count}|>" - if model_type == "phi4mm": - return "<|endoftext10|>" # 200010 (see vocab.json in hf model) if model_type in ("minicpmo", "minicpmv"): return "(./)" if model_type in ("blip-2", "florence2", "fuyu", "paligemma", @@ -516,7 +513,7 @@ def _placeholder_str(self, modality: ModalityStr, if model_type == "ultravox": return "<|audio|>" if model_type == "phi4mm": - return "<|endoftext11|>" # 200011 (see vocab.json in hf model) + return f"<|audio_{current_count}|>" if model_type == "qwen2_audio": return (f"Audio {current_count}: " f"<|audio_bos|><|AUDIO|><|audio_eos|>") From 5ddf5746cab538721d920ed490d528a1338a439d Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 14 Apr 2025 22:52:20 +0800 Subject: [PATCH 34/36] expose dynamic_hd Signed-off-by: Isotr0py <2037008807@qq.com> --- 
examples/offline_inference/vision_language.py | 2 + .../vision_language_multi_image.py | 2 + .../multimodal/processing/test_phi4mm.py | 59 +++++++++++++++++++ vllm/model_executor/models/phi3v.py | 2 +- vllm/model_executor/models/phi4mm.py | 36 ++++++++--- 5 files changed, 92 insertions(+), 9 deletions(-) create mode 100644 tests/models/multimodal/processing/test_phi4mm.py diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 20b243fc3ccb..51a44fb21cda 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -793,6 +793,8 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: max_num_seqs=2, enable_lora=True, max_lora_rank=320, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={"dynamic_hd": 16}, ) return ModelRequestData( diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 3547afd3019a..981d15d1415e 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -470,6 +470,8 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: limit_mm_per_prompt={"image": len(image_urls)}, enable_lora=True, max_lora_rank=320, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={"dynamic_hd": 4}, ) placeholders = "".join(f"<|image_{i}|>" diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py new file mode 100644 index 000000000000..797986adba4a --- /dev/null +++ b/tests/models/multimodal/processing/test_phi4mm.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for phi4mm's multimodal preprocessing kwargs.""" +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY + +from ....conftest import _ImageAssets +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["microsoft/Phi-4-multimodal-instruct"]) +# yapf: disable +@pytest.mark.parametrize( + ("mm_processor_kwargs", "expected_toks_per_img"), + [ + ({"dynamic_hd": 4}, 1329), + ({"dynamic_hd": 16}, 4433), + # the default num_crops of phi-4-multimodal is 36 + ({}, 9585), + ]) +# yapf: enable +@pytest.mark.parametrize("num_imgs", [1, 2]) +@pytest.mark.parametrize("kwargs_on_init", [True, False]) +def test_processor_override( + image_assets: _ImageAssets, + model_id: str, + mm_processor_kwargs: dict[str, int], + expected_toks_per_img: int, + num_imgs: int, + kwargs_on_init: bool, +): + """Ensure Phi4MMMultiModalProcessor handles dynamic_hd properly.""" + # Avoid initializing CUDA early + from vllm.model_executor.models.phi4mm import _IMAGE_PLACEHOLDER_TOKEN_ID + + ctx = build_model_context( + model_id, + mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, + limit_mm_per_prompt={"image": num_imgs}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs + + # Build the image str / prompt based on the number of images we pass + img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) + prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" + + image_size = ctx.get_hf_config( + ).embd_layer["image_embd_layer"]["crop_size"] + dummy_image_size = (image_size * 7, image_size * 7) + dummy_image = 
image_assets[0].pil_image.resize(dummy_image_size) + mm_data = {"image": [dummy_image] * num_imgs} + + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) + + # Ensure we have the right number of placeholders per num_crops size + img_tok_count = processed_inputs["prompt_token_ids"].count( + _IMAGE_PLACEHOLDER_TOKEN_ID) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 7f41ad2359df..5b43871b7591 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -327,7 +327,7 @@ def get_num_image_tokens( *, image_width: int, image_height: int, - processor: Optional[ProcessorMixin], + processor: Optional[ProcessorMixin] = None, ) -> int: if processor is None: processor = self.get_hf_processor() diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 2da55f577b8f..1a56e6826a24 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -6,7 +6,7 @@ import numpy as np import torch import torch.nn as nn -from transformers import (BatchFeature, PretrainedConfig, +from transformers import (BatchFeature, PretrainedConfig, ProcessorMixin, SequenceFeatureExtractor, SiglipVisionConfig) from vllm.config import VllmConfig @@ -48,7 +48,6 @@ SIGLIP_NAME = "siglip-so400m-patch14-448" VISION_ENCODER_TO_PROCESSING_CONFIG = { 'siglip-so400m-patch14-448': { - 'dynamic_hd': 16, 'vit_image_size': 448, 'vit_patch_size': 14, 'token_compression_factor': 2, @@ -470,6 +469,17 @@ def cat_with_pad(tensors, dim, padding_value=0): class Phi4MMProcessingInfo(BaseProcessingInfo): + def get_hf_processor( + self, + *, + dynamic_hd: Optional[int] = None, + **kwargs: object, + ) -> ProcessorMixin: + if dynamic_hd is not None: + kwargs["dynamic_hd"] = dynamic_hd + + return self.ctx.get_hf_processor(**kwargs) + @property def image_tokens(self) -> list[str]: return [f"<|image_{i+1}|>" for i in range(100)] @@ -478,9 +488,13 @@ def image_tokens(self) -> list[str]: def audio_tokens(self) -> list[str]: return [f"<|audio_{i+1}|>" for i in range(100)] - @property - def dynamic_hd(self) -> int: - image_processor = self.get_hf_processor().image_processor + def get_dynamic_hd( + self, + processor: Optional[ProcessorMixin] = None, + ) -> int: + if processor is None: + processor = self.get_hf_processor() + image_processor = processor.image_processor return image_processor.dynamic_hd def get_feature_extractor(self) -> SequenceFeatureExtractor: @@ -632,6 +646,7 @@ def get_num_image_tokens( *, image_width: int, image_height: int, + processor: Optional[ProcessorMixin] = None, ) -> int: hf_config = self.get_hf_config() vision_encoder_name = hf_config.img_processor @@ -643,7 +658,7 @@ def get_num_image_tokens( vit_patch_size = prepro_config['vit_patch_size'] token_compression_factor = prepro_config['token_compression_factor'] - dynamic_hd_size = self.dynamic_hd + dynamic_hd_size = self.get_dynamic_hd(processor=processor) image_num_tokens = self._compute_num_image_tokens( image_width, @@ -656,7 +671,10 @@ def get_num_image_tokens( return image_num_tokens - def get_image_size_with_most_features(self) -> ImageSize: + def get_image_size_with_most_features( + self, + processor: Optional[ProcessorMixin] = None, + ) -> ImageSize: hf_config = self.get_hf_config() vision_encoder_name = hf_config.img_processor if vision_encoder_name is None: @@ -665,7 +683,7 @@ def get_image_size_with_most_features(self) -> ImageSize: 
vision_encoder_name] vit_image_size = prepro_config['vit_image_size'] - max_side = vit_image_size * self.dynamic_hd + max_side = vit_image_size * self.get_dynamic_hd(processor=processor) return ImageSize(height=max_side, width=vit_image_size) def get_audio_num_frames(self, audio_len: int, sr: float) -> int: @@ -825,6 +843,7 @@ def _get_prompt_updates( image_tokens: list[str] = self.info.image_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore feature_extractor = self.info.get_feature_extractor() + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) def get_image_replacement_phi4mm(item_idx: int): images = mm_items.get_items( @@ -837,6 +856,7 @@ def get_image_replacement_phi4mm(item_idx: int): num_image_tokens = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, + processor=hf_processor, ) image_tokens = [_IMAGE_PLACEHOLDER_TOKEN_ID] * num_image_tokens From e9724c89deef3e70e8d1634fe3b7e73949f410d5 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Tue, 15 Apr 2025 13:56:03 +0800 Subject: [PATCH 35/36] reduce max_model_len in example to fit single gpu Signed-off-by: Isotr0py <2037008807@qq.com> --- examples/offline_inference/vision_language.py | 2 +- examples/offline_inference/vision_language_multi_image.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 1a1118b386a6..80bf5255f32a 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -814,7 +814,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=12800, + max_model_len=5120, max_num_seqs=2, enable_lora=True, max_lora_rank=320, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index a2b57e52eb84..976943dff7a9 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -504,7 +504,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData: engine_args = EngineArgs( model=model_path, trust_remote_code=True, - max_model_len=25600, + max_model_len=4096, max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, enable_lora=True, From 11be4863af3b8deeac2dbf70f130655047d4d131 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sat, 19 Apr 2025 15:08:30 +0800 Subject: [PATCH 36/36] update profiler and fix ultravox tests Signed-off-by: Isotr0py <2037008807@qq.com> --- examples/offline_inference/vision_language.py | 1 + .../audio_language/test_ultravox.py | 19 ++++++++---- vllm/model_executor/models/phi4mm.py | 30 +++++++++++-------- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 6b31eee95af4..bd7035b7615a 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -816,6 +816,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: trust_remote_code=True, max_model_len=5120, max_num_seqs=2, + max_num_batched_tokens=12800, enable_lora=True, max_lora_rank=320, # Note - mm_processor_kwargs can also be passed to generate/chat calls diff --git 
a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index 9adf51015817..e9dcba8ec089 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-from typing import Optional
+from typing import Any, Optional
 
 import numpy as np
 import pytest
@@ -43,6 +43,18 @@ def audio(request):
     return AudioAsset(request.param)
 
 
+def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
+    """Convert kwargs to CLI args."""
+    args = []
+    for key, value in params_kwargs.items():
+        if isinstance(value, bool):
+            if value:
+                args.append(f"--{key.replace('_','-')}")
+        else:
+            args.append(f"--{key.replace('_','-')}={value}")
+    return args
+
+
 @pytest.fixture(params=[
     pytest.param({}, marks=pytest.mark.cpu_model),
     pytest.param(CHUNKED_PREFILL_KWARGS),
@@ -52,10 +64,7 @@ def server(request, audio_assets):
         "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
         "--limit-mm-per-prompt",
         json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
-    ] + [
-        f"--{key.replace('_','-')}={value}"
-        for key, value in request.param.items()
-    ]
+    ] + params_kwargs_to_cli_args(request.param)
 
     with RemoteOpenAIServer(MODEL_NAME,
                             args,
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 1a56e6826a24..cdd762f5fec3 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -20,15 +20,15 @@
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
-                                    NestedTensors)
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalKwargs, NestedTensors)
 from vllm.multimodal.parse import (AudioProcessorItems, ImageEmbeddingItems,
                                    ImageProcessorItems, ImageSize,
                                    MultiModalDataItems, MultiModalDataParser)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                         BaseProcessingInfo, PromptReplacement,
                                         PromptUpdate)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -747,14 +747,23 @@ def _compute_audio_embed_size(self, audio_frames: int) -> int:
 
 class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
 
-    def get_dummy_processor_inputs(
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_audios = mm_counts.get("audio", 0)
+        num_images = mm_counts.get("image", 0)
+
+        image_tokens: list[str] = self.info.image_tokens[:num_images]
+        audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
+
+        return "".join(image_tokens + audio_tokens)
+
+    def get_dummy_mm_data(
         self,
         seq_len: int,
         mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
+    ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
 
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
 
@@ -768,13 +777,7 @@ def get_dummy_processor_inputs(
                                    num_audios=num_audios),
         }
 
-        image_tokens: list[str] = self.info.image_tokens[:num_images]
-        audio_tokens: list[str] = self.info.audio_tokens[:num_audios]
-
-        return ProcessorInputs(
-            prompt_text="".join(image_tokens + audio_tokens),
-            mm_data=mm_data,
-        )
+        return mm_data
 
 
 class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
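Illustrative usage sketch (not part of the patch series): the example scripts above set dynamic_hd once at engine construction, and their comments note that mm_processor_kwargs can also be passed per generate/chat call. The snippet below shows both paths with the offline LLM API under those assumptions; the image path and sampling settings are placeholders, and the vision-LoRA setup used by the full examples is omitted for brevity.

from PIL import Image

from vllm import LLM, SamplingParams

# Engine-wide default for the multimodal processor: fewer HD crops means
# fewer image placeholder tokens per image.
llm = LLM(
    model="microsoft/Phi-4-multimodal-instruct",
    trust_remote_code=True,
    max_model_len=5120,
    max_num_seqs=2,
    mm_processor_kwargs={"dynamic_hd": 4},
)

prompt = "<|user|><|image_1|>Describe this image.<|end|><|assistant|>"
image = Image.open("example.jpg")  # placeholder image path

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
        # Per-request override of the engine-level default.
        "mm_processor_kwargs": {"dynamic_hd": 16},
    },
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)

Per the new processor test, the placeholder count scales with dynamic_hd for the same input image (1329 tokens at dynamic_hd=4, 4433 at 16, and 9585 at the model default of 36 for the test's dummy image size), so lower values trade image detail for prompt budget.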