From 365c2450af7613066a9bbc760e67e8feecd5c9cb Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 20 Jul 2024 21:29:18 -0700 Subject: [PATCH 01/25] iterate --- vllm/model_executor/models/fuyu.py | 23 ++++++++-- vllm/model_executor/models/llava.py | 41 +++++++++++++----- vllm/model_executor/models/llava_next.py | 52 ++++++++++++++++------- vllm/model_executor/models/paligemma.py | 41 +++++++++++++----- vllm/model_executor/models/phi3v.py | 53 +++++++++++++++++------- vllm/multimodal/image.py | 6 ++- 6 files changed, 161 insertions(+), 55 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index fdea8ee30ce6..66e4b6acb566 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,7 @@ # limitations under the License. """ PyTorch Fuyu model.""" import math -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -62,6 +62,14 @@ class FuyuImagePixelInputs(TypedDict): """ +class FuyuImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +FuyuImageInputs = Union[FuyuImagePixelInputs, FuyuImageEmbeddingInputs] + + def _calculate_num_image_tokens( height: int, width: int, @@ -249,6 +257,16 @@ def _parse_and_validate_image_input(self, **kwargs: object): data=image_patches) return None + def _process_image_input(self, + image_input: FuyuImageInputs) -> torch.Tensor: + + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_embed_tokens is not None + vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) + return vision_embeddings + def forward( self, input_ids: torch.Tensor, @@ -261,8 +279,7 @@ def forward( image_input = self._parse_and_validate_image_input(**kwargs) if image_input is not None: - vision_embeddings, _ = self.vision_embed_tokens( - image_input["data"]) + vision_embeddings = self._process_image_input(image_input) inputs_embeds = self.language_model.model.embed_tokens(input_ids) inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, vision_embeddings, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index b5dddd519219..d7dcd558a8f7 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -59,7 +59,12 @@ class LlavaImagePixelInputs(TypedDict): """Shape: `(batch_size, num_channels, height, width)`""" -LlavaImageInputs = LlavaImagePixelInputs +class LlavaImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs] def get_max_llava_image_tokens(ctx: InputContext): @@ -174,18 +179,28 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) - if pixel_values is None: + if pixel_values is None and image_embeds is None: return None - if not isinstance(pixel_values, torch.Tensor): - raise ValueError("Incorrect type of pixel values. 
" - f"Got type: {type(pixel_values)}") - - return LlavaImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values(pixel_values), - ) + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + return LlavaImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + return LlavaImageEmbeddingInputs( + type="image_embeds", + data=image_embeds, + ) def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: @@ -219,6 +234,10 @@ def _process_image_pixels(self, def _process_image_input(self, image_input: LlavaImageInputs) -> torch.Tensor: + + if image_input["type"] == "image_embeds": + return image_input["data"] + assert self.vision_tower is not None image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 0c89eed88f21..e4583e361ffa 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -59,7 +59,13 @@ class LlavaNextImagePixelInputs(TypedDict): """ -LlavaNextImageInputs = LlavaNextImagePixelInputs +class LlavaNextImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, + LlavaNextImageEmbeddingInputs] # Taken from: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L91 @@ -187,7 +193,7 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs): input_width=width, ) elif isinstance(image_data, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") + return else: raise TypeError(f"Invalid image type: {type(image_data)}") @@ -285,26 +291,38 @@ def _validate_shape(d: torch.Tensor): return data def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[LlavaNextImagePixelInputs]: + self, **kwargs: object) -> Optional[LlavaNextImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_sizes = kwargs.pop("image_sizes", None) + image_embeds = kwargs.pop("image_embeds", None) - if pixel_values is None: + if pixel_values is None and image_embeds is None: return None - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") - if not isinstance(image_sizes, torch.Tensor): - raise ValueError("Incorrect type of image sizes. " - f"Got type: {type(image_sizes)}") + if not isinstance(image_sizes, torch.Tensor): + raise ValueError("Incorrect type of image sizes. 
" + f"Got type: {type(image_sizes)}") - return LlavaNextImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values(pixel_values), - image_sizes=self._validate_image_sizes(image_sizes), - ) + return LlavaNextImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + image_sizes=self._validate_image_sizes(image_sizes), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeds. " + f"Got type: {type(image_embeds)}") + + return LlavaNextImageEmbeddingInputs( + type="image_embeds", + data=image_embeds, + ) def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: @@ -425,6 +443,10 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaNextImageInputs) -> BatchedTensors: + + if image_input["type"] == "image_embeds": + return [image_input["data"]] + patch_embeddings = self._process_image_pixels(image_input) image_sizes = image_input.get("image_sizes") diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 8a2bacbd96b6..985cf07159b0 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union import torch from PIL import Image @@ -148,7 +148,13 @@ class PaliGemmaImagePixelInputs(TypedDict): """Shape: (batch_size, num_channels, height, width)""" -PaliGemmaImageInputs = PaliGemmaImagePixelInputs +class PaliGemmaImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +PaliGemmaImageInputs = Union[PaliGemmaImagePixelInputs, + PaliGemmaImageEmbeddingInputs] @MULTIMODAL_REGISTRY.register_image_input_mapper() @@ -198,18 +204,28 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[PaliGemmaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) - if pixel_values is None: + if pixel_values is None and image_embeds is None: return None - if not isinstance(pixel_values, torch.Tensor): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - return PaliGemmaImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values(pixel_values), - ) + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + return PaliGemmaImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + return PaliGemmaImageEmbeddingInputs( + type="image_embeds", + data=image_embeds, + ) def _image_pixels_to_features(self, vision_tower: SiglipVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: @@ -233,6 +249,9 @@ def _process_image_pixels( def _process_image_input( self, image_input: PaliGemmaImageInputs) -> torch.Tensor: + if image_input["type"] == "pixel_values": + return image_input["data"] + assert self.vision_tower is not None image_features = self._process_image_pixels(image_input) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 8b2c425289f0..4a39cd3087dc 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -152,9 +152,13 @@ def __init__(self, config: PretrainedConfig, wte=None) -> None: self.vocab_size = config.vocab_size self.type_feature = config.img_processor.get('type_feature', 'patch') - def forward(self, input_ids: torch.LongTensor, - pixel_values: torch.FloatTensor, - image_sizes: torch.Tensor) -> torch.FloatTensor: + def forward( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor, + image_embeds: torch.Tensor, + ) -> torch.FloatTensor: """process and merge text embeddings with image embeddings.""" # (batch_size, max_num_crops, 3, height, width) @@ -281,6 +285,12 @@ class Phi3VImagePixelInputs(TypedDict): """ +class Phi3VImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """Shape: `(batch_size, num_channels, height, width)`""" + + # Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L57 def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336): target_height = int(np.ceil(height / padding_unit) * padding_unit) @@ -503,22 +513,37 @@ def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Phi3VImagePixelInputs]: pixel_values = kwargs.pop("pixel_values", None) image_sizes = kwargs.pop("image_sizes", None) + image_embeds = kwargs.pop("image_embeds", None) if pixel_values is None: return None - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - if not isinstance(image_sizes, torch.Tensor): - raise ValueError("Incorrect type of image sizes. " - f"Got type: {type(image_sizes)}") + if pixel_values is None and image_embeds is None: + return None - return Phi3VImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values(pixel_values), - image_sizes=self._validate_image_sizes(image_sizes)) + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + if not isinstance(image_sizes, torch.Tensor): + raise ValueError("Incorrect type of image sizes. " + f"Got type: {type(image_sizes)}") + + return Phi3VImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + image_sizes=self._validate_image_sizes(image_sizes)) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeds. 
" + f"Got type: {type(image_embeds)}") + + return Phi3VImageEmbeddingInputs( + type="image_embeds", + data=image_embeds, + ) def forward(self, input_ids: torch.Tensor, diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 3b37ce9149fb..61d18a604818 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -113,6 +113,8 @@ def _get_hf_image_processor(self, model_config: ModelConfig): def _default_input_mapper(self, ctx: InputContext, data: object) -> MultiModalInputs: model_config = ctx.model_config + + # Raw image if isinstance(data, Image.Image): image_processor = self._get_hf_image_processor(model_config) if image_processor is None: @@ -127,8 +129,10 @@ def _default_input_mapper(self, ctx: InputContext, raise return MultiModalInputs(batch_data) + + # Image embedding elif isinstance(data, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") + return MultiModalInputs({"image_embeds": data}) raise TypeError(f"Invalid image type: {type(data)}") From ecb99b380001f46fa06b8d385bd3c46bc4a49f15 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 20 Jul 2024 22:02:23 -0700 Subject: [PATCH 02/25] revert for phi3v --- vllm/model_executor/models/phi3v.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4a39cd3087dc..732cd9351a4a 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -157,7 +157,6 @@ def forward( input_ids: torch.LongTensor, pixel_values: torch.FloatTensor, image_sizes: torch.Tensor, - image_embeds: torch.Tensor, ) -> torch.FloatTensor: """process and merge text embeddings with image embeddings.""" @@ -535,15 +534,9 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values(pixel_values), image_sizes=self._validate_image_sizes(image_sizes)) + # TODO: Enable image embeddings for Phi3-Vision if image_embeds is not None: - if not isinstance(image_embeds, torch.Tensor): - raise ValueError("Incorrect type of image embeds. 
" - f"Got type: {type(image_embeds)}") - - return Phi3VImageEmbeddingInputs( - type="image_embeds", - data=image_embeds, - ) + raise NotImplementedError("Embeddings input is not supported yet") def forward(self, input_ids: torch.Tensor, From b06cd17cc79bf12e7a81713c24648957200f2a0d Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 20 Jul 2024 22:06:35 -0700 Subject: [PATCH 03/25] revert --- vllm/model_executor/models/phi3v.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 732cd9351a4a..2bbaa19df796 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -152,12 +152,9 @@ def __init__(self, config: PretrainedConfig, wte=None) -> None: self.vocab_size = config.vocab_size self.type_feature = config.img_processor.get('type_feature', 'patch') - def forward( - self, - input_ids: torch.LongTensor, - pixel_values: torch.FloatTensor, - image_sizes: torch.Tensor, - ) -> torch.FloatTensor: + def forward(self, input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + image_sizes: torch.Tensor) -> torch.FloatTensor: """process and merge text embeddings with image embeddings.""" # (batch_size, max_num_crops, 3, height, width) @@ -284,12 +281,6 @@ class Phi3VImagePixelInputs(TypedDict): """ -class Phi3VImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size, num_channels, height, width)`""" - - # Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L57 def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336): target_height = int(np.ceil(height / padding_unit) * padding_unit) From 0ead2980e2af9b129d7415386a0c1f7583dda468 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 20 Jul 2024 22:30:35 -0700 Subject: [PATCH 04/25] iterate --- vllm/model_executor/models/llava_next.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e4583e361ffa..84c56eaf9ba8 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -193,7 +193,7 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs): input_width=width, ) elif isinstance(image_data, torch.Tensor): - return + image_feature_size = image_data.shape[0] else: raise TypeError(f"Invalid image type: {type(image_data)}") From 356cbcc959ee0937d81cadb0bf70a69f1bed2d56 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 8 Aug 2024 22:31:49 -0700 Subject: [PATCH 05/25] format --- vllm/model_executor/models/llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index bbfa91bb741e..617daeb2208e 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,5 +1,5 @@ import itertools -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union, Union +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union import torch import torch.nn as nn From 94c9455436737bb2053bbb89f7cb47b438ccafcd Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 8 Aug 2024 23:35:19 -0700 Subject: [PATCH 06/25] update --- vllm/model_executor/models/internvl.py | 38 ++++++++++- vllm/model_executor/models/llava.py | 28 ++++---- vllm/model_executor/models/paligemma.py | 51 
++++++--------- vllm/model_executor/models/phi3v.py | 85 ++++++++++++++++++------- 4 files changed, 132 insertions(+), 70 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 49f9a4c85f2d..4b943fdc66a5 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -50,6 +50,15 @@ class InternVLImagePixelInputs(TypedDict): """ +class InternVLImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: Union[torch.Tensor, List[torch.Tensor]] + + +InternVLImageInputs = Union[InternVLImagePixelInputs, + InternVLImageEmbeddingInputs] + + # copied from https://huggingface.co/OpenGVLab/InternVL2-1B def build_transform(input_size): MEAN, STD = IMAGENET_MEAN, IMAGENET_STD @@ -378,13 +387,23 @@ def _validate_shape(d: torch.Tensor): return data def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[InternVLImagePixelInputs]: + self, **kwargs: object) -> Optional[InternVLImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_token_id = kwargs.pop("image_token_id", None) + image_embeds = kwargs.pop("image_embeds", None) - if pixel_values is None: + if pixel_values is None and image_embeds is None: return None + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + return InternVLImageEmbeddingInputs( + type="image_embeds", + data=image_embeds, + ) + self.img_context_token_id = image_token_id[0] if not isinstance(pixel_values, (torch.Tensor, list)): @@ -396,6 +415,19 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values(pixel_values), ) + def _process_image_input( + self, + image_input: InternVLImageInputs, + ) -> torch.Tensor: + + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_model is not None + image_embeds = self.extract_feature(image_input["data"]) + + return image_embeds + def forward( self, input_ids: torch.Tensor, @@ -409,7 +441,7 @@ def forward( if image_input is not None: inputs_embeds = self.language_model.model.get_input_embeddings( input_ids) - vit_embeds = self.extract_feature(image_input["data"]) + vit_embeds = self._process_image_input(image_input) inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, vit_embeds, self.img_context_token_id) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 617daeb2208e..1cbfc3dc8846 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -27,6 +27,20 @@ merge_vision_embeddings) +class LlavaImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: `(batch_size, num_channels, height, width)`""" + + +class LlavaImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs] + + # TODO(xwjiang): Run benchmark and decide if TP. 
class LlavaMultiModalProjector(nn.Module): @@ -49,20 +63,6 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -class LlavaImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor - """Shape: `(batch_size, num_channels, height, width)`""" - - -class LlavaImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - - -LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs] - - def get_max_llava_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config(LlavaConfig) vision_config = hf_config.vision_config diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 8450062ae22b..fbbbcdb4ecb1 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -31,6 +31,21 @@ } +class PaliGemmaImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +class PaliGemmaImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +PaliGemmaImageInputs = Union[PaliGemmaImagePixelInputs, + PaliGemmaImageEmbeddingInputs] + + def get_max_paligemma_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config(PaliGemmaConfig) vision_config = hf_config.vision_config @@ -107,21 +122,6 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -class PaliGemmaImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor - """Shape: (batch_size, num_channels, height, width)""" - - -class PaliGemmaImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - - -PaliGemmaImageInputs = Union[PaliGemmaImagePixelInputs, - PaliGemmaImageEmbeddingInputs] - - @MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_paligemma_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_paligemma) @@ -203,29 +203,20 @@ def _image_pixels_to_features( return image_features - def _process_image_pixels( - self, - inputs: PaliGemmaImagePixelInputs, - ) -> torch.Tensor: - assert self.vision_tower is not None - - pixel_values = inputs["data"] - - return self._image_pixels_to_features( - self.vision_tower, - pixel_values, - ) - def _process_image_input( self, image_input: PaliGemmaImageInputs, ) -> torch.Tensor: - if image_input["type"] == "pixel_values": + if image_input["type"] == "image_embeds": return image_input["data"] assert self.vision_tower is not None - image_features = self._process_image_pixels(image_input, ) + pixel_values = image_input["data"] + image_features = self._image_pixels_to_features( + self.vision_tower, + pixel_values, + ) return self.multi_modal_projector(image_features) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index c634b6161f87..6929cf01f63d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -70,6 +70,45 @@ projection_dim=768) +class Phi3VImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: Union[torch.Tensor, List[torch.Tensor]] + """ + Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` + + Note that `num_patches` may be different for each batch, in which case + the data is passed as a list instead of a batched tensor. + """ + + image_sizes: torch.Tensor + """ + Shape: `(batch_size, 2)` + + This should be in `(height, width)` format. 
+ """ + + +class Phi3VImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: Union[torch.Tensor, List[torch.Tensor]] + """ + Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` + + Note that `num_patches` may be different for each batch, in which case + the data is passed as a list instead of a batched tensor. + """ + + image_sizes: torch.Tensor + """ + Shape: `(batch_size, 2)` + + This should be in `(height, width)` format. + """ + + +Phi3VImageInputs = Union[Phi3VImagePixelInputs, Phi3VImageEmbeddingInputs] + + class Phi3ImageEmbeddingBase(nn.Module): def __init__(self) -> None: @@ -259,24 +298,6 @@ def add_image_newline(self, image_features_hd): return image_features_hd_newline -class Phi3VImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] - """ - Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` - - Note that `num_patches` may be different for each batch, in which case - the data is passed as a list instead of a batched tensor. - """ - - image_sizes: torch.Tensor - """ - Shape: `(batch_size, 2)` - - This should be in `(height, width)` format. - """ - - # Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L57 def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336): target_height = int(np.ceil(height / padding_unit) * padding_unit) @@ -496,7 +517,7 @@ def _validate_shape(d: torch.Tensor): return data def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[Phi3VImagePixelInputs]: + self, **kwargs: object) -> Optional[Phi3VImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_sizes = kwargs.pop("image_sizes", None) image_embeds = kwargs.pop("image_embeds", None) @@ -521,9 +542,28 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values(pixel_values), image_sizes=self._validate_image_sizes(image_sizes)) - # TODO: Enable image embeddings for Phi3-Vision if image_embeds is not None: - raise NotImplementedError("Embeddings input is not supported yet") + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + return Phi3VImageEmbeddingInputs( + type="image_embeds", + data=image_embeds, + ) + + def _process_image_input( + self, + image_input: Phi3VImageInputs, + ) -> torch.Tensor: + + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_embed_tokens is not None + image_embeds = self.vision_embed_tokens(image_input["data"], + image_input["image_sizes"]) + + return image_embeds def forward(self, input_ids: torch.Tensor, @@ -535,8 +575,7 @@ def forward(self, image_input = self._parse_and_validate_image_input(**kwargs) if image_input is not None: - vision_embeddings = self.vision_embed_tokens( - image_input["data"], image_input["image_sizes"]) + vision_embeddings = self._process_image_input(image_input) inputs_embeds = self.model.get_input_embeddings(input_ids) inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, vision_embeddings, From c6b43f83563775e7dab12757171001789353134b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 8 Aug 2024 23:37:53 -0700 Subject: [PATCH 07/25] update --- vllm/model_executor/models/phi3v.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 6929cf01f63d..f75e5640dade 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -91,19 +91,6 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: Union[torch.Tensor, List[torch.Tensor]] - """ - Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` - - Note that `num_patches` may be different for each batch, in which case - the data is passed as a list instead of a batched tensor. - """ - - image_sizes: torch.Tensor - """ - Shape: `(batch_size, 2)` - - This should be in `(height, width)` format. 
- """ Phi3VImageInputs = Union[Phi3VImagePixelInputs, Phi3VImageEmbeddingInputs] From 28987a9eb819f8d08cc971e92dac19ae20e48423 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 8 Aug 2024 23:41:10 -0700 Subject: [PATCH 08/25] update blip --- vllm/model_executor/models/blip2.py | 66 +++++++++++++++++++---------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index e00e6c080695..ae5bf7adb830 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -28,6 +28,25 @@ "language_model.model": "language_model", } +# We use this internally as placeholders since there is no image token +# defined on the HuggingFace repo +BLIP2_IMAGE_TOKEN = "" +BLIP2_IMAGE_TOKEN_ID = 50265 + + +class Blip2ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +class Blip2ImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +Blip2ImageInputs = Union[Blip2ImagePixelInputs, Blip2ImageEmbeddingInputs] + class Blip2QFormerMultiHeadAttention(nn.Module): @@ -375,20 +394,6 @@ def forward( return sequence_output -class Blip2ImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor - """Shape: (batch_size, num_channels, height, width)""" - - -Blip2ImageInputs = Blip2ImagePixelInputs - -# We use this internally as placeholders since there is no image token -# defined on the HuggingFace repo -BLIP2_IMAGE_TOKEN = "" -BLIP2_IMAGE_TOKEN_ID = 50265 - - def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: return hf_config.num_query_tokens @@ -506,18 +511,29 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[Blip2ImageInputs]: pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) - if pixel_values is None: + if pixel_values is None and image_embeds is None: return None - if not isinstance(pixel_values, torch.Tensor): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") - return Blip2ImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values(pixel_values), - ) + return Blip2ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + return Blip2ImageEmbeddingInputs( + type="image_embeds", + data=image_embeds, + ) def _image_pixels_to_features(self, vision_model: BlipVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: @@ -538,6 +554,10 @@ def _process_image_pixels(self, def _process_image_input(self, image_input: Blip2ImageInputs) -> torch.Tensor: + + if image_input["type"] == "image_embeds": + return image_input["data"] + assert self.vision_model is not None image_features = self._process_image_pixels(image_input) From 271be65903f005233cb10d9052edc119f5db51f6 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 00:40:15 -0700 Subject: [PATCH 09/25] rename variable --- vllm/model_executor/models/internvl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 4b943fdc66a5..c054d6150a0d 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -441,9 +441,9 @@ def forward( if image_input is not None: inputs_embeds = self.language_model.model.get_input_embeddings( input_ids) - vit_embeds = self._process_image_input(image_input) + vision_embeddings = self._process_image_input(image_input) inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, - vit_embeds, + vision_embeddings, self.img_context_token_id) input_ids = None else: From 3422dfc032da6ea55730b0a74d09530b4ab2a5cf Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:04:16 -0700 Subject: [PATCH 10/25] fix yapf --- tests/tracing/test_tracing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index a492daf3b49c..90f26400952b 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -114,5 +114,5 @@ def test_traces(trace_service): SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time - assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER - ) == metrics.scheduler_time + assert attributes.get( + SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time From 06f8f18711d9e0253e9abcceb9c169fdf5a5e1b9 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:17:37 -0700 Subject: [PATCH 11/25] allow embed inputs for clip & siglip --- vllm/model_executor/models/clip.py | 8 +++++++- vllm/model_executor/models/siglip.py | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 805ade39389d..8ec72eeb14e5 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -88,7 +88,13 @@ def input_processor_for_clip( tokenizer = cached_get_tokenizer(model_config.tokenizer) if image_feature_size_override is None: - image_feature_size = get_clip_image_feature_size(hf_config) + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_feature_size = get_clip_image_feature_size(hf_config) + elif isinstance(image_data, torch.Tensor): + image_feature_size = image_data.shape[0] + else: + raise TypeError(f"Invalid image type: {type(image_data)}") else: image_feature_size = image_feature_size_override diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 5ba14f73394f..afe57bf573ad 100644 --- 
a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -97,7 +97,13 @@ def input_processor_for_siglip( tokenizer = cached_get_tokenizer(model_config.tokenizer) if image_feature_size_override is None: - image_feature_size = get_siglip_image_feature_size(hf_config) + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_feature_size = get_siglip_image_feature_size(hf_config) + elif isinstance(image_data, torch.Tensor): + image_feature_size = image_data.shape[0] + else: + raise TypeError(f"Invalid image type: {type(image_data)}") else: image_feature_size = image_feature_size_override From ca99e19eb08154d7f08d34b6745e841c2498caaf Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:33:03 -0700 Subject: [PATCH 12/25] support phi3v embed input --- vllm/model_executor/models/phi3v.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index f75e5640dade..9ec9a042134f 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -400,7 +400,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs): input_width=w, input_height=h) elif isinstance(image_data, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") + image_feature_size = image_data.shape[0] else: raise TypeError(f"Invalid image type: {type(image_data)}") From c35004e3b70ecf3322e1f7908c05c8a1b775895e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:33:34 -0700 Subject: [PATCH 13/25] revert fuyu embed input --- vllm/model_executor/models/fuyu.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 82cdf863766c..7a0be7f8fb0f 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,7 @@ # limitations under the License. 
""" PyTorch Fuyu model.""" import math -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict import torch import torch.nn as nn @@ -62,12 +62,7 @@ class FuyuImagePixelInputs(TypedDict): """ -class FuyuImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - - -FuyuImageInputs = Union[FuyuImagePixelInputs, FuyuImageEmbeddingInputs] +FuyuImageInputs = FuyuImagePixelInputs def _calculate_num_image_tokens( From 19ef980e946e02d4c0132856c60c935e6552ee6b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:40:13 -0700 Subject: [PATCH 14/25] update for internvl --- vllm/model_executor/models/internvl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index c054d6150a0d..fc32865ac774 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -202,8 +202,10 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): # add thumbnail image if num_blocks > 1 if hf_config.use_thumbnail and num_blocks > 1: num_blocks += 1 + + image_feature_size = num_blocks * num_patches elif isinstance(image_data, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") + raise image_feature_size.shape[0] else: raise TypeError(f"Invalid image type: {type(image_data)}") @@ -214,7 +216,7 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): prompt_token_ids = llm_inputs["prompt_token_ids"] if prompt is None: prompt = tokenizer.decode(prompt_token_ids) - image_prompt = IMG_START + IMG_CONTEXT * num_blocks * num_patches + IMG_END + image_prompt = IMG_START + IMG_CONTEXT * image_feature_size + IMG_END new_prompt = prompt.replace('', image_prompt, 1) new_prompt_token_ids = tokenizer.encode(new_prompt) From 2bd0b399f7180a0fa3d564c4f087497e6ac273fa Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:41:19 -0700 Subject: [PATCH 15/25] fix typo --- vllm/model_executor/models/internvl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index fc32865ac774..e0661a23cb4f 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -205,7 +205,7 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): image_feature_size = num_blocks * num_patches elif isinstance(image_data, torch.Tensor): - raise image_feature_size.shape[0] + image_feature_size = image_data.shape[0] else: raise TypeError(f"Invalid image type: {type(image_data)}") From 04a56360169a47da8fba42861136dc3d665ccb94 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:42:40 -0700 Subject: [PATCH 16/25] format --- vllm/model_executor/models/internvl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index e0661a23cb4f..0d64d065e83f 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -202,8 +202,8 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): # add thumbnail image if num_blocks > 1 if hf_config.use_thumbnail and num_blocks > 1: num_blocks += 1 - image_feature_size = num_blocks * num_patches + elif isinstance(image_data, torch.Tensor): image_feature_size = 
image_data.shape[0] else: From af2d7c5d99e1cf1d2c2adfa49ab3d9f46953976e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:53:18 -0700 Subject: [PATCH 17/25] revert fuyu --- vllm/model_executor/models/fuyu.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7a0be7f8fb0f..376eac566ff9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -255,9 +255,6 @@ def _parse_and_validate_image_input(self, **kwargs: object): def _process_image_input(self, image_input: FuyuImageInputs) -> torch.Tensor: - if image_input["type"] == "image_embeds": - return image_input["data"] - assert self.vision_embed_tokens is not None vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) return vision_embeddings From 59f3729dbd768a8739e0bf5bd14253d425ebec96 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 9 Aug 2024 23:54:39 -0700 Subject: [PATCH 18/25] typing --- vllm/model_executor/models/fuyu.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 376eac566ff9..bb49349e7954 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -62,9 +62,6 @@ class FuyuImagePixelInputs(TypedDict): """ -FuyuImageInputs = FuyuImagePixelInputs - - def _calculate_num_image_tokens( height: int, width: int, @@ -237,7 +234,8 @@ def __init__(self, cache_config=cache_config, quant_config=quant_config) - def _parse_and_validate_image_input(self, **kwargs: object): + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: image_patches = kwargs.pop("image_patches", None) if isinstance(image_patches, torch.Tensor): @@ -252,8 +250,8 @@ def _parse_and_validate_image_input(self, **kwargs: object): data=image_patches) return None - def _process_image_input(self, - image_input: FuyuImageInputs) -> torch.Tensor: + def _process_image_input( + self, image_input: FuyuImagePixelInputs) -> torch.Tensor: assert self.vision_embed_tokens is not None vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) From 93a5bfd56aa2e395f3d09acd532a0d3ab3aec786 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 10 Aug 2024 00:09:37 -0700 Subject: [PATCH 19/25] adding guard for fuyu --- vllm/model_executor/models/fuyu.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index bb49349e7954..46a19f13a39e 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -200,6 +200,8 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): image_patch[0] for image_patch in model_image_input["image_patches"] ]) + else: + raise TypeError(f"Invalid image type: {type(data)}") # image has been processed with prompt in input processor return MultiModalInputs({"image_patches": data}) From f024985f0767e7f132acc9699399d86ff525ef3f Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 10 Aug 2024 00:24:20 -0700 Subject: [PATCH 20/25] add assertion guard --- vllm/model_executor/models/blip2.py | 2 ++ vllm/model_executor/models/internvl.py | 19 +++++++++++-------- vllm/model_executor/models/llava.py | 2 ++ vllm/model_executor/models/llava_next.py | 2 ++ vllm/model_executor/models/paligemma.py | 2 ++ vllm/model_executor/models/phi3v.py | 2 ++ 6 files changed, 21 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/blip2.py 
b/vllm/model_executor/models/blip2.py index ae5bf7adb830..c3e6343957ed 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -535,6 +535,8 @@ def _parse_and_validate_image_input( data=image_embeds, ) + raise AssertionError("This line should be unreachable.") + def _image_pixels_to_features(self, vision_model: BlipVisionModel, pixel_values: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 0d64d065e83f..5ba4638aaf07 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -408,14 +408,17 @@ def _parse_and_validate_image_input( self.img_context_token_id = image_token_id[0] - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - return InternVLImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values(pixel_values), - ) + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return InternVLImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + raise AssertionError("This line should be unreachable.") def _process_image_input( self, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 1cbfc3dc8846..64402ae90992 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -238,6 +238,8 @@ def _parse_and_validate_image_input( data=image_embeds, ) + raise AssertionError("This line should be unreachable.") + def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421 # noqa diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index f6edeeafeb50..868548d77305 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -359,6 +359,8 @@ def _parse_and_validate_image_input( data=image_embeds, ) + raise AssertionError("This line should be unreachable.") + def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421 # noqa diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index fbbbcdb4ecb1..104d9375cf2f 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -192,6 +192,8 @@ def _parse_and_validate_image_input( data=image_embeds, ) + raise AssertionError("This line should be unreachable.") + def _image_pixels_to_features( self, vision_tower: SiglipVisionModel, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 9ec9a042134f..1da4b3d9d26e 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -538,6 +538,8 @@ def _parse_and_validate_image_input( data=image_embeds, ) + raise AssertionError("This line should be unreachable.") + def _process_image_input( self, image_input: Phi3VImageInputs, From dd88741142e3456990b3e6e73724252b191bb818 Mon Sep 
17 00:00:00 2001 From: Roger Wang Date: Sun, 11 Aug 2024 15:59:18 -0700 Subject: [PATCH 21/25] update docstring --- vllm/model_executor/models/blip2.py | 4 ++++ vllm/model_executor/models/internvl.py | 4 ++++ vllm/model_executor/models/llava.py | 4 ++++ vllm/model_executor/models/llava_next.py | 4 ++++ vllm/model_executor/models/paligemma.py | 4 ++++ vllm/model_executor/models/phi3v.py | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c3e6343957ed..084cbf35533b 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -43,6 +43,10 @@ class Blip2ImagePixelInputs(TypedDict): class Blip2ImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor + """Shape: `(batch_size, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ Blip2ImageInputs = Union[Blip2ImagePixelInputs, Blip2ImageEmbeddingInputs] diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 5ba4638aaf07..26c02d46a18e 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -53,6 +53,10 @@ class InternVLImagePixelInputs(TypedDict): class InternVLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: Union[torch.Tensor, List[torch.Tensor]] + """Shape: `(batch_size, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ InternVLImageInputs = Union[InternVLImagePixelInputs, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 64402ae90992..0ff68943b510 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -36,6 +36,10 @@ class LlavaImagePixelInputs(TypedDict): class LlavaImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor + """Shape: `(batch_size, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs] diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 868548d77305..d94af966162f 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -63,6 +63,10 @@ class LlavaNextImagePixelInputs(TypedDict): class LlavaNextImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor + """Shape: `(batch_size, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 104d9375cf2f..c6d59db643bb 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -40,6 +40,10 @@ class PaliGemmaImagePixelInputs(TypedDict): class PaliGemmaImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor + """Shape: `(batch_size, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. 
+ """ PaliGemmaImageInputs = Union[PaliGemmaImagePixelInputs, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 1da4b3d9d26e..1ed0da4d7dc9 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -91,6 +91,10 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: Union[torch.Tensor, List[torch.Tensor]] + """Shape: `(batch_size, image_feature_size, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + """ Phi3VImageInputs = Union[Phi3VImagePixelInputs, Phi3VImageEmbeddingInputs] From 8974b2200672444199e8e3c0c48d79c9a49a4354 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 11 Aug 2024 20:08:06 -0700 Subject: [PATCH 22/25] add embedding tests --- tests/models/test_llava_image_embeds.py | 149 ++++++++++++++++++++++++ vllm/assets/image.py | 10 ++ 2 files changed, 159 insertions(+) create mode 100644 tests/models/test_llava_image_embeds.py diff --git a/tests/models/test_llava_image_embeds.py b/tests/models/test_llava_image_embeds.py new file mode 100644 index 000000000000..429ad249770d --- /dev/null +++ b/tests/models/test_llava_image_embeds.py @@ -0,0 +1,149 @@ +from typing import List, Optional, Tuple, Type + +import pytest +from transformers import AutoConfig, AutoTokenizer + +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "USER: \nWhat's the content of the image?\nASSISTANT:", + "cherry_blossom": + "USER: \nWhat is the season?\nASSISTANT:", +}) + +models = [ + "llava-hf/llava-1.5-7b-hf", +] + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + config = AutoConfig.from_pretrained(model) + image_token_id = config.image_token_index + + tokenizer = AutoTokenizer.from_pretrained(model) + eos_token_id = tokenizer.eos_token_id + + hf_output_ids = [ + token_id for idx, token_id in enumerate(output_ids) + if token_id != image_token_id or output_ids[idx - 1] != image_token_id + ] + + assert output_str[0] == " " + hf_output_str = output_str[1:] + if hf_output_ids[-1] == eos_token_id: + hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) + + return hf_output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + images = [asset.image_embeds for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [image for _ in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 376150574e3b..5eec78c32890 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from typing import Literal +import torch from PIL import Image from vllm.assets.base import get_vllm_public_assets @@ -18,3 +19,12 @@ def pil_image(self) -> Image.Image: image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", s3_prefix=VLM_IMAGES_DIR) return Image.open(image_path) + + @property + def image_embeds(self) -> torch.Tensor: + """ + Image embeddings, only used for testing purposes with llava 1.5. 
+ """ + image_path = get_vllm_public_assets(filename=f"{self.name}.pt", + s3_prefix=VLM_IMAGES_DIR) + return torch.load(image_path) From 28c9e83e4ca4bb947e996591db811547e9af5c7b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 11 Aug 2024 21:51:13 -0700 Subject: [PATCH 23/25] update test --- tests/models/test_llava_image_embeds.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/models/test_llava_image_embeds.py b/tests/models/test_llava_image_embeds.py index 429ad249770d..63ccd1f6625c 100644 --- a/tests/models/test_llava_image_embeds.py +++ b/tests/models/test_llava_image_embeds.py @@ -69,12 +69,22 @@ def run_test( Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. """ - images = [asset.image_embeds for asset in image_assets] - inputs_per_image = [( + # vLLM to load from image embeddings + vllm_images = [asset.image_embeds for asset in image_assets] + + # transformers to load from PIL images + hf_images = [asset.pil_image for asset in image_assets] + + vllm_inputs_per_image = [( + [prompt for _ in size_factors], + [image for _ in size_factors], + ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)] + + hf_inputs_per_image = [( [prompt for _ in size_factors], [image for _ in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)] # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. @@ -92,7 +102,7 @@ def run_test( max_tokens, num_logprobs=num_logprobs, images=images) - for prompts, images in inputs_per_image + for prompts, images in vllm_inputs_per_image ] with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: @@ -101,7 +111,7 @@ def run_test( max_tokens, num_logprobs=num_logprobs, images=images) - for prompts, images in inputs_per_image + for prompts, images in hf_inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, From 2f4539e2a248a8b376ce43e0e27e99dcc27fd81c Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 11 Aug 2024 22:09:21 -0700 Subject: [PATCH 24/25] update doc --- docs/source/models/vlm.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index a385605c9f8f..236e37b51d47 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -49,6 +49,17 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI "multi_modal_data": {"image": image}, }) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + # Inference with image embeddings as input + image_embeds = torch.load(...) 
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, + }) + for o in outputs: generated_text = o.outputs[0].text print(generated_text) From 307d705d7ba6289d904195acf2fc38470f2f0f8b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sun, 11 Aug 2024 23:31:45 -0700 Subject: [PATCH 25/25] revert fuyu changes --- vllm/model_executor/models/fuyu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 46a19f13a39e..bb49349e7954 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -200,8 +200,6 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): image_patch[0] for image_patch in model_image_input["image_patches"] ]) - else: - raise TypeError(f"Invalid image type: {type(data)}") # image has been processed with prompt in input processor return MultiModalInputs({"image_patches": data})
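
Taken together, the series lets callers pass precomputed image embeddings through the same `multi_modal_data` entry point used for PIL images. A minimal usage sketch, based on the `docs/source/models/vlm.rst` example added in PATCH 24 and the model/prompt from `tests/models/test_llava_image_embeds.py` added in PATCH 22; the embedding file path is a placeholder, and the tensor is assumed to already have the shape the docs describe, `(1, image_feature_size, hidden_size of the LM)`:

    import torch

    from vllm import LLM

    # Precomputed image embeddings, e.g. exported offline from the vision
    # tower and multimodal projector. The file path here is a placeholder;
    # the docs added in PATCH 24 describe the expected shape as
    # (1, image_feature_size, hidden_size of the language model).
    image_embeds = torch.load("stop_sign_embeds.pt")

    # Model and prompt format follow the new test in PATCH 22.
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
    prompt = "USER: <image>\nWhat's the content of the image?\nASSISTANT:"

    # Same generate() call as for raw images: passing a torch.Tensor makes
    # the default image input mapper emit an "image_embeds" key instead of
    # "pixel_values".
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image_embeds},
    })

    for o in outputs:
        print(o.outputs[0].text)

On the model side, each supported architecture now accepts either `pixel_values` or `image_embeds` in `_parse_and_validate_image_input`, wraps the latter in an `*ImageEmbeddingInputs` TypedDict, and has `_process_image_input` return the embeddings as-is, so they bypass the vision tower and projector and are merged directly into the text embeddings.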