From 7c1e0d323e0a71e783ea036f39832b03e943cacc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 04:12:38 +0000 Subject: [PATCH 01/12] Support E5-V --- docs/source/models/supported_models.rst | 6 + examples/offline_inference_vision_language.py | 6 +- ...ine_inference_vision_language_embedding.py | 147 +++++++++++++++--- ...e_inference_vision_language_multi_image.py | 7 +- tests/conftest.py | 51 +++--- .../vision_language/test_llava_next.py | 76 +++++++++ .../embedding/vision_language/test_phi3v.py | 25 ++- vllm/model_executor/models/llava_next.py | 15 +- vllm/model_executor/models/registry.py | 1 + 9 files changed, 277 insertions(+), 57 deletions(-) create mode 100644 tests/models/embedding/vision_language/test_llava_next.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 62ab8c067f5d..3c510d2f2ada 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -484,6 +484,12 @@ Multimodal Embedding - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T + I + - :code:`royokong/e5-v-2`, :code:`royokong/e5-v` + - + - ✅︎ * - :code:`Phi3VForCausalLM` - Phi-3-Vision-based - T + I diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 06b424abd50b..610cc31db9c4 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -1,6 +1,6 @@ """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. @@ -450,7 +450,7 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models') + 'vision language models for text generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index cfedd145a015..d5b98ff3c183 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -1,22 +1,127 @@ +""" +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for multimodal embedding. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from argparse import Namespace +from typing import List, NamedTuple, Optional, Union + +from PIL.Image import Image + from vllm import LLM -from vllm.assets.image import ImageAsset - -image = ImageAsset("cherry_blossom").pil_image.convert("RGB") -prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501 - -# Create an LLM. -llm = LLM( - model="TIGER-Lab/VLM2Vec-Full", - task="embedding", - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=2, - mm_processor_kwargs={"num_crops": 16}, -) - -# Generate embedding. The output is a list of EmbeddingRequestOutputs. -outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}}) - -# Print the outputs. 
-for output in outputs: - print(output.outputs.embedding) # list of 3072 floats +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser + + +class ModelRequestData(NamedTuple): + llm: LLM + prompt: str + stop_token_ids: Optional[List[str]] + image: Optional[Image] + + +def run_e5_v(text_or_image: Union[str, Image]): + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + + if isinstance(text_or_image, str): + prompt = llama3_template.format( + f"{text_or_image}\nSummary above sentence in one word: ") + image = None + else: + prompt = llama3_template.format( + "\nSummary above image in one word: ") + image = text_or_image + + llm = LLM( + model="royokong/e5-v-2", + task="embedding", + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image=image, + ) + + +def run_vlm2vec(text_or_image: Union[str, Image]): + if isinstance(text_or_image, str): + prompt = f"Find me an everyday image that matches the given caption: {text_or_image}" # noqa: E501 + image = None + else: + prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501 + image = text_or_image + + llm = LLM( + model="TIGER-Lab/VLM2Vec-Full", + task="embedding", + trust_remote_code=True, + mm_processor_kwargs={"num_crops": 4}, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image=image, + ) + + +def get_text_or_image(modality: str): + if modality == "text": + return "A dog sitting in the grass" + + if modality == "image": + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" + return fetch_image(image_url) + + msg = f"Modality {modality} is not supported." + raise ValueError(msg) + + +def run_encode(model: str, modality: str): + text_or_image = get_text_or_image(modality) + req_data = model_example_map[model](text_or_image) + + # Generate embedding. The output is a list of EmbeddingRequestOutputs. + outputs = req_data.llm.encode( + { + "prompt": req_data.prompt, + "multi_modal_data": { + "image": req_data.image + }, + }, ) + + for output in outputs: + print(output.outputs.embedding) + + +def main(args: Namespace): + run_encode(args.model, args.modality) + + +model_example_map = { + "e5_v": run_e5_v, + "vlm2vec": run_vlm2vec, +} + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for multimodal embedding') + parser.add_argument('--model-type', + '-m', + type=str, + default="vlm2vec", + choices=model_example_map.keys(), + help='The name of the embedding model.') + parser.add_argument('--modality', + type=str, + default="image", + choices=['text', 'image'], + help='Modality of the input.') + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 69f590fb7950..e28514bf403f 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -1,7 +1,7 @@ """ This example shows how to use vLLM for running offline inference with -multi-image input on vision language models, using the chat template defined -by the model. +multi-image input on vision language models for text generation, +using the chat template defined by the model. 
""" from argparse import Namespace from typing import List, NamedTuple, Optional @@ -334,7 +334,8 @@ def main(args: Namespace): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input') + 'vision language models that support multi-image input for text ' + 'generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/tests/conftest.py b/tests/conftest.py index 4c9180415da3..01ae8187d60b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -42,10 +42,12 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]] -PromptAudioInput = Union[List[Tuple[np.ndarray, int]], - List[List[Tuple[np.ndarray, int]]]] -PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]] +_M = TypeVar("_M") +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] def _read_prompts(filename: str) -> List[str]: @@ -316,12 +318,12 @@ def get_inputs( "text": prompt, "return_tensors": "pt", } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - if videos is not None and videos[i] is not None: - processor_kwargs["videos"] = videos[i] - if audios is not None and audios[i] is not None: - audio, sr = audios[i] + if images is not None and (image := images[i]) is not None: + processor_kwargs["images"] = image + if videos is not None and (video := videos[i]) is not None: + processor_kwargs["videos"] = video + if audios is not None and (audio_tuple := audios[i]) is not None: + audio, sr = audio_tuple processor_kwargs["audio"] = audio processor_kwargs["sampling_rate"] = sr @@ -336,7 +338,7 @@ def generate( self, prompts: List[str], images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: @@ -366,7 +368,7 @@ def generate_greedy( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[int], str]]: @@ -407,7 +409,7 @@ def generate_greedy_logprobs( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[List[torch.Tensor]]: @@ -486,7 +488,7 @@ def generate_greedy_logprobs_limit( num_logprobs: int, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, **kwargs: Any, ) -> List[TokensTextLogprobs]: all_inputs = self.get_inputs(prompts, @@ -835,13 +837,20 @@ def generate_beam_search( returned_outputs.append((token_ids, texts)) return returned_outputs - def encode(self, prompts: List[str]) -> List[List[float]]: - req_outputs = self.model.encode(prompts) - outputs = [] - for req_output in req_outputs: - embedding = req_output.outputs.embedding - 
outputs.append(embedding) - return outputs + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.encode(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] def __enter__(self): return self diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py new file mode 100644 index 000000000000..6e5e635ca451 --- /dev/null +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -0,0 +1,76 @@ +import pytest +import torch.nn.functional as F + +from ....conftest import IMAGE_ASSETS +from ..utils import check_embeddings_close + +llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + +HF_TEXT_PROMPTS = [ + llama3_template.format( + "The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501 + ), + llama3_template.format( + "cherry blossom\nSummary above sentence in one word: "), +] + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + llama3_template.format("\nSummary above image in one word: "), + "cherry_blossom": + llama3_template.format("\nSummary above image in one word: "), +}) + +MODELS = ["royokong/e5-v-2"] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + *((text, None) for text in HF_TEXT_PROMPTS), + *((text, image) + for text, image in zip(HF_IMAGE_PROMPTS, image_assets)), + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
+ with vllm_runner(model, task="embedding", dtype=dtype, + enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.encode(input_texts, images=input_images) + + with hf_runner(model, dtype=dtype) as hf_model: + all_inputs = hf_model.get_inputs(input_texts, images=input_images) + + all_outputs = [] + for inputs in all_inputs: + # Based on: https://huggingface.co/royokong/e5-v + outputs = hf_model.model( + **hf_model.wrap_device(inputs, + device=hf_model.model.device.type), + return_dict=True, + output_hidden_states=True, + ) + pooled_output = F.normalize(outputs.hidden_states[-1][:, -1, :], + dim=-1) + + all_outputs.append(pooled_output.tolist()) + + hf_outputs = all_outputs + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 0ca90e6bfa52..1dd4e18325dd 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -4,6 +4,11 @@ from ....conftest import IMAGE_ASSETS from ..utils import check_embeddings_close +HF_TEXT_PROMPTS = [ + "Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501 + "Retrieve an image of this caption: cherry blossom", +] + HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501 @@ -19,24 +24,28 @@ def test_models( hf_runner, vllm_runner, - example_prompts, + image_assets, model: str, dtype: str, ) -> None: + input_texts_images = [ + *((text, None) for text in HF_TEXT_PROMPTS), + *((text, image) + for text, image in zip(HF_IMAGE_PROMPTS, image_assets)), + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- with vllm_runner(model, - task="embedding", - max_model_len=4096, - max_num_seqs=2, - dtype=dtype, + with vllm_runner(model, task="embedding", dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.encode(input_texts, images=input_images) with hf_runner(model, dtype=dtype) as hf_model: - all_inputs = hf_model.get_inputs(example_prompts) + all_inputs = hf_model.get_inputs(input_texts, images=input_images) all_outputs = [] for inputs in all_inputs: diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 4dd472b04bb1..eef6bda109a9 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -13,11 +13,13 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of from .clip import (CLIPVisionModel, dummy_image_for_clip, @@ -312,6 +314,10 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) + # The same model class supports both language generation and embedding + # because the architecture name is the same + self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) + self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -641,6 +647,13 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 2a04ece24c8b..00a4164af46c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -93,6 +93,7 @@ "MistralModel": ("llama", "LlamaEmbeddingModel"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), # [Multimodal] + "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), } From 269aa6e895de989218475e2e3a606937323637bc Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 05:34:34 +0000 Subject: [PATCH 02/12] Improve modality coverage; fix bad model --- docs/source/models/supported_models.rst | 6 +- ...ine_inference_vision_language_embedding.py | 90 ++++++++++++++----- .../vision_language/test_llava_next.py | 6 +- .../embedding/vision_language/test_phi3v.py | 6 +- 4 files changed, 79 insertions(+), 29 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 3c510d2f2ada..d59385437cca 100644 --- 
a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -486,13 +486,13 @@ Multimodal Embedding - :ref:`PP ` * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT-based - - T + I - - :code:`royokong/e5-v-2`, :code:`royokong/e5-v` + - T / I + - :code:`royokong/e5-v` - - ✅︎ * - :code:`Phi3VForCausalLM` - Phi-3-Vision-based - - T + I + - T / I / T + I - :code:`TIGER-Lab/VLM2Vec-Full` - 🚧 - ✅︎ diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index d5b98ff3c183..9f5892abc17d 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -6,7 +6,7 @@ on HuggingFace model repository. """ from argparse import Namespace -from typing import List, NamedTuple, Optional, Union +from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args from PIL.Image import Image @@ -15,45 +15,75 @@ from vllm.utils import FlexibleArgumentParser +class TextQuery(TypedDict): + modality: Literal["text"] + text: str + + +class ImageQuery(TypedDict): + modality: Literal["image"] + image: Image + + +class TextImageQuery(TypedDict): + modality: Literal["text+image"] + text: str + image: Image + + +QueryModality = Literal["text", "image", "text+image"] +Query = Union[TextQuery, ImageQuery, TextImageQuery] + + class ModelRequestData(NamedTuple): llm: LLM prompt: str - stop_token_ids: Optional[List[str]] image: Optional[Image] -def run_e5_v(text_or_image: Union[str, Image]): +def run_e5_v(query: Query): llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 - if isinstance(text_or_image, str): + if query["modality"] == "text": + text = query["text"] prompt = llama3_template.format( - f"{text_or_image}\nSummary above sentence in one word: ") + f"{text}\nSummary above sentence in one word: ") image = None - else: + elif query["modality"] == "image": prompt = llama3_template.format( "\nSummary above image in one word: ") - image = text_or_image + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") llm = LLM( - model="royokong/e5-v-2", + model="royokong/e5-v", task="embedding", ) return ModelRequestData( llm=llm, prompt=prompt, - stop_token_ids=None, image=image, ) -def run_vlm2vec(text_or_image: Union[str, Image]): - if isinstance(text_or_image, str): - prompt = f"Find me an everyday image that matches the given caption: {text_or_image}" # noqa: E501 +def run_vlm2vec(query: Query): + if query["modality"] == "text": + text = query["text"] + prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501 image = None + elif query["modality"] == "image": + prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." 
# noqa: E501 + image = query["image"] + elif query["modality"] == "text+image": + text = query["text"] + prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + image = query["image"] else: - prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501 - image = text_or_image + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") llm = LLM( model="TIGER-Lab/VLM2Vec-Full", @@ -65,26 +95,38 @@ def run_vlm2vec(text_or_image: Union[str, Image]): return ModelRequestData( llm=llm, prompt=prompt, - stop_token_ids=None, image=image, ) -def get_text_or_image(modality: str): +def get_query(modality: QueryModality): if modality == "text": - return "A dog sitting in the grass" + return TextQuery(modality="text", text="A dog sitting in the grass") if modality == "image": - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" - return fetch_image(image_url) + return ImageQuery( + modality="image", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 + ), + ) + + if modality == "text+image": + return TextImageQuery( + modality="text+image", + text="A cat standing in the snow.", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 + ), + ) msg = f"Modality {modality} is not supported." raise ValueError(msg) -def run_encode(model: str, modality: str): - text_or_image = get_text_or_image(modality) - req_data = model_example_map[model](text_or_image) +def run_encode(model: str, modality: QueryModality): + query = get_query(modality) + req_data = model_example_map[model](query) # Generate embedding. The output is a list of EmbeddingRequestOutputs. 
outputs = req_data.llm.encode( @@ -112,7 +154,7 @@ def main(args: Namespace): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models for multimodal embedding') - parser.add_argument('--model-type', + parser.add_argument('--model-name', '-m', type=str, default="vlm2vec", @@ -121,7 +163,7 @@ def main(args: Namespace): parser.add_argument('--modality', type=str, default="image", - choices=['text', 'image'], + choices=get_args(QueryModality), help='Modality of the input.') args = parser.parse_args() main(args) diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 6e5e635ca451..e0baf4b4af32 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -7,21 +7,25 @@ llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 HF_TEXT_PROMPTS = [ + # T -> X llama3_template.format( "The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501 ), + # T -> X llama3_template.format( "cherry blossom\nSummary above sentence in one word: "), ] HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + # I -> X "stop_sign": llama3_template.format("\nSummary above image in one word: "), + # I -> X "cherry_blossom": llama3_template.format("\nSummary above image in one word: "), }) -MODELS = ["royokong/e5-v-2"] +MODELS = ["royokong/e5-v"] @pytest.mark.parametrize("model", MODELS) diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 1dd4e18325dd..1f5773c3d180 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -5,15 +5,19 @@ from ..utils import check_embeddings_close HF_TEXT_PROMPTS = [ + # T -> X "Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501 + # T -> X "Retrieve an image of this caption: cherry blossom", ] HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + # T + I -> X "stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501 + # I -> X "cherry_blossom": - "<|image_1|> Represent the given image with the following question: What is in the image", # noqa: E501 + "<|image_1|> Represent the given image for classification", # noqa: E501 }) MODELS = ["TIGER-Lab/VLM2Vec-Full"] From 624365fe79b15d22062889b71e6d5f11cc9f3972 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 05:35:35 +0000 Subject: [PATCH 03/12] Fix test --- tests/conftest.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 01ae8187d60b..e48dc1d48c3a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -657,15 +657,18 @@ def get_inputs( inputs = [TextPrompt(prompt=prompt) for prompt in prompts] if images is not None: for i, image in enumerate(images): - inputs[i]["multi_modal_data"] = {"image": image} + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} if videos is not None: for i, video in enumerate(videos): - inputs[i]["multi_modal_data"] = {"video": video} + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} if audios is not None: for i, audio in enumerate(audios): - inputs[i]["multi_modal_data"] = 
{"audio": audio} + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} return inputs From a1b1d95c811e9a1fec7121bdcdb38feadd052836 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 06:17:14 +0000 Subject: [PATCH 04/12] Fix --- tests/models/embedding/vision_language/test_llava_next.py | 4 ++-- tests/models/embedding/vision_language/test_phi3v.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index e0baf4b4af32..a1a4c1927b89 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -39,8 +39,8 @@ def test_models( ) -> None: input_texts_images = [ *((text, None) for text in HF_TEXT_PROMPTS), - *((text, image) - for text, image in zip(HF_IMAGE_PROMPTS, image_assets)), + *((text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)), ] input_texts = [text for text, _ in input_texts_images] input_images = [image for _, image in input_texts_images] diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 1f5773c3d180..71241ddd1f94 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -34,8 +34,8 @@ def test_models( ) -> None: input_texts_images = [ *((text, None) for text in HF_TEXT_PROMPTS), - *((text, image) - for text, image in zip(HF_IMAGE_PROMPTS, image_assets)), + *((text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)), ] input_texts = [text for text, _ in input_texts_images] input_images = [image for _, image in input_texts_images] From df8e1b99ca10d90d42b2aafc399b07bded0ea806 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 06:59:49 +0000 Subject: [PATCH 05/12] Fix tests --- examples/offline_inference_vision_language_embedding.py | 1 + tests/models/embedding/vision_language/test_llava_next.py | 5 ++++- tests/models/embedding/vision_language/test_phi3v.py | 5 ++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index 9f5892abc17d..1c1529559381 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -60,6 +60,7 @@ def run_e5_v(query: Query): llm = LLM( model="royokong/e5-v", task="embedding", + max_model_len=4096, ) return ModelRequestData( diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index a1a4c1927b89..1fd85b733ce8 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -49,7 +49,10 @@ def test_models( # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- with vllm_runner(model, task="embedding", dtype=dtype, + with vllm_runner(model, + task="embedding", + dtype=dtype, + max_model_len=4096, enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.encode(input_texts, images=input_images) diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 71241ddd1f94..e913123995eb 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -48,7 +48,10 @@ def test_models( enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.encode(input_texts, images=input_images) - with hf_runner(model, dtype=dtype) as hf_model: + # use eager mode for hf runner, since phi3_v didn't work with flash_attn + hf_model_kwargs = {"_attn_implementation": "eager"} + with hf_runner(model, dtype=dtype, + model_kwargs=hf_model_kwargs) as hf_model: all_inputs = hf_model.get_inputs(input_texts, images=input_images) all_outputs = [] From c25578a98ea338d6bab8a074e650f1ea73c7564f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 12:12:29 +0000 Subject: [PATCH 06/12] Fix example --- ...line_inference_vision_language_embedding.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index 1c1529559381..e1732d045f94 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -129,21 +129,21 @@ def run_encode(model: str, modality: QueryModality): query = get_query(modality) req_data = model_example_map[model](query) - # Generate embedding. The output is a list of EmbeddingRequestOutputs. 
- outputs = req_data.llm.encode( - { - "prompt": req_data.prompt, - "multi_modal_data": { - "image": req_data.image - }, - }, ) + mm_data = {} + if req_data.image is not None: + mm_data["image"] = req_data.image + + outputs = req_data.llm.encode({ + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + }) for output in outputs: print(output.outputs.embedding) def main(args: Namespace): - run_encode(args.model, args.modality) + run_encode(args.model_name, args.modality) model_example_map = { From b18e460c2879e3d55bc2cc1a7b34161e14491731 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 12:20:25 +0000 Subject: [PATCH 07/12] Fix error when image index is larger than vocab size --- vllm/model_executor/models/llava_next.py | 18 +++--- vllm/model_executor/models/utils.py | 78 +++++++++++++++++++++--- 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index eef6bda109a9..46cba8ebbc58 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -30,8 +30,8 @@ from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) -from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - merge_multimodal_embeddings) +from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, + init_vllm_registered_model) # Result in the max possible feature size (2x2 grid of 336x336px tiles) MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 @@ -611,14 +611,12 @@ def forward( image_input = self._parse_and_validate_image_input(**kwargs) if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - + inputs_embeds = embed_multimodal( + input_ids, + self.config.image_token_index, + self.language_model.model.get_input_embeddings, + lambda _: self._process_image_input(image_input), + ) input_ids = None else: inputs_embeds = None diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 9e2f5476f3af..644337d874e3 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,7 +1,7 @@ import itertools from dataclasses import dataclass, field -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, - Protocol, Tuple, Union, overload) +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Protocol, Tuple, Union, overload) import torch import torch.nn as nn @@ -294,10 +294,11 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str: _embedding_count_expression(inner) for inner in embeddings) -def merge_multimodal_embeddings(input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: NestedTensors, - placeholder_token_id: int) -> torch.Tensor: +def _merge_multimodal_embeddings( + inputs_embeds: torch.Tensor, + is_multimodal: torch.Tensor, + multimodal_embeddings: NestedTensors, +) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the positions in ``inputs_embeds`` corresponding to placeholder tokens in @@ -306,8 +307,7 @@ def merge_multimodal_embeddings(input_ids: torch.Tensor, Note: 
This updates ``inputs_embeds`` in place. """ - mask = (input_ids == placeholder_token_id) - num_expected_tokens = mask.sum().item() + num_expected_tokens = is_multimodal.sum().item() assert isinstance(num_expected_tokens, int) flattened = _flatten_embeddings(multimodal_embeddings) @@ -317,10 +317,70 @@ def merge_multimodal_embeddings(input_ids: torch.Tensor, f"Attempted to assign {expr} = {flattened.shape[0]} " f"multimodal tokens to {num_expected_tokens} placeholders") - inputs_embeds[mask] = flattened + inputs_embeds[is_multimodal] = flattened return inputs_embeds +def embed_multimodal( + input_ids: torch.Tensor, + multimodal_token_id: int, + get_text_embeds: Callable[[torch.Tensor], torch.Tensor], + get_multimodal_embeds: Callable[[torch.Tensor], Union[torch.Tensor, + List[torch.Tensor]]], +) -> torch.Tensor: + """ + Embed token IDs and multimodal inputs and combine their embeddings. + + ``multimodal_token_id`` is used to determine whether a token ID should + be embedded using ``get_text_embeds`` or ``get_multimodal_embeds``. + + Compared to ``merge_multimodal_embeddings`, this avoids running + ``get_text_embeds`` on ``input_ids[input_ids == multimodal_token_id]`` + which causes issues when the placeholder token ID exceeds the + vocabulary size of the language model. + """ + is_multimodal = input_ids == multimodal_token_id + is_text = ~is_multimodal + + text_embeds = get_text_embeds(input_ids[is_text]) + multimodal_embeds = get_multimodal_embeds(input_ids[is_multimodal]) + + merged_embeds = torch.empty( + (input_ids.shape[0], text_embeds.shape[1]), + dtype=text_embeds.dtype, + device=text_embeds.device, + ) + + merged_embeds[is_text] = text_embeds + + return _merge_multimodal_embeddings( + merged_embeds, + is_multimodal, + multimodal_embeds, + ) + + +def merge_multimodal_embeddings( + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + multimodal_embeddings: NestedTensors, + placeholder_token_id: int, +) -> torch.Tensor: + """ + Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the + positions in ``inputs_embeds`` corresponding to placeholder tokens in + ``input_ids``. + + Note: + This updates ``inputs_embeds`` in place. 
+ """ + return _merge_multimodal_embeddings( + inputs_embeds, + (input_ids == placeholder_token_id), + multimodal_embeddings, + ) + + class LayerFn(Protocol): def __call__(self, prefix: str) -> torch.nn.Module: From 0aefb1b4261151c691dce480e8d0d8e9633f5339 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 12:21:32 +0000 Subject: [PATCH 08/12] Fix `auto_cls` --- tests/models/embedding/vision_language/test_llava_next.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 1fd85b733ce8..2669f213d17a 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -1,5 +1,6 @@ import pytest import torch.nn.functional as F +from transformers import AutoModelForVision2Seq from ....conftest import IMAGE_ASSETS from ..utils import check_embeddings_close @@ -56,7 +57,8 @@ def test_models( enforce_eager=True) as vllm_model: vllm_outputs = vllm_model.encode(input_texts, images=input_images) - with hf_runner(model, dtype=dtype) as hf_model: + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForVision2Seq) as hf_model: all_inputs = hf_model.get_inputs(input_texts, images=input_images) all_outputs = [] From c8b41c3476c3238435fa224a47c6d7c1ba222efe Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 12:28:15 +0000 Subject: [PATCH 09/12] Remove print statement --- vllm/model_executor/models/phi3v.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 91c14e32c946..9a1083520efd 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -467,8 +467,6 @@ def input_processor_for_phi3v(ctx: InputContext, prompt_token_ids = inputs["prompt_token_ids"].copy() - print("prompt_token_ids (old)", prompt_token_ids) - # masked placeholder with image token id for idx in image_idx: candidates = _get_image_placeholder_token_id_candidates(model_config, From cb86ebbc12c3a24a986ed1be278c68ec06aedf27 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 14:04:47 +0000 Subject: [PATCH 10/12] Update docs --- docs/source/models/supported_models.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index d59385437cca..6eb8cc068f66 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -334,6 +334,14 @@ The following modalities are supported depending on the model: - **V**\ ideo - **A**\ udio +Any combination of modalities joined by :code:`+` are supported. + +- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by :code:`/` are mutually exclusive. + +- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. + .. 
_supported_vlms: Text Generation @@ -492,7 +500,7 @@ Multimodal Embedding - ✅︎ * - :code:`Phi3VForCausalLM` - Phi-3-Vision-based - - T / I / T + I + - T + I - :code:`TIGER-Lab/VLM2Vec-Full` - 🚧 - ✅︎ From b36f044f2db23d03f23e354c453f82dcd9a969cd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 14:29:44 +0000 Subject: [PATCH 11/12] Fix test --- tests/models/embedding/vision_language/test_llava_next.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 2669f213d17a..a56ac994f7d4 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -59,6 +59,11 @@ def test_models( with hf_runner(model, dtype=dtype, auto_cls=AutoModelForVision2Seq) as hf_model: + # Patch the issue where image_token_id + # exceeds the maximum allowed vocab size + hf_model.model.resize_token_embeddings( + hf_model.model.language_model.vocab_size + 1) + all_inputs = hf_model.get_inputs(input_texts, images=input_images) all_outputs = [] From ee1bd2ae5d6e88ac10c28e169144632f712c4dea Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Tue, 22 Oct 2024 16:57:37 +0000 Subject: [PATCH 12/12] Fix tests --- tests/models/embedding/utils.py | 3 +- .../vision_language/test_llava_next.py | 77 +++++++++++++++---- .../embedding/vision_language/test_phi3v.py | 75 ++++++++++++++---- 3 files changed, 123 insertions(+), 32 deletions(-) diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index 2fcc2013d91e..fd1c44d9c117 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -16,7 +16,8 @@ def check_embeddings_close( for prompt_idx, (embeddings_0, embeddings_1) in enumerate( zip(embeddings_0_lst, embeddings_1_lst)): - assert len(embeddings_0) == len(embeddings_1) + assert len(embeddings_0) == len(embeddings_1), ( + f"Length mismatch: {len(embeddings_0)} vs. 
{len(embeddings_1)}") sim = F.cosine_similarity(torch.tensor(embeddings_0), torch.tensor(embeddings_1), diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index a56ac994f7d4..52aef8c34d6f 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -1,8 +1,11 @@ +from typing import List, Type + import pytest import torch.nn.functional as F from transformers import AutoModelForVision2Seq -from ....conftest import IMAGE_ASSETS +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test from ..utils import check_embeddings_close llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 @@ -29,23 +32,15 @@ MODELS = ["royokong/e5-v"] -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_models( - hf_runner, - vllm_runner, - image_assets, +def _run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + input_texts: List[str], + input_images: PromptImageInput, model: str, + *, dtype: str, ) -> None: - input_texts_images = [ - *((text, None) for text in HF_TEXT_PROMPTS), - *((text, asset.pil_image) - for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)), - ] - input_texts = [text for text, _ in input_texts_images] - input_images = [image for _, image in input_texts_images] - # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it @@ -75,7 +70,7 @@ def test_models( return_dict=True, output_hidden_states=True, ) - pooled_output = F.normalize(outputs.hidden_states[-1][:, -1, :], + pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :], dim=-1) all_outputs.append(pooled_output.tolist()) @@ -88,3 +83,53 @@ def test_models( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + model, + dtype=dtype, + ) diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index e913123995eb..ee411472ba28 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -1,7 +1,10 @@ +from typing import List, Type + import pytest import torch.nn.functional as F -from ....conftest import IMAGE_ASSETS +from 
....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test from ..utils import check_embeddings_close HF_TEXT_PROMPTS = [ @@ -23,23 +26,15 @@ MODELS = ["TIGER-Lab/VLM2Vec-Full"] -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_models( - hf_runner, - vllm_runner, - image_assets, +def _run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + input_texts: List[str], + input_images: PromptImageInput, model: str, + *, dtype: str, ) -> None: - input_texts_images = [ - *((text, None) for text in HF_TEXT_PROMPTS), - *((text, asset.pil_image) - for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)), - ] - input_texts = [text for text, _ in input_texts_images] - input_images = [image for _, image in input_texts_images] - # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it @@ -77,3 +72,53 @@ def test_models( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + model, + dtype=dtype, + )
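
Taken together, the series exposes royokong/e5-v (LLaVA-NeXT based) and TIGER-Lab/VLM2Vec-Full (Phi-3-Vision based) through task="embedding". The sketch below assembles an end-to-end E5-V run from the prompts and calls that appear in the diffs above; the cosine-similarity scoring at the end is an illustrative addition rather than something the patches themselves perform, and the "<image>" string is the LLaVA-NeXT image placeholder token assumed by the prompt template:

# Sketch only: pieces taken from examples/offline_inference_vision_language_embedding.py
# and tests/models/embedding/vision_language/test_llava_next.py in this series.
import torch
import torch.nn.functional as F

from vllm import LLM
from vllm.multimodal.utils import fetch_image

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

llm = LLM(model="royokong/e5-v", task="embedding", max_model_len=4096)

# A text query and an image document, using the E5-V prompt templates from the
# example script.
text_prompt = llama3_template.format(
    "A dog sitting in the grass\nSummary above sentence in one word: ")
image_prompt = llama3_template.format(
    "<image>\nSummary above image in one word: ")
image = fetch_image(
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
)

text_out = llm.encode({"prompt": text_prompt})[0]
image_out = llm.encode({
    "prompt": image_prompt,
    "multi_modal_data": {"image": image},
})[0]

# The pooler added to LlavaNextForConditionalGeneration uses
# PoolingType.LAST with normalize=True, so the embeddings are already
# normalized and cosine similarity reduces to a dot product.
text_emb = torch.tensor(text_out.outputs.embedding)
image_emb = torch.tensor(image_out.outputs.embedding)
print(F.cosine_similarity(text_emb, image_emb, dim=0).item())

The VLM2Vec path through Phi3VForCausalLM follows the same pattern, with the <|image_1|> placeholder and the caption/retrieval prompts shown in run_vlm2vec and test_phi3v.py.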