
Commit d1ca7df

[VLM] Merged multi-modal processor for InternVL-based models (#12553)
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
1 parent 96b2362 commit d1ca7df

34 files changed (+1434 −986 lines)

docs/source/contributing/model/multimodal.md

Lines changed: 5 additions & 1 deletion
@@ -250,7 +250,11 @@ def get_max_image_tokens(self) -> int:
 And thus, we can override the method as:
 
 ```python
-def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+def get_mm_max_tokens_per_item(
+    self,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+) -> Mapping[str, int]:
     return {"image": self.get_max_image_tokens()}
 ```
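To see the updated signature in isolation, here is a minimal, self-contained sketch; the `MyProcessingInfo` class name, the `seq_len` value, and the fixed token budget of 256 are illustrative assumptions, not part of this commit:

```python
from collections.abc import Mapping


class MyProcessingInfo:
    """Hypothetical processing-info class, used only to show the signature."""

    def get_max_image_tokens(self) -> int:
        # Assumed per-image token budget, purely for illustration.
        return 256

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        # The new signature also receives the per-modality item counts,
        # even though this simple override does not need them.
        return {"image": self.get_max_image_tokens()}


print(MyProcessingInfo().get_mm_max_tokens_per_item(8192, {"image": 1}))
# {'image': 256}
```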

docs/source/models/supported_models.md

Lines changed: 7 additions & 3 deletions
@@ -726,7 +726,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
   *
   * ✅︎
-  *
+  * \*
 - * `Idefics3ForConditionalGeneration`
   * Idefics3
   * T + I
@@ -799,7 +799,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * ✅︎
 - * `NVLM_D_Model`
   * NVLM-D 1.0
-  * T + I<sup>E+</sup>
+  * T + I<sup>+</sup>
   * `nvidia/NVLM-D-72B`, etc.
   *
   * ✅︎
@@ -859,7 +859,11 @@ See [this page](#generative-models) for more information on how to use generativ
 <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
 
 :::{note}
-To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
+:::
+
+:::{note}
+H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
 :::
 
 :::{note}
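For readers following the DeepSeek-VL2 note above, a minimal sketch of the same override through the offline Python API rather than the CLI flag; it assumes `LLM` accepts `hf_overrides` as a keyword argument, and the model ID is the one listed in the test matrix later in this commit:

```python
from vllm import LLM

# Assumed Python-API equivalent of the CLI flag shown in the note:
#   --hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'
llm = LLM(
    model="deepseek-ai/deepseek-vl2-tiny",
    hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)
```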

tests/models/decoder_only/vision_language/test_h2ovl.py

Lines changed: 0 additions & 131 deletions
This file was deleted.

tests/models/decoder_only/vision_language/test_models.py

Lines changed: 1 addition & 1 deletion
@@ -250,6 +250,7 @@
         max_model_len=8192,
         dtype="bfloat16",
         use_tokenizer_eos=True,
+        num_logprobs=10,
         patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
     ),
     "idefics3": VLMTestInfo(
@@ -282,7 +283,6 @@
         dtype="bfloat16",
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
-        marks=[large_gpu_mark(min_gb=32)],
     ),
     "llava_next": VLMTestInfo(
         models=["llava-hf/llava-v1.6-mistral-7b-hf"],

tests/models/decoder_only/vision_language/vlm_utils/model_utils.py

Lines changed: 21 additions & 16 deletions
@@ -334,12 +334,12 @@ class H2OVLProcessor:
     def __init__(self, hf_runner: HfRunner):
         self.num_image_token = hf_runner.model.num_image_token
         self.tokenizer = hf_runner.tokenizer
-        self.dtype = hf_runner.model.dtype
 
         self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                  trust_remote_code=True)
         self.vision_config = self.config.vision_config
         self.use_thumbnail = self.config.use_thumbnail
+        self.use_msac = self.config.use_msac
         self.min_num = self.config.min_dynamic_patch
         self.max_num = self.config.max_dynamic_patch
         self.image_size = self.vision_config.image_size
@@ -348,18 +348,19 @@ def __call__(self, text: str, images: Union[Image, List[Image]],
                  **kwargs):
         # yapf: disable
         from vllm.model_executor.models.h2ovl import (
-            IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+            IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
 
         # yapf: enable
         images = [images] if isinstance(images, Image) else images
         pixel_values = [
-            image_to_pixel_values(image,
-                                  self.image_size,
-                                  self.min_num,
-                                  self.max_num,
-                                  self.use_thumbnail,
-                                  use_MSAC=self.config.use_msac).to(
-                                      self.dtype) for image in images
+            image_to_pixel_values_h2ovl(
+                image,
+                input_size=self.image_size,
+                min_num=self.min_num,
+                max_num=self.max_num,
+                use_thumbnail=self.use_thumbnail,
+                use_msac=self.use_msac,
+            ) for image in images
         ]
         num_patches_list = [
             pixel_value.shape[0] for pixel_value in pixel_values
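As a usage sketch of the renamed helper, grounded in the keyword arguments shown above: the concrete image size and patch limits are illustrative (in the test they come from the HF config), and the import path assumes a vLLM checkout at or after this commit.

```python
from PIL import Image

# Renamed from image_to_pixel_values in this commit.
from vllm.model_executor.models.h2ovl import image_to_pixel_values_h2ovl

image = Image.new("RGB", (1280, 720))

# Illustrative values; H2OVLProcessor reads them from vision_config.image_size,
# min/max_dynamic_patch, use_thumbnail, and use_msac instead.
pixel_values = image_to_pixel_values_h2ovl(
    image,
    input_size=448,
    min_num=1,
    max_num=6,
    use_thumbnail=True,
    use_msac=False,
)
print(pixel_values.shape[0])  # number of image patches, as counted above
```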
@@ -394,7 +395,6 @@ class InternVLProcessor:
     def __init__(self, hf_runner: HfRunner):
         self.num_image_token = hf_runner.model.num_image_token
         self.tokenizer = hf_runner.tokenizer
-        self.dtype = hf_runner.model.dtype
 
         self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                  trust_remote_code=True)
@@ -407,13 +407,17 @@ def __init__(self, hf_runner: HfRunner):
     def __call__(self, text: str, images: Union[Image, List[Image]],
                  **kwargs):
         from vllm.model_executor.models.internvl import (
-            IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+            IMG_CONTEXT, IMG_END, IMG_START,
+            image_to_pixel_values_internvl)
         images = [images] if isinstance(images, Image) else images
         pixel_values = [
-            image_to_pixel_values(image, self.image_size, self.min_num,
-                                  self.max_num,
-                                  self.use_thumbnail).to(self.dtype)
-            for image in images
+            image_to_pixel_values_internvl(
+                image,
+                input_size=self.image_size,
+                min_num=self.min_num,
+                max_num=self.max_num,
+                use_thumbnail=self.use_thumbnail,
+            ) for image in images
         ]
         num_patches_list = [
             pixel_value.shape[0] for pixel_value in pixel_values
@@ -448,7 +452,8 @@ def _internvl_generate(
 ) -> torch.LongTensor:
     """Generate method for InternVL2 model without fixed use_cache."""
     assert self.img_context_token_id is not None
-    vit_embeds = self.extract_feature(pixel_values)
+    target_dtype = next(self.parameters()).dtype
+    vit_embeds = self.extract_feature(pixel_values.to(target_dtype))
     input_embeds = self.language_model.get_input_embeddings()(input_ids)
     B, N, C = input_embeds.shape
     input_embeds = input_embeds.reshape(B * N, C)
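The dtype handling added to `_internvl_generate` follows a common pattern: look up the dtype of the module's own parameters and cast the incoming tensor to it. A minimal standalone sketch of that pattern (the `TinyTower` module is a stand-in for the real vision tower, not vLLM code):

```python
import torch
from torch import nn


class TinyTower(nn.Module):
    """Stand-in module, used only to show the parameter-dtype cast."""

    def __init__(self) -> None:
        super().__init__()
        self.proj = nn.Linear(4, 4).to(torch.bfloat16)

    def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # Same pattern as above: match the parameters' dtype so float32
        # pixel values work with a bfloat16 (or float16) model.
        target_dtype = next(self.parameters()).dtype
        return self.proj(pixel_values.to(target_dtype))


tower = TinyTower()
print(tower.extract_feature(torch.randn(2, 4)).dtype)  # torch.bfloat16
```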

tests/models/multimodal/processing/test_common.py

Lines changed: 3 additions & 1 deletion
@@ -141,13 +141,14 @@ def _test_processing_correctness(
 
 
 # yapf: disable
-# True if the model supports multiple data items of the modality per request
 @pytest.mark.parametrize("model_id", [
     "rhymes-ai/Aria",
     "Salesforce/blip2-opt-2.7b",
     "facebook/chameleon-7b",
     "deepseek-ai/deepseek-vl2-tiny",
     "adept/fuyu-8b",
+    "h2oai/h2ovl-mississippi-800m",
+    "OpenGVLab/InternVL2-1B",
     "llava-hf/llava-1.5-7b-hf",
     "llava-hf/llava-v1.6-mistral-7b-hf",
     "llava-hf/LLaVA-NeXT-Video-7B-hf",
@@ -156,6 +157,7 @@ def _test_processing_correctness(
     "mistral-community/pixtral-12b",
     "openbmb/MiniCPM-o-2_6",
     "openbmb/MiniCPM-V-2_6",
+    "nvidia/NVLM-D-72B",
     "Qwen/Qwen-VL-Chat",
     "Qwen/Qwen2-VL-2B-Instruct",
     "Qwen/Qwen2-Audio-7B-Instruct",
