@@ -10,7 +10,7 @@
 
 import pytest
 from transformers import (AutoModel, AutoModelForImageTextToText,
-                          AutoModelForTextToWaveform, AutoModelForVision2Seq)
+                          AutoModelForTextToWaveform)
 
 from vllm.platforms import current_platform
 from vllm.utils import identity
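
Context for the import change above: AutoModelForImageTextToText is the auto class that recent Transformers releases register multimodal (image-text-to-text) checkpoints under, and these tests now use it wherever they previously used AutoModelForVision2Seq. A minimal, hedged sketch of the equivalent standalone usage, reusing the LLaVA-OneVision checkpoint already referenced in this file:

# Sketch only, not part of the diff: load a vision-language checkpoint through
# the newer auto class instead of AutoModelForVision2Seq. Whether a given
# checkpoint resolves under this class depends on the installed Transformers
# version.
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(model_id)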

@@ -137,7 +137,7 @@
         video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
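
The remaining hunks repeat the same swap as this qwen2-vl entry: auto_cls selects which Transformers auto class the HF reference side of the comparison uses to instantiate the checkpoint. A rough, hedged sketch of how such a field could be consumed; load_hf_reference and its signature are illustrative assumptions, not vLLM's actual runner code:

# Hypothetical stand-in for the HF-side loader these test entries configure.
from transformers import AutoModelForImageTextToText

def load_hf_reference(model_id, auto_cls=AutoModelForImageTextToText,
                      **hf_model_kwargs):
    # The auto class maps the checkpoint's config to a concrete model class;
    # a checkpoint that is not registered under it will fail to load here.
    return auto_cls.from_pretrained(model_id, **hf_model_kwargs)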

@@ -502,7 +502,7 @@
         num_video_frames=16,
         max_model_len=16384,
         hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(

@@ -518,7 +518,7 @@
         num_video_frames=16,
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
     ),
     "mantis": VLMTestInfo(

@@ -680,7 +680,7 @@
         multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.",  # noqa: E501
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.cpu_model],

@@ -784,7 +784,7 @@
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=16384,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),  # noqa: E501
         vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(

@@ -800,7 +800,7 @@
         test_type=VLMTestType.CUSTOM_INPUTS,
         max_model_len=4096,
         max_num_seqs=2,
-        auto_cls=AutoModelForVision2Seq,
+        auto_cls=AutoModelForImageTextToText,
         vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
         custom_test_opts=[CustomTestOptions(
             inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),