|
113 | 113 | dtype="bfloat16" if current_platform.is_cpu() else "auto", |
114 | 114 | marks=[pytest.mark.core_model, pytest.mark.cpu_model], |
115 | 115 | ), |
116 | | - "paligemma": VLMTestInfo( |
117 | | - models=["google/paligemma-3b-mix-224"], |
118 | | - test_type=VLMTestType.IMAGE, |
119 | | - prompt_formatter=identity, |
120 | | - img_idx_to_prompt=lambda idx: "", |
121 | | - # Paligemma uses its own sample prompts because the default one fails |
122 | | - single_image_prompts=IMAGE_ASSETS.prompts( |
123 | | - { |
124 | | - "stop_sign": "caption es", |
125 | | - "cherry_blossom": "What is in the picture?", |
126 | | - } |
127 | | - ), |
128 | | - auto_cls=AutoModelForImageTextToText, |
129 | | - vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, |
130 | | - dtype="bfloat16", |
131 | | - marks=[ |
132 | | - pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask") |
133 | | - ], |
134 | | - ), |
135 | 116 | "qwen2_5_vl": VLMTestInfo( |
136 | 117 | models=["Qwen/Qwen2.5-VL-3B-Instruct"], |
137 | 118 | test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), |
|
196 | 177 | # Gemma3 has bidirectional mask on images |
197 | 178 | "gemma3-transformers": VLMTestInfo( |
198 | 179 | models=["google/gemma-3-4b-it"], |
199 | | - test_type=VLMTestType.IMAGE, |
200 | | - prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n", # noqa: E501 |
201 | | - max_model_len=4096, |
| 180 | + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), |
| 181 | + prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 |
| 182 | + single_image_prompts=IMAGE_ASSETS.prompts( |
| 183 | + { |
| 184 | + "stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501 |
| 185 | + "cherry_blossom": "<start_of_image>What is the season?", |
| 186 | + } |
| 187 | + ), |
| 188 | + multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 |
| 189 | + max_model_len=8192, |
202 | 190 | auto_cls=AutoModelForImageTextToText, |
| 191 | + # TODO: Support `do_pan_and_scan` in transformers backend |
| 192 | + # patch_hf_runner=model_utils.gemma3_patch_hf_runner, |
203 | 193 | vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output, |
204 | 194 | image_size_factors=[(0.25, 0.5, 1.0)], |
205 | 195 | vllm_runner_kwargs={ |
206 | 196 | "model_impl": "transformers", |
| 197 | + # "mm_processor_kwargs": {"do_pan_and_scan": True}, |
207 | 198 | }, |
208 | 199 | marks=[pytest.mark.core_model], |
209 | 200 | ), |
|
222 | 213 | }, |
223 | 214 | marks=[pytest.mark.core_model], |
224 | 215 | ), |
| 216 | + # PaliGemma has PrefixLM attention |
| 217 | + "paligemma-transformers": VLMTestInfo( |
| 218 | + models=["google/paligemma-3b-mix-224"], |
| 219 | + test_type=VLMTestType.IMAGE, |
| 220 | + prompt_formatter=identity, |
| 221 | + img_idx_to_prompt=lambda idx: "", |
| 222 | + # PaliGemma uses its own sample prompts because the default one fails |
| 223 | + single_image_prompts=IMAGE_ASSETS.prompts( |
| 224 | + { |
| 225 | + "stop_sign": "caption es", |
| 226 | + "cherry_blossom": "What is in the picture?", |
| 227 | + } |
| 228 | + ), |
| 229 | + auto_cls=AutoModelForImageTextToText, |
| 230 | + vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, |
| 231 | + image_size_factors=[(0.25, 0.5, 1.0)], |
| 232 | + vllm_runner_kwargs={ |
| 233 | + "model_impl": "transformers", |
| 234 | + }, |
| 235 | + marks=[pytest.mark.core_model], |
| 236 | + ), |
225 | 237 | # Pixel values from processor are not 4D or 5D arrays |
226 | 238 | "qwen2_5_vl-transformers": VLMTestInfo( |
227 | 239 | models=["Qwen/Qwen2.5-VL-3B-Instruct"], |
|
348 | 360 | image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], |
349 | 361 | marks=[large_gpu_mark(min_gb=32)], |
350 | 362 | ), |
351 | | - "gemma3": VLMTestInfo( |
352 | | - models=["google/gemma-3-4b-it"], |
353 | | - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), |
354 | | - prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 |
355 | | - single_image_prompts=IMAGE_ASSETS.prompts( |
356 | | - { |
357 | | - "stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501 |
358 | | - "cherry_blossom": "<start_of_image>What is the season?", |
359 | | - } |
360 | | - ), |
361 | | - multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 |
362 | | - max_model_len=4096, |
363 | | - max_num_seqs=2, |
364 | | - auto_cls=AutoModelForImageTextToText, |
365 | | - vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, |
366 | | - patch_hf_runner=model_utils.gemma3_patch_hf_runner, |
367 | | - num_logprobs=10, |
368 | | - ), |
369 | 363 | "glm4v": VLMTestInfo( |
370 | 364 | models=["zai-org/glm-4v-9b"], |
371 | 365 | test_type=VLMTestType.IMAGE, |
|
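For context on what the new gemma3-transformers entry exercises, below is a minimal offline sketch of the roughly equivalent vLLM call, assuming the standard `vllm.LLM` API and the `vllm.assets.image.ImageAsset` helper. The model name, backend (`model_impl="transformers"`), prompt format, and `max_model_len` are taken from the entry above; the sampling settings and asset choice are illustrative only, not part of this diff.

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Sketch only: mirrors the gemma3-transformers VLMTestInfo entry registered above.
llm = LLM(
    model="google/gemma-3-4b-it",
    model_impl="transformers",  # same backend the entry selects via vllm_runner_kwargs
    max_model_len=8192,
)

# Prompt built from prompt_formatter plus the "stop_sign" single-image prompt above.
prompt = (
    "<bos><start_of_turn>user\n"
    "<start_of_image>What's the content in the center of the image?"
    "<end_of_turn>\n<start_of_turn>model\n"
)
image = ImageAsset("stop_sign").pil_image  # assumed asset helper, not shown in the diff

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)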