 
 import pytest
 import vllm  # noqa: F401
+from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 
 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
     "Qwen/Qwen3-0.6B-Base",
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+
+QUANTIZATION_MODELS = [
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
+    "vllm-ascend/DeepSeek-V2-Lite-W8A8"
+]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 
@@ -59,6 +64,27 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 
 
+@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
+@pytest.mark.parametrize("max_tokens", [5])
+def test_quantization_models(model: str, max_tokens: int) -> None:
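+    # Concatenate a 1024-number sequence into the prompt so the quantized
+    # model is exercised with a reasonably long input context.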
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    # NOTE: Loading the quantized model by its ModelScope repo id currently
+    # hits an issue; https://github.com/vllm-project/vllm/pull/19212 fixes it.
+    # Once that PR is merged, the explicit download below can be removed.
+    model_path = snapshot_download(model)
+
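+    # Run a short greedy-decoding smoke test through the Ascend quantization
+    # backend; enforce_eager keeps the run in eager mode (no graph capture).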
+    with VllmRunner(model_path,
+                    max_model_len=8192,
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7,
+                    quantization="ascend") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 def test_multimodal(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \