 
 import pytest
 from huggingface_hub import hf_hub_download
+from pytest import MarkDecorator
 from transformers import AutoTokenizer
 
 from tests.quantization.utils import is_quant_method_supported
 
 from ....conftest import VllmRunner
+from ....utils import multi_gpu_test
 from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
@@ -25,6 +27,7 @@ class GGUFTestConfig(NamedTuple):
     original_model: str
     gguf_repo: str
     gguf_filename: str
+    marks: list[MarkDecorator] = []
 
     @property
     def gguf_model(self):
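
The new `marks` field lets each test config carry its own pytest marks; they are attached to the parametrization below via `pytest.param(test_config, marks=test_config.marks)`. If the `quant_model` marker is not already declared in the project's pytest configuration, a hypothetical `conftest.py` hook like the following would register it so that `pytest -m quant_model` (or `-m "not quant_model"`) can filter on it:

    # Hypothetical conftest.py registration -- assumes quant_model is not
    # already declared in pytest.ini/pyproject.toml. Unregistered markers
    # trigger PytestUnknownMarkWarning during collection.
    def pytest_configure(config):
        config.addinivalue_line(
            "markers", "quant_model: test runs quantized model weights")
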
@@ -35,6 +38,7 @@ def gguf_model(self):
     original_model="meta-llama/Llama-3.2-1B-Instruct",
     gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
     gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
+    marks=[pytest.mark.quant_model],
 )
 
 QWEN2_CONFIG = GGUFTestConfig(
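
For reference, the body of the `gguf_model` property falls outside this hunk's context lines. Given the `hf_hub_download` import and the `gguf_repo`/`gguf_filename` fields, it plausibly resolves that pair to a locally cached file, roughly:

    # Sketch only -- the real property body is elided from this diff.
    @property
    def gguf_model(self):
        # Download the GGUF file from the Hub and return its local path,
        # which vllm_runner accepts as model_name.
        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
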
@@ -81,34 +85,24 @@ def gguf_model(self):
 ]
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gguf"),
-                    reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("tp_size", [1, 2])
-def test_models(
-    num_gpus_available: int,
+def check_model_outputs(
     vllm_runner: type[VllmRunner],
-    example_prompts: list[str],
+    prompts: list[str],
     model: GGUFTestConfig,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
     tp_size: int,
-) -> None:
-    if num_gpus_available < tp_size:
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
-
+):
     tokenizer = AutoTokenizer.from_pretrained(model.original_model)
     if tokenizer.chat_template is not None:
         messages = [[{
             'role': 'user',
             'content': prompt
-        }] for prompt in example_prompts]
-        example_prompts = tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
+        }] for prompt in prompts]
+        prompts = tokenizer.apply_chat_template(messages,
+                                                tokenize=False,
+                                                add_generation_prompt=True)
 
     # Run gguf model.
     with vllm_runner(model_name=model.gguf_model,
@@ -118,21 +112,67 @@ def test_models(
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as gguf_model:
         gguf_outputs = gguf_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
+            prompts[:-1], max_tokens, num_logprobs)
 
     # Run unquantized model.
+    # Should run with tp=1; otherwise the test will get stuck at
+    # NCCL initialization.
     with vllm_runner(
             model_name=model.original_model,
             enforce_eager=True,  # faster tests
             dtype=dtype,
             max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=tp_size) as original_model:
+            tensor_parallel_size=1) as original_model:
         original_outputs = original_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
+            prompts[:-1], max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=original_outputs,
         outputs_1_lst=gguf_outputs,
         name_0="original",
         name_1="gguf",
     )
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gguf"),
+                    reason="gguf is not supported on this GPU type.")
+@pytest.mark.parametrize("model", [
+    pytest.param(test_config, marks=test_config.marks)
+    for test_config in MODELS
+])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tp_size", [1])
+def test_models(
+    vllm_runner: type[VllmRunner],
+    example_prompts: list[str],
+    model: GGUFTestConfig,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tp_size: int,
+) -> None:
+    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
+                        num_logprobs, tp_size)
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gguf"),
+                    reason="gguf is not supported on this GPU type.")
+@pytest.mark.parametrize("model", [LLAMA_CONFIG])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tp_size", [2])
+@multi_gpu_test(num_gpus=2)
+def test_distributed(
+    vllm_runner: type[VllmRunner],
+    example_prompts: list[str],
+    model: GGUFTestConfig,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tp_size: int,
+) -> None:
+    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
+                        num_logprobs, tp_size)
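
The `@multi_gpu_test(num_gpus=2)` decorator replaces the old in-test `num_gpus_available < tp_size` skip. Its real implementation lives in `tests/utils.py` and is not shown in this diff; a minimal sketch of the required behavior, assuming it only needs to skip when too few GPUs are visible:

    # Minimal sketch, not the actual tests/utils.py implementation.
    import pytest
    import torch

    def multi_gpu_test(*, num_gpus: int):
        # Evaluated at collection time: attach a skipif mark so the test
        # is skipped on hosts with fewer than num_gpus visible devices.
        return pytest.mark.skipif(
            torch.cuda.device_count() < num_gpus,
            reason=f"Need at least {num_gpus} GPUs to run the test.")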