Commit f6b32ef

[Bugfix] Fix and reorganize broken GGUF tests and bump gguf version (#16194)
Signed-off-by: Isotr0py <[email protected]>
1 parent b99733d commit f6b32ef

File tree: 2 files changed (+61 / -21 lines)


requirements/common.txt (1 addition, 1 deletion)
@@ -28,7 +28,7 @@ filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/31
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq
 msgspec
-gguf == 0.10.0
+gguf >= 0.13.0
 importlib_metadata
 mistral_common[opencv] >= 1.5.4
 opencv-python-headless >= 4.11.0 # required for video IO
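
The new lower bound can be checked against the installed package at runtime. The snippet below is a minimal sketch, not part of the commit, and assumes the gguf and packaging packages are installed.

# Sketch only: confirm the installed gguf package meets the new ">= 0.13.0"
# floor from requirements/common.txt.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("gguf"))
assert installed >= Version("0.13.0"), (
    f"gguf {installed} is older than the 0.13.0 floor required here")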

tests/models/decoder_only/language/test_gguf.py (60 additions, 20 deletions)
@@ -9,11 +9,13 @@
 
 import pytest
 from huggingface_hub import hf_hub_download
+from pytest import MarkDecorator
 from transformers import AutoTokenizer
 
 from tests.quantization.utils import is_quant_method_supported
 
 from ....conftest import VllmRunner
+from ....utils import multi_gpu_test
 from ...utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
@@ -25,6 +27,7 @@ class GGUFTestConfig(NamedTuple):
     original_model: str
     gguf_repo: str
     gguf_filename: str
+    marks: list[MarkDecorator] = []
 
     @property
     def gguf_model(self):
@@ -35,6 +38,7 @@ def gguf_model(self):
     original_model="meta-llama/Llama-3.2-1B-Instruct",
     gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
     gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
+    marks=[pytest.mark.quant_model],
 )
 
 QWEN2_CONFIG = GGUFTestConfig(
@@ -81,34 +85,24 @@ def gguf_model(self):
 ]
 
 
-@pytest.mark.skipif(not is_quant_method_supported("gguf"),
-                    reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("tp_size", [1, 2])
-def test_models(
-    num_gpus_available: int,
+def check_model_outputs(
     vllm_runner: type[VllmRunner],
-    example_prompts: list[str],
+    prompts: list[str],
     model: GGUFTestConfig,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
     tp_size: int,
-) -> None:
-    if num_gpus_available < tp_size:
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
-
+):
     tokenizer = AutoTokenizer.from_pretrained(model.original_model)
     if tokenizer.chat_template is not None:
         messages = [[{
             'role': 'user',
             'content': prompt
-        }] for prompt in example_prompts]
-        example_prompts = tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True)
+        }] for prompt in prompts]
+        prompts = tokenizer.apply_chat_template(messages,
+                                                tokenize=False,
+                                                add_generation_prompt=True)
 
     # Run gguf model.
     with vllm_runner(model_name=model.gguf_model,
@@ -118,21 +112,67 @@ def test_models(
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=tp_size) as gguf_model:
         gguf_outputs = gguf_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
+            prompts[:-1], max_tokens, num_logprobs)
 
     # Run unquantized model.
+    # Should run with tp=1, otherwise the test will stuck at
+    # nccl initialization.
     with vllm_runner(
             model_name=model.original_model,
             enforce_eager=True,  # faster tests
             dtype=dtype,
             max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=tp_size) as original_model:
+            tensor_parallel_size=1) as original_model:
         original_outputs = original_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
+            prompts[:-1], max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=original_outputs,
         outputs_1_lst=gguf_outputs,
         name_0="original",
         name_1="gguf",
     )
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gguf"),
+                    reason="gguf is not supported on this GPU type.")
+@pytest.mark.parametrize("model", [
+    pytest.param(test_config, marks=test_config.marks)
+    for test_config in MODELS
+])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tp_size", [1])
+def test_models(
+    vllm_runner: type[VllmRunner],
+    example_prompts: list[str],
+    model: GGUFTestConfig,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tp_size: int,
+) -> None:
+    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
+                        num_logprobs, tp_size)
+
+
+@pytest.mark.skipif(not is_quant_method_supported("gguf"),
+                    reason="gguf is not supported on this GPU type.")
+@pytest.mark.parametrize("model", [LLAMA_CONFIG])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("tp_size", [2])
+@multi_gpu_test(num_gpus=2)
+def test_distributed(
+    vllm_runner: type[VllmRunner],
+    example_prompts: list[str],
+    model: GGUFTestConfig,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    tp_size: int,
+) -> None:
+    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
+                        num_logprobs, tp_size)
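
The reorganization hinges on the new per-config marks field combined with pytest.param, which attaches marks to individual parametrized cases. The following is a minimal standalone sketch of that pattern, not code from the commit: the Config class, CONFIGS list, and test_marked_configs function are illustrative, and quant_model stands in for the mark registered in vLLM's pytest configuration.

import pytest


class Config:
    """Illustrative stand-in for GGUFTestConfig: a case name plus pytest marks."""

    def __init__(self, name, marks=()):
        self.name = name
        self.marks = list(marks)


CONFIGS = [
    Config("llama", marks=[pytest.mark.quant_model]),  # marked case
    Config("qwen2"),  # unmarked case
]


@pytest.mark.parametrize(
    "config", [pytest.param(cfg, marks=cfg.marks) for cfg in CONFIGS])
def test_marked_configs(config):
    # Cases built from configs that carry marks can be selected with
    # `pytest -m quant_model` or excluded with `pytest -m "not quant_model"`.
    assert config.name

With this split, the single-GPU path in test_models stays on tp_size=1 while the multi-GPU path lives in test_distributed behind multi_gpu_test(num_gpus=2), so a single-GPU invocation such as `pytest tests/models/decoder_only/language/test_gguf.py -k test_models` no longer parametrizes over an unreachable tp_size=2 case.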
