
Commit 4dbfcf7

Move e2e tests to new file, add to test pipeline
Signed-off-by: Luka Govedič <[email protected]>
1 parent d3f95fe commit 4dbfcf7

3 files changed: +255 -202 lines changed


.buildkite/test-pipeline.yaml

Lines changed: 7 additions & 6 deletions
@@ -796,8 +796,8 @@ steps:
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- - label: Blackwell Test # 38 min
-   timeout_in_minutes: 60
+ - label: Blackwell Test # 48 min
+   timeout_in_minutes: 70
    working_dir: "/vllm-workspace/"
    gpu: b200
    # optional: true
@@ -810,8 +810,7 @@ steps:
    - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
    - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
    - vllm/v1/attention/backends/flashinfer.py
-   - vllm/compilation/fusion.py
-   - vllm/compilation/fusion_attn.py
+   - vllm/compilation/
    commands:
    - nvidia-smi
    - python3 examples/offline_inference/basic/chat.py
@@ -828,15 +827,16 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+   - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+   - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
    # Fusion
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
-   - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
-   - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
+   - pytest -v -s tests/compile/test_fusions_e2e.py

  - label: Blackwell GPT-OSS Eval
    timeout_in_minutes: 60
@@ -1109,6 +1109,7 @@ steps:
    commands:
    - pytest -v -s tests/distributed/test_context_parallel.py
    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+   - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm

  ##### RL Integration Tests #####
  - label: Prime-RL Integration Test # 15min
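
Net effect of the pipeline changes above: the Blackwell step now depends on the whole vllm/compilation/ directory instead of the two fusion files, the qutlass quantization kernel tests move up into the quantization group, and the consolidated tests/compile/test_fusions_e2e.py runs at the end of the step (the step's estimated time grows from 38 to 48 minutes and its timeout from 60 to 70 minutes); the 2-GPU distributed step additionally runs the TP2 attention-quant + allreduce-RMSNorm fusion test from that file. For context, these e2e fusion tests build a vLLM CompilationConfig with the fusion passes enabled and then verify fusion through captured worker logs. Below is a minimal sketch of that config, reconstructed from the code removed from tests/compile/test_full_graph.py in the next diff; it is illustrative only, since the new test_fusions_e2e.py itself is not reproduced here.

# Minimal sketch (assumption: mirrors the setup in the removed tests below, which
# presumably moved to tests/compile/test_fusions_e2e.py) of the compilation config
# the e2e fusion tests exercise.
from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig

compilation_config = CompilationConfig(
    # Test-specific knobs: custom-op toggles, graph partitioning, cudagraph mode
    custom_ops=["-quant_fp8"],
    use_inductor_graph_partition=False,
    cudagraph_mode=CUDAGraphMode.FULL_DECODE_ONLY,
    splitting_ops=[],
    # Common settings: piecewise compilation with attention-quant and noop fusion passes
    level=CompilationLevel.PIECEWISE,
    pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
    # Inductor caches custom passes by default (via uuid), so disable caches
    # to make sure the passes actually run and their fusion logs can be checked.
    inductor_compile_config={"force_disable_caches": True},
)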

tests/compile/test_full_graph.py

Lines changed: 2 additions & 196 deletions
@@ -3,25 +3,19 @@

  from __future__ import annotations

- import itertools
- import logging
  import tempfile
- from collections.abc import Iterable
- from typing import Any, Optional, Union
+ from typing import Any, Union

  import pytest
- import regex as re
  import torch

  from tests.quantization.utils import is_quant_method_supported
  from vllm import LLM, SamplingParams
- from vllm.attention.backends.registry import _Backend
  from vllm.config import CompilationConfig, CompilationLevel, CUDAGraphMode, PassConfig
  from vllm.platforms import current_platform
  from vllm.utils import is_torch_equal_or_newer
- from vllm.utils.flashinfer import has_flashinfer

- from ..utils import create_new_process_for_each_test, flat_product, multi_gpu_test
+ from ..utils import create_new_process_for_each_test


  def models_list(*, all: bool = True, keywords: list[str] | None = None):
@@ -189,194 +183,6 @@ def test_fp8_kv_scale_compile(optimization_level: int):
      run_model(optimization_level, model, **model_kwargs)


- MODELS_FP8: list[tuple[str, dict[str, Any], _Backend]] = []
- MODELS_FP4: list[tuple[str, dict[str, Any], _Backend]] = []
- MODELS: list[tuple[str, dict[str, Any], _Backend]] = []  # tp-only
-
- if current_platform.is_cuda():
-     MODELS_FP8 += [
-         (
-             "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
-             {"max_model_len": 1024},
-             _Backend.TRITON_ATTN,
-         )
-     ]
-
-     if current_platform.is_device_capability((10, 0)):
-         MODELS_FP8 += [
-             (
-                 "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
-                 {"kv_cache_dtype": "fp8", "max_model_len": 1024},
-                 _Backend.FLASHINFER,
-             )
-         ]
-
-         MODELS_FP4 += [
-             (
-                 "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
-                 {"kv_cache_dtype": "fp8", "max_model_len": 1024},
-                 _Backend.FLASHINFER,
-             )
-         ]
-
-     MODELS += [
-         (
-             "meta-llama/Llama-3.1-8B-Instruct",
-             {"max_model_len": 1024},
-             _Backend.FLASHINFER,
-         )
-     ]
-
- elif current_platform.is_rocm():
-     MODELS_FP8 += [("amd/Llama-3.1-8B-Instruct-FP8-KV", {}, _Backend.TRITON_ATTN)]
-
- INDUCTOR_GRAPH_PARTITION = (
-     [True, False] if (is_torch_equal_or_newer("2.9.0.dev")) else [False]
- )
-
- # TODO(luka) test both in nightly
- CUSTOM_OPS_FP8 = ["-quant_fp8"]  # , "+quant_fp8"]
-
-
- @pytest.mark.parametrize(
-     "model_name, model_kwargs, backend, custom_ops",
-     # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
-     list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
-     # quant_fp4 only has the custom impl
-     + list(flat_product(MODELS_FP4, [""])),
- )
- @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
- def test_e2e_fusion_attn_quant(
-     model_name: str,
-     model_kwargs: dict[str, Any],
-     backend: _Backend,
-     custom_ops: str,
-     inductor_graph_partition: bool,
-     caplog_mp_spawn,
-     monkeypatch,
- ):
-     custom_ops_list = custom_ops.split(",") if custom_ops else []
-
-     if inductor_graph_partition:
-         mode = CUDAGraphMode.FULL_AND_PIECEWISE
-         splitting_ops: Optional[list[str]] = None
-     else:
-         mode = CUDAGraphMode.FULL_DECODE_ONLY
-         splitting_ops = []
-
-     # Disable, compile cache to make sure custom passes run.
-     # Otherwise, we can't verify fusion happened through the logs.
-     # Log capture also doesn't work with multiprocessing yet.
-     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-     # To capture subprocess logs, we need to know whether spawn or fork is used.
-     # Force spawn as it is more general.
-     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-     monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
-
-     compilation_config = CompilationConfig(
-         # Testing properties
-         custom_ops=custom_ops_list,
-         use_inductor_graph_partition=inductor_graph_partition,
-         cudagraph_mode=mode,
-         splitting_ops=splitting_ops,
-         # Common
-         level=CompilationLevel.PIECEWISE,
-         pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
-         # Inductor caches custom passes by default as well via uuid
-         inductor_compile_config={"force_disable_caches": True},
-     )
-
-     with caplog_mp_spawn(logging.DEBUG) as log_holder:
-         run_model(compilation_config, model_name, **model_kwargs)
-
-     assert "Fused quant onto 48 attention nodes" in log_holder.text, log_holder.text
-
-
- # TODO(luka) test both in nightly
- # TODO(luka) change to -
- CUSTOM_OPS_RMS_NORM = ["+rms_norm"]  # , "+rms_norm"]
-
-
- def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
-     for op_list in itertools.product(*custom_ops_lists):
-         yield ",".join(op_list)
-
-
- @multi_gpu_test(num_gpus=2)
- @pytest.mark.parametrize(
-     "model_name, model_kwargs, backend, custom_ops",
-     # Toggle RMSNorm and QuantFP8 for FP8 models
-     list(flat_product(MODELS_FP8, ["+quant_fp8,+rms_norm"]))
-     # custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM))) # TODO
-     # Toggle RMSNorm for FP4 models and unquant models
-     + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
- )
- @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
- @pytest.mark.skipif(
-     not current_platform.is_cuda()
-     or not has_flashinfer()
-     or not current_platform.has_device_capability(90),
-     reason="allreduce+rmsnorm fusion requires flashinfer",
- )
- def test_e2e_fusion_tp2_attn_quant_allreduce_rmsnorm(
-     model_name,
-     model_kwargs,
-     backend,
-     custom_ops: str,
-     inductor_graph_partition: bool,
-     caplog_mp_spawn,
-     monkeypatch,
- ):
-     custom_ops_list = custom_ops.split(",") if custom_ops else []
-
-     if inductor_graph_partition:
-         mode = CUDAGraphMode.FULL_AND_PIECEWISE
-         splitting_ops: Optional[list[str]] = None
-     else:
-         mode = CUDAGraphMode.FULL_DECODE_ONLY
-         splitting_ops = []
-
-     # Disable, compile cache to make sure custom passes run.
-     # Otherwise, we can't verify fusion happened through the logs.
-     # Log capture also doesn't work with multiprocessing yet.
-     monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-     # To capture subprocess logs, we need to know whether spawn or fork is used.
-     # Force spawn as it is more general.
-     monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-     monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
-
-     compilation_config = CompilationConfig(
-         # Testing properties
-         use_inductor_graph_partition=inductor_graph_partition,
-         cudagraph_mode=mode,
-         custom_ops=custom_ops_list,
-         splitting_ops=splitting_ops,
-         # Common
-         level=CompilationLevel.PIECEWISE,
-         pass_config=PassConfig(
-             enable_attn_fusion=True,
-             enable_noop=True,
-             enable_fi_allreduce_fusion=True,
-         ),
-         # Inductor caches custom passes by default as well via uuid
-         inductor_compile_config={"force_disable_caches": True},
-     )
-
-     with caplog_mp_spawn(logging.DEBUG) as log_holder:
-         run_model(
-             compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
-         )
-
-     assert "Fused quant onto 48 attention nodes" in log_holder.text, log_holder.text
-
-     matches = re.findall(
-         r"\[collective_fusion.py:\d+] Replaced 96 patterns", log_holder.text
-     )
-     assert len(matches) == 2, log_holder.text
-
-
  def run_model(
      compile_config: Union[int, CompilationConfig], model: str, **model_kwargs
  ):
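
The removed tests are parametrized by pairing (model name, model kwargs, attention backend) tuples with comma-separated custom-op strings. custom_ops_product is defined in the removed code above; flat_product is imported from the shared test utilities and is not shown in this diff, so the version below is a hypothetical re-implementation sketched purely for illustration (assumed behavior: cartesian product with tuple elements flattened into a single argument tuple).

import itertools
from collections.abc import Iterable


def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
    # Join one entry from each list into a single comma-separated custom_ops string,
    # e.g. ("-quant_fp8", "+rms_norm") -> "-quant_fp8,+rms_norm".
    for op_list in itertools.product(*custom_ops_lists):
        yield ",".join(op_list)


def flat_product(*iterables):
    # Hypothetical sketch of the helper imported from the test utilities (an
    # assumption, not the repo's actual implementation): take the cartesian
    # product and splice tuple elements so each result is one flat tuple.
    for combo in itertools.product(*iterables):
        flat = []
        for item in combo:
            if isinstance(item, tuple):
                flat.extend(item)
            else:
                flat.append(item)
        yield tuple(flat)

With that assumed behavior, list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8)) expands to one (model_name, model_kwargs, backend, custom_ops) tuple per combination, matching the parametrize signature used in the removed tests.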
