Commit ec0ff9f

[V0 deprecation] Remove V0 CPU (#20437)
Signed-off-by: jiang1.li <[email protected]>
1 parent c139974 · commit ec0ff9f

9 files changed: +811 −774 lines changed

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 12 additions & 12 deletions
@@ -48,10 +48,16 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+    # Note: disable until supports V1
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+
     pytest -v -s tests/models/language/pooling -m cpu_model
     pytest -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -62,21 +68,15 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

+  # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
   #   VLLM_USE_V1=0 pytest -s -v \
   #     tests/quantization/test_ipex_quant.py"

-  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v -k cpu_model \
-      tests/basic_correctness/test_chunked_prefill.py"
-
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
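
For local debugging, the updated generation-test selection can be reproduced outside the CI container. The following is a minimal sketch (not part of the commit), assuming a working CPU build of vLLM and the repository root as the working directory; it simply mirrors the pytest arguments from the script above via pytest.main:

# Hypothetical helper script; markers, paths and flags mirror run-cpu-test.sh above.
import os
import pytest

if __name__ == "__main__":
    args = [
        "-v", "-s",
        "tests/models/language/generation",
        "-m", "cpu_model",
        "--ignore=tests/models/language/generation/test_bart.py",
    ]
    # First pass with the default kernels, second with the SGL kernel enabled.
    pytest.main(args)
    os.environ["VLLM_CPU_SGL_KERNEL"] = "1"
    pytest.main(args)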

tests/models/language/generation/test_common.py

Lines changed: 6 additions & 2 deletions
@@ -39,7 +39,7 @@
     [
         pytest.param(
             "bigscience/bloom-560m",  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
         ),
         pytest.param(
             "openai-community/gpt2",  # gpt2
@@ -87,7 +87,11 @@
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
         pytest.param(
             "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
+        ),
+        pytest.param(
+            "Qwen/Qwen1.5-MoE-A2.7B-Chat",
+            marks=[pytest.mark.cpu_model],
         )
     ])
 @pytest.mark.parametrize("max_tokens", [32])
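
The cpu_model marker is what the CI's `-m cpu_model` selection keys on, so adding or dropping that mark is what moves a model into or out of the CPU run. A minimal standalone sketch of the pattern (hypothetical test function, not the real parametrization):

# Standalone sketch of marker-gated parametrization; `cpu_model` and
# `core_model` are custom marks selected with `pytest -m <marker>`.
import pytest

@pytest.mark.parametrize("model", [
    pytest.param("bigscience/bloom-560m", marks=[pytest.mark.core_model]),
    pytest.param("Qwen/Qwen1.5-MoE-A2.7B-Chat", marks=[pytest.mark.cpu_model]),
])
def test_model_name(model: str) -> None:
    # Under `pytest -m cpu_model`, only the Qwen case is collected;
    # the bloom case runs only when core_model tests are selected.
    assert model

Running `pytest -m cpu_model` against such a file collects only the Qwen case, while `-m "not cpu_model"` collects everything else.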

tests/models/language/pooling/test_embedding.py

Lines changed: 11 additions & 12 deletions
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os

 import pytest

@@ -28,20 +27,24 @@ def v1(run_with_both_engines):
     # [Decoder-only]
     pytest.param("BAAI/bge-multilingual-gemma2",
                  marks=[pytest.mark.core_model]),
-    pytest.param("intfloat/e5-mistral-7b-instruct",
-                 marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+    pytest.param(
+        "intfloat/e5-mistral-7b-instruct",
+        # CPU v1 doesn't support sliding window
+        marks=[pytest.mark.core_model]),
     # the qwen models interfere with each other (see PR
     # https://github.com/vllm-project/vllm/pull/18720).
     # To avoid this problem, for now we skip v0 since it will be
     # deprecated anyway.
     pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                  marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
     # [Encoder-only]
-    pytest.param("BAAI/bge-base-en-v1.5",
-                 marks=[
-                     pytest.mark.core_model, pytest.mark.cpu_model,
-                     pytest.mark.skip_v1
-                 ]),
+    pytest.param(
+        "BAAI/bge-base-en-v1.5",
+        marks=[
+            # CPU only supports V1
+            pytest.mark.core_model,
+            pytest.mark.skip_v1
+        ]),
     pytest.param("sentence-transformers/all-MiniLM-L12-v2",
                  marks=[pytest.mark.skip_v1]),
     pytest.param("intfloat/multilingual-e5-small",
@@ -60,10 +63,6 @@ def test_models(
     model,
     monkeypatch,
 ) -> None:
-    if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu(
-    ) and os.environ.get("VLLM_USE_V1", "0") == "1":
-        pytest.skip("CPU V1 doesn't support sliding window")
-
     if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
         # switch to use ROCm CK FA backend

tests/models/language/pooling/test_reward.py

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
 import pytest
 import torch
 import torch.nn.functional as F
@@ -84,6 +86,9 @@ def test_prm_models(
     dtype: str,
     monkeypatch,
 ) -> None:
+    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
+        pytest.skip("CPU only supports V1")
+
     if current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
         # switch to use ROCm CK FA backend

tests/quantization/test_compressed_tensors.py

Lines changed: 2 additions & 1 deletion
@@ -45,7 +45,8 @@ def use_v0_only(monkeypatch):
     """
     This module relies on V0 internals, so set VLLM_USE_V1=0.
     """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
+    if not current_platform.is_cpu():
+        monkeypatch.setenv('VLLM_USE_V1', '0')


 @pytest.mark.parametrize(
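
Since the hunk above only shows the changed fragment, here is a sketch of what the full fixture presumably looks like after this change (assuming the module's existing current_platform import, i.e. vLLM's platform helper): on CPU the environment is left alone so the suite runs under V1, while all other platforms keep pinning V0.

# Sketch of the adjusted fixture, not copied verbatim from the file:
# force V0 only on non-CPU platforms, since V0 CPU support is removed.
import pytest
from vllm.platforms import current_platform

@pytest.fixture(autouse=True)
def use_v0_only(monkeypatch):
    """This module relies on V0 internals, so set VLLM_USE_V1=0."""
    if not current_platform.is_cpu():
        monkeypatch.setenv('VLLM_USE_V1', '0')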
