Skip to content

Commit 577cbda

Browse files
Implement integration tests in CI pipeline (#639)
Signed-off-by: dennis yeh <[email protected]> Co-authored-by: dennis yeh <[email protected]>
1 parent a94ae6e commit 577cbda

File tree

5 files changed

+242
-8
lines changed

5 files changed

+242
-8
lines changed

.buildkite/scripts/run_in_docker.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ if [ "$#" -eq 0 ]; then
1111
exit 1
1212
fi
1313

14+
ENV_VARS=(
15+
-e TEST_MODEL="$TEST_MODEL"
16+
-e MINIMUM_ACCURACY_THRESHOLD="$MINIMUM_ACCURACY_THRESHOLD"
17+
-e TENSOR_PARALLEL_SIZE="$TENSOR_PARALLEL_SIZE"
18+
)
19+
1420
if ! grep -q "^HF_TOKEN=" /etc/environment; then
1521
gcloud secrets versions access latest --secret=bm-agent-hf-token --quiet | \
1622
sudo tee -a /etc/environment > /dev/null <<< "HF_TOKEN=$(cat)"
@@ -76,14 +82,16 @@ docker builder prune -f
7682

7783
echo "Cleanup complete."
7884

79-
docker build --no-cache -f docker/Dockerfile -t "vllm-tpu:${BUILDKITE_COMMIT}" .
85+
IMAGE_NAME="vllm-tpu"
86+
docker build --no-cache -f docker/Dockerfile -t "${IMAGE_NAME}:${BUILDKITE_COMMIT}" .
8087

8188
exec docker run \
8289
--privileged \
8390
--net host \
8491
--shm-size=16G \
8592
--rm \
8693
-v "$LOCAL_HF_HOME":"$DOCKER_HF_HOME" \
94+
"${ENV_VARS[@]}" \
8795
-e HF_HOME="$DOCKER_HF_HOME" \
8896
-e MODEL_IMPL_TYPE="$MODEL_IMPL_TYPE" \
8997
-e HF_TOKEN="$HF_TOKEN" \
@@ -96,5 +104,5 @@ exec docker run \
96104
${JAX_RANDOM_WEIGHTS:+-e JAX_RANDOM_WEIGHTS="$JAX_RANDOM_WEIGHTS"} \
97105
${SKIP_ACCURACY_TESTS:+-e SKIP_ACCURACY_TESTS="$SKIP_ACCURACY_TESTS"} \
98106
${VLLM_MLA_DISABLE:+-e VLLM_MLA_DISABLE="$VLLM_MLA_DISABLE"} \
99-
"vllm-tpu:${BUILDKITE_COMMIT}" \
107+
"${IMAGE_NAME}:${BUILDKITE_COMMIT}" \
100108
"$@" # Pass all script arguments as the command to run in the container

docker/Dockerfile

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,13 @@ RUN VLLM_TARGET_DEVICE="tpu" pip install -e .
2020

2121
# Install test dependencies
2222
RUN python3 -m pip install -e tests/vllm_test_utils
23-
RUN python3 -m pip install --no-cache-dir git+https://github.com/thuml/depyf.git pytest pytest-asyncio tpu-info datasets 'lm_eval[api]==0.4.4'
24-
RUN python3 -m pip install pytest-cov
25-
RUN python3 -m pip install numba
23+
RUN python3 -m pip install --no-cache-dir \
24+
git+https://github.com/thuml/depyf.git \
25+
pytest-asyncio \
26+
git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] \
27+
pytest-cov \
28+
tblib \
29+
numba
2630

2731
# Install tpu_commons
2832
WORKDIR /workspace/tpu_commons
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
def pytest_addoption(parser):
    """Adds custom command-line options to pytest.

    Registered options:
        --tensor-parallel-size: int tensor parallel size (default 1).
        --expected-value: float accuracy the measured value is compared
            against to decide pass/fail (no default; callers must supply it).
        --model-name: model to test.
        --fp8-kv-model-name: model to test with an fp8 KV cache.
    """
    parser.addoption("--tensor-parallel-size",
                     type=int,
                     default=1,
                     help="The tensor parallel size to use for the test.")
    parser.addoption(
        "--expected-value",
        type=float,
        default=None,
        # "measured" (was "measure") — help text shown to users by pytest -h.
        help=("This value will be used to compare the measured value and "
              "determine if the test passes or fails."))
    parser.addoption("--model-name",
                     type=str,
                     default=None,
                     help="Model name to test (e.g., 'model1')")
    parser.addoption("--fp8-kv-model-name",
                     type=str,
                     default=None,
                     help="Model name to test fp8-kv (e.g., 'model1')")
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Copied from vLLM: https://github.com/vllm-project/vllm/blob/839ab00/tests/entrypoints/llm/test_accuracy.py
2+
3+
# SPDX-License-Identifier: Apache-2.0
4+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
5+
"""
6+
This file test accuracy of the vLLM server via LMEval.
7+
It uses local-completions, which interacts with vLLM
8+
through the OAI API with N concurrent connections.
9+
This simulates real work usage of the API and makes
10+
sure that the zmq frontend mp RPC message passing and
11+
AsyncLLMEngine are working correctly.
12+
"""
13+
14+
import threading
15+
16+
import lm_eval
17+
import pytest
18+
from vllm.platforms import current_platform
19+
20+
# NOTE(review): these lists are empty; models are supplied via the
# --model-name / --fp8-kv-model-name pytest options. Presumably kept for
# parity with the upstream vLLM file this was copied from — confirm before
# removing.
MODEL_NAMES = []
FP8_KV_MODEL_NAMES = []
# Mentioned by the module docstring for local-completions runs; not
# referenced by the code below — TODO confirm it is still needed.
NUM_CONCURRENT = 500
# lm-eval task to evaluate.
TASK = "gsm8k"
# Metric key looked up in the lm-eval results dict.
FILTER = "exact_match,strict-match"
# Tolerance applied around the expected accuracy in run_test.
RTOL = 0.03
# NOTE(review): not used anywhere in this file — TODO confirm before removing.
_JSON_WRITE_LOCK = threading.Lock()
27+
28+
29+
def run_test(model_name, expected_value, more_args=None):
    """Run the end-to-end accuracy test for one model via lm-eval.

    Args:
        model_name: HF model id, passed to lm_eval as ``pretrained=...``.
        expected_value: Expected accuracy; the measured value must lie
            within ``RTOL`` of it.
        more_args: Optional extra ``key=value`` pairs appended to the
            lm_eval ``model_args`` string.

    Raises:
        AssertionError: If the measured accuracy is outside the tolerance.
    """
    print(f"Running test for model: {model_name}")

    model_args = f"pretrained={model_name},max_model_len=4096"
    if more_args is not None:
        model_args = f"{model_args},{more_args}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        # Use the module-level TASK constant (was a hardcoded "gsm8k") so the
        # evaluated task and the results lookup below cannot drift apart.
        tasks=TASK,
        batch_size="auto",
    )

    measured_value = results["results"][TASK][FILTER]
    assert (measured_value - RTOL < expected_value <
            measured_value + RTOL), (
                f"Expected: {expected_value} | Measured: {measured_value}")
47+
48+
49+
@pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 is currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch,
                                    request: pytest.FixtureRequest):
    """Run the lm-eval accuracy test with the V1 Engine.

    Reads the model, tensor parallel size and expected accuracy from the
    custom pytest options registered in conftest.py.

    Raises:
        ValueError: If --expected-value is missing, or the tensor parallel
            size is outside [1, 8].
    """
    model = request.config.getoption("--model-name")
    print(f"Testing model: {model}...")

    tp_size = request.config.getoption("--tensor-parallel-size")
    expected_value = request.config.getoption("--expected-value")

    # Fail loudly with a message (was a bare `raise ValueError`).
    if expected_value is None:
        raise ValueError(
            "--expected-value is required to run the accuracy test.")

    if tp_size is None:
        tp_size = 1
    elif tp_size < 1 or tp_size > 8:
        raise ValueError(
            f"--tensor-parallel-size must be in [1, 8], got {tp_size}.")

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        more_args = None
        if current_platform.is_tpu():
            # TPU-specific engine limits plus the requested TP size.
            more_args = "max_model_len=2048,max_num_seqs=64"
            more_args += f",tensor_parallel_size={tp_size}"

        print(f"common args: {more_args}")

        run_test(model, expected_value, more_args)
81+
82+
83+
@pytest.mark.skipif(not current_platform.is_cuda()
                    and not current_platform.is_tpu(),
                    reason="V1 is currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
        monkeypatch: pytest.MonkeyPatch, request: pytest.FixtureRequest):
    """Run the lm-eval accuracy test with the V1 Engine and an fp8 KV cache.

    Reads the fp8-kv model, tensor parallel size and expected accuracy from
    the custom pytest options registered in conftest.py.

    Raises:
        ValueError: If --expected-value is missing, or the tensor parallel
            size is outside [1, 8].
    """
    fp8_kv_model = request.config.getoption("--fp8-kv-model-name")
    print(f"Testing fp8_kv_model: {fp8_kv_model}...")

    tp_size = request.config.getoption("--tensor-parallel-size")
    expected_value = request.config.getoption("--expected-value")

    # Fail loudly with a message (was a bare `raise ValueError`).
    if expected_value is None:
        raise ValueError(
            "--expected-value is required to run the accuracy test.")

    if tp_size is None:
        tp_size = 1
    elif tp_size < 1 or tp_size > 8:
        raise ValueError(
            f"--tensor-parallel-size must be in [1, 8], got {tp_size}.")

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        more_args = None
        if current_platform.is_tpu():
            # TPU-specific engine limits, fp8 KV cache, and requested TP size.
            more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"
            more_args += f",tensor_parallel_size={tp_size}"

        print(f"common args: {more_args}")

        run_test(fp8_kv_model, expected_value, more_args)
Lines changed: 90 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,91 @@
1-
#!/bin/bash

# Runs the vLLM integration (accuracy) tests against a TPU Commons build.
#
# Configuration comes from environment variables set by the CI pipeline:
#   TEST_MODEL                  - model id to test (required)
#   MINIMUM_ACCURACY_THRESHOLD  - accuracy the model must reach (required, > 0)
#   TENSOR_PARALLEL_SIZE        - tensor parallel size (optional, default 1)

test_model=""
tensor_parallel_size=1
minimum_accuracy_threshold=0

root_dir=/workspace
# Initialize explicitly (was previously read without being set).
has_error=0

helpFunction()
{
    echo ""
    echo "Usage: $0 [-r full_path_to_root_dir]"
    echo -e "\t-r The path to your root directory containing both 'vllm' and 'tpu_commons' (default: /workspace/, which is used in the Dockerfile)"
    exit 1
}

while [[ "$#" -gt 0 ]]; do
    case "$1" in
        -r|--root-dir-path)
            root_dir="$2"
            shift
            shift
            ;;
        -h|--help)
            helpFunction
            ;;
        *) # unknown option
            echo "Unknown option: $1"
            helpFunction
            ;;
    esac
done

if [ -n "$TEST_MODEL" ]; then
    test_model="$TEST_MODEL"
fi

if [ -n "$MINIMUM_ACCURACY_THRESHOLD" ]; then
    minimum_accuracy_threshold="$MINIMUM_ACCURACY_THRESHOLD"
fi

if [ -n "$TENSOR_PARALLEL_SIZE" ]; then
    tensor_parallel_size="$TENSOR_PARALLEL_SIZE"
fi

# Check that a test model was provided. (Error messages reference the env
# vars actually read above; the old -m/-t/-e flags were never parsed.)
if [[ -z "$test_model" ]]; then
    echo "Error: Test model name (TEST_MODEL) is required." >&2
    has_error=1
fi

# Check that tensor_parallel_size is an integer greater than 0.
if ! [[ "$tensor_parallel_size" =~ ^[1-9][0-9]*$ ]]; then
    echo "Error: Tensor parallel size (TENSOR_PARALLEL_SIZE) must be an integer greater than 0. Got: '$tensor_parallel_size'" >&2
    has_error=1
fi

# Check that minimum_accuracy_threshold is a number greater than 0.
if ! awk -v num="$minimum_accuracy_threshold" 'BEGIN { exit !(num > 0) }'; then
    echo "Error: Minimum accuracy threshold (MINIMUM_ACCURACY_THRESHOLD) must be a number greater than 0. Got: '$minimum_accuracy_threshold'" >&2
    has_error=1
fi

# If any validation failed, print help and exit non-zero.
if [[ "$has_error" -ne 0 ]]; then
    helpFunction
fi

echo "Using the root directory at $root_dir"

# Exit with a failure status if the directory is missing (plain `exit` after
# `||` would propagate cd's status, but be explicit).
cd "$root_dir"/vllm/tests/entrypoints/llm || exit 1

# Overwrite a few of the vLLM benchmarking scripts with the TPU Commons ones.
cp "$root_dir"/tpu_commons/scripts/vllm/integration/*.py "$root_dir"/vllm/tests/entrypoints/llm/

echo "--------------------------------------------------"
echo "Running integration for model: $test_model"
echo "--------------------------------------------------"

python -m pytest -rP test_accuracy.py::test_lm_eval_accuracy_v1_engine \
    --tensor-parallel-size="$tensor_parallel_size" \
    --model-name="$test_model" \
    --expected-value="$minimum_accuracy_threshold"
# Propagate pytest's result so CI fails when the accuracy test fails.
# (Previously exit_code was hardcoded to 0 and the script always passed.)
exit_code=$?

exit $exit_code

0 commit comments

Comments
 (0)