
Commit 1c9b557

youkaichao authored and Alvant committed
[ci][test] add correctness test for cpu offloading (vllm-project#6549)
Signed-off-by: Alvant <[email protected]>
1 parent 17ba77a commit 1c9b557
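
For context, the new test exercises vLLM's CPU offloading, which keeps part of the model weights in CPU memory and is enabled through the --cpu-offload-gb option used below. A minimal offline sketch of the feature follows; it assumes cpu_offload_gb is the Python-side engine argument corresponding to that flag, and it is not part of this commit.

# Hedged sketch (not part of this commit), assuming the offline LLM entry
# point accepts cpu_offload_gb as an engine argument.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",  # model used by the new test
    cpu_offload_gb=4,  # keep up to 4 GiB of weights in CPU memory
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=5, temperature=0.0))
print(outputs[0].outputs[0].text)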

File tree

4 files changed: +105 -85 lines changed


.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ steps:
   commands:
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
   - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
tests/basic_correctness/test_cpu_offload.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+from ..utils import compare_two_settings
+
+
+def test_cpu_offload():
+    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
+                         ["--cpu-offload-gb", "4"])
+    compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+                         [], ["--cpu-offload-gb", "1"])
tests/distributed/test_pipeline_parallel.py

Lines changed: 2 additions & 85 deletions
@@ -1,7 +1,6 @@
 import pytest
-from transformers import AutoTokenizer
 
-from ..utils import RemoteOpenAIServer
+from ..utils import compare_two_settings
 
 
 @pytest.mark.parametrize(
@@ -13,7 +12,6 @@
     (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B"),
 ])
 def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
     pp_args = [
         # use half precision for speed and memory savings in CI environment
@@ -48,85 +46,4 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME):
         pp_args.append("--enforce-eager")
         tp_args.append("--enforce-eager")
 
-    prompt = "Hello, my name is"
-    token_ids = tokenizer(prompt)["input_ids"]
-    results = []
-    for args in (pp_args, tp_args):
-        with RemoteOpenAIServer(MODEL_NAME, args) as server:
-            client = server.get_client()
-
-            # test models list
-            models = client.models.list()
-            models = models.data
-            served_model = models[0]
-            results.append({
-                "test": "models_list",
-                "id": served_model.id,
-                "root": served_model.root,
-            })
-
-            # test with text prompt
-            completion = client.completions.create(model=MODEL_NAME,
-                                                   prompt=prompt,
-                                                   max_tokens=5,
-                                                   temperature=0.0)
-
-            results.append({
-                "test": "single_completion",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test using token IDs
-            completion = client.completions.create(
-                model=MODEL_NAME,
-                prompt=token_ids,
-                max_tokens=5,
-                temperature=0.0,
-            )
-
-            results.append({
-                "test": "token_ids",
-                "text": completion.choices[0].text,
-                "finish_reason": completion.choices[0].finish_reason,
-                "usage": completion.usage,
-            })
-
-            # test simple list
-            batch = client.completions.create(
-                model=MODEL_NAME,
-                prompt=[prompt, prompt],
-                max_tokens=5,
-                temperature=0.0,
-            )
-
-            results.append({
-                "test": "simple_list",
-                "text0": batch.choices[0].text,
-                "text1": batch.choices[1].text,
-            })
-
-            # test streaming
-            batch = client.completions.create(
-                model=MODEL_NAME,
-                prompt=[prompt, prompt],
-                max_tokens=5,
-                temperature=0.0,
-                stream=True,
-            )
-            texts = [""] * 2
-            for chunk in batch:
-                assert len(chunk.choices) == 1
-                choice = chunk.choices[0]
-                texts[choice.index] += choice.text
-            results.append({
-                "test": "streaming",
-                "texts": texts,
-            })
-
-    n = len(results) // 2
-    pp_results = results[:n]
-    tp_results = results[n:]
-    for pp, tp in zip(pp_results, tp_results):
-        assert pp == tp
+    compare_two_settings(MODEL_NAME, pp_args, tp_args)
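
The refactor above moves all response comparison into the shared compare_two_settings helper (added to tests/utils.py below), so other pairwise configuration checks can reuse it. A hypothetical sketch of such reuse, using only a flag and a model name that already appear in this diff:

# Hypothetical additional test (not part of this commit) reusing the helper to
# compare eager execution against the default CUDA-graph path.
from ..utils import compare_two_settings


def test_compare_eager_vs_default():
    compare_two_settings("meta-llama/Meta-Llama-3-8B",
                         ["--enforce-eager"], [])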

tests/utils.py

Lines changed: 94 additions & 0 deletions
@@ -10,6 +10,7 @@
 import openai
 import ray
 import requests
+from transformers import AutoTokenizer
 
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
@@ -124,6 +125,99 @@ def get_async_client(self):
         )
 
 
+def compare_two_settings(model: str, arg1: List[str], arg2: List[str]):
+    """
+    Launch API server with two different sets of arguments and compare the
+    results of the API calls. The arguments are after the model name.
+    """
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+
+    prompt = "Hello, my name is"
+    token_ids = tokenizer(prompt)["input_ids"]
+    results = []
+    for args in (arg1, arg2):
+        with RemoteOpenAIServer(model, args) as server:
+            client = server.get_client()
+
+            # test models list
+            models = client.models.list()
+            models = models.data
+            served_model = models[0]
+            results.append({
+                "test": "models_list",
+                "id": served_model.id,
+                "root": served_model.root,
+            })
+
+            # test with text prompt
+            completion = client.completions.create(model=model,
+                                                   prompt=prompt,
+                                                   max_tokens=5,
+                                                   temperature=0.0)
+
+            results.append({
+                "test": "single_completion",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test using token IDs
+            completion = client.completions.create(
+                model=model,
+                prompt=token_ids,
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+            results.append({
+                "test": "token_ids",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test simple list
+            batch = client.completions.create(
+                model=model,
+                prompt=[prompt, prompt],
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+            results.append({
+                "test": "simple_list",
+                "text0": batch.choices[0].text,
+                "text1": batch.choices[1].text,
+            })
+
+            # test streaming
+            batch = client.completions.create(
+                model=model,
+                prompt=[prompt, prompt],
+                max_tokens=5,
+                temperature=0.0,
+                stream=True,
+            )
+            texts = [""] * 2
+            for chunk in batch:
+                assert len(chunk.choices) == 1
+                choice = chunk.choices[0]
+                texts[choice.index] += choice.text
+            results.append({
+                "test": "streaming",
+                "texts": texts,
+            })
+
+    n = len(results) // 2
+    arg1_results = results[:n]
+    arg2_results = results[n:]
+    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
+        assert arg1_result == arg2_result, \
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+
+
 def init_test_distributed_environment(
     tp_size: int,
     pp_size: int,
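
compare_two_settings launches each configuration in turn with RemoteOpenAIServer and asserts that the OpenAI-compatible responses match entry by entry. A hedged sketch of driving a single server the same way, mirroring the pattern above (the CLI argument is the one from the new test; running it standalone assumes the tests package is importable):

# Hedged usage sketch (not part of this commit), following the pattern used in
# compare_two_settings above.
from tests.utils import RemoteOpenAIServer

model = "meta-llama/Llama-2-7b-hf"
with RemoteOpenAIServer(model, ["--cpu-offload-gb", "4"]) as server:
    client = server.get_client()
    completion = client.completions.create(model=model,
                                           prompt="Hello, my name is",
                                           max_tokens=5,
                                           temperature=0.0)
    print(completion.choices[0].text)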
