Commits (44)
4854110  set_current_metadata (youkaichao, Sep 29, 2024)
02929ef  add unified_flash_attention (youkaichao, Sep 29, 2024)
383b51c  expose attention_backend from attention metadata (youkaichao, Sep 29, 2024)
861a65e  init draft (youkaichao, Sep 29, 2024)
d751293  finish (youkaichao, Sep 29, 2024)
dc5e931  warning for overwritten config (youkaichao, Sep 29, 2024)
558ea39  unify flags (youkaichao, Sep 29, 2024)
1074d7a  fix code (youkaichao, Sep 29, 2024)
6f65ec5  store forward context (youkaichao, Sep 29, 2024)
e6c21c7  fix (youkaichao, Sep 29, 2024)
ae97d2c  fix (youkaichao, Sep 29, 2024)
2b4fe53  get symint (youkaichao, Sep 29, 2024)
a6f0e3b  fix bugs (youkaichao, Sep 29, 2024)
99a281e  fix the rest (youkaichao, Sep 29, 2024)
44328eb  fix tpu (youkaichao, Sep 29, 2024)
500430b  leave todo (youkaichao, Sep 29, 2024)
5b50c68  add tests (youkaichao, Sep 29, 2024)
55d54fe  run 3 tests (youkaichao, Sep 29, 2024)
954caf8  rename (youkaichao, Sep 29, 2024)
ee2100e  support pp (youkaichao, Sep 29, 2024)
b5fc0f1  move to decorators (youkaichao, Sep 29, 2024)
246e6e5  fix mro (youkaichao, Sep 29, 2024)
49aa7cc  add comments (youkaichao, Sep 29, 2024)
99144b3  fix mutates_args (youkaichao, Sep 29, 2024)
6ae09bd  fix forward context (youkaichao, Sep 29, 2024)
ec2191f  surface errors (youkaichao, Sep 29, 2024)
889794e  fix more (youkaichao, Sep 29, 2024)
ed80d67  fix spec decode (youkaichao, Sep 29, 2024)
2b0c543  complicated bug, thank you chatgpt (youkaichao, Sep 29, 2024)
ca79dd5  simplification, model runner set context, model does not (youkaichao, Sep 29, 2024)
fad55cb  fix tests (youkaichao, Sep 30, 2024)
e195841  add compare_all_settings (youkaichao, Sep 30, 2024)
fbd3231  change tests (youkaichao, Sep 30, 2024)
4781c14  repurpose smoke tests (youkaichao, Sep 30, 2024)
cbc9229  remove (youkaichao, Sep 30, 2024)
5970a6f  restore (youkaichao, Sep 30, 2024)
ca587a8  restore (youkaichao, Sep 30, 2024)
7ea321c  restore (youkaichao, Sep 30, 2024)
f3a5a5e  fix for pp (youkaichao, Sep 30, 2024)
a864475  add tests (youkaichao, Sep 30, 2024)
1d9aacd  rename (youkaichao, Sep 30, 2024)
f233087  update tests (youkaichao, Sep 30, 2024)
d2f1b97  prepare for tp test (youkaichao, Sep 30, 2024)
1b8ee5a  early error (youkaichao, Sep 30, 2024)
6 changes: 4 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -110,7 +110,9 @@ steps:
- vllm/core/
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

@@ -218,7 +220,7 @@ steps:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph_smoke.py
- pytest -v -s compile/test_basic_correctness.py

- label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies:
@@ -382,7 +384,7 @@ steps:
- tests/distributed/
- vllm/compilation
commands:
- pytest -v -s ./compile/test_full_graph_multi_gpu.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
28 changes: 28 additions & 0 deletions tests/compile/test_basic_correctness.py
@@ -0,0 +1,28 @@
from typing import Dict, List, Optional

import pytest

from vllm.utils import cuda_device_count_stateless

from ..utils import compare_all_settings
from .utils import TEST_MODELS_SMOKE


@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
@pytest.mark.parametrize("pp_size", [1, 2])
@pytest.mark.parametrize("tp_size", [1])
def test_compile_correctness(model_info, pp_size, tp_size):
    # this test is run under multiple suites, with different GPUs.
    # make sure we only run the test with the correct number of CUDA devices.
# don't use "<", as it will duplicate the tests.
if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.")
model = model_info[0]
model_args = model_info[1]
all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"]
+ ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3
all_envs: List[Optional[Dict[str, str]]] = [{
"VLLM_TORCH_COMPILE_LEVEL":
str(i)
} for i in range(3)]
compare_all_settings(model, all_args, all_envs)
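
For orientation, each parametrized case above launches several OpenAI-compatible servers that are identical except for VLLM_TORCH_COMPILE_LEVEL and checks that their outputs match via compare_all_settings (see tests/utils.py below). A minimal sketch of what one case roughly expands to; the concrete values are illustrative, not copied from a CI run, and the import path is assumed:

```python
from tests.utils import compare_all_settings  # assumed import path

# Illustrative expansion of one case: meta-llama/Meta-Llama-3-8B, pp=1, tp=1.
model = "meta-llama/Meta-Llama-3-8B"
args = ["--enforce-eager", "--max_model_len", "1024", "-pp", "1", "-tp", "1"]

# Three identical CLI settings; only the compile-level env var differs per run.
all_args = [args] * 3
all_envs = [{"VLLM_TORCH_COMPILE_LEVEL": str(level)} for level in range(3)]

compare_all_settings(model, all_args, all_envs)
```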
13 changes: 8 additions & 5 deletions tests/compile/test_full_graph.py
@@ -1,13 +1,16 @@
import pytest

from vllm.compilation.backends import vllm_backend

from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support


@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize("backend", ["eager", vllm_backend])
def test_full_graph(model_info, backend):
@pytest.mark.parametrize("optimization_level", [1, 2])
@fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level):
model = model_info[0]
model_kwargs = model_info[1]
check_full_graph_support(model, model_kwargs, backend, tp_size=1)
check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1)
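
fork_new_process_for_each_test, imported from tests/utils.py, runs each parametrized case in its own process, presumably so the process-wide environment changes made by check_full_graph_support (see tests/compile/utils.py below) cannot leak between cases. A generic, Unix-only sketch of such a fork-per-test decorator (it relies on os.fork and is not the actual implementation in tests/utils.py):

```python
import os
from functools import wraps


def fork_per_test(test_fn):
    """Hypothetical sketch: run the wrapped test in a forked child process."""

    @wraps(test_fn)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child process: run the test, report success/failure via exit code.
            try:
                test_fn(*args, **kwargs)
                os._exit(0)
            except BaseException:
                os._exit(1)
        # Parent process: wait for the child and propagate its failure.
        _, status = os.waitpid(pid, 0)
        assert os.WEXITSTATUS(status) == 0, f"{test_fn.__name__} failed in child"

    return wrapper
```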
22 changes: 0 additions & 22 deletions tests/compile/test_full_graph_multi_gpu.py

This file was deleted.

13 changes: 0 additions & 13 deletions tests/compile/test_full_graph_smoke.py

This file was deleted.

22 changes: 10 additions & 12 deletions tests/compile/utils.py
@@ -4,14 +4,12 @@

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.plugins import set_torch_compile_backend
from vllm.utils import is_hip

TEST_MODELS_SMOKE = [
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
"quantization": "compressed-tensors"
}),
("meta-llama/Meta-Llama-3-8B", {}),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
["--quantization", "compressed-tensors"]),
("meta-llama/Meta-Llama-3-8B", []),
]

TEST_MODELS = [
@@ -68,20 +66,20 @@
}))


def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
def check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1):
# make sure these models can be captured in full graph mode
if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

# Inductor doesn't support fp8/gptq_marlin_24 yet.
quantization = model_kwargs.get("quantization")
if (quantization == "fp8" or quantization == "gptq_marlin"
or quantization == "gptq_marlin_24") and backend != "eager":
or quantization == "gptq_marlin_24") and optimization_level > 1:
return

set_torch_compile_backend(backend)

prompts = [
"Hello, my name is",
"The president of the United States is",
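
With this refactor, check_full_graph_support no longer receives a torch.compile backend via set_torch_compile_backend; the choice is expressed as an optimization level written to the VLLM_TORCH_COMPILE_LEVEL environment variable. A sketch of a call under the new signature; the model name and level are placeholders and the import path is assumed:

```python
from tests.compile.utils import check_full_graph_support  # assumed import path

# Placeholder values: any TEST_MODELS entry and a supported level would do.
check_full_graph_support("meta-llama/Meta-Llama-3-8B",
                         model_kwargs={},
                         optimization_level=2,
                         tp_size=1)
```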
111 changes: 76 additions & 35 deletions tests/utils.py
@@ -180,18 +180,34 @@ def compare_two_settings(model: str,
env1: The first set of environment variables to pass to the API server.
env2: The second set of environment variables to pass to the API server.
"""
compare_all_settings(model, [arg1, arg2], [env1, env2], max_wait_seconds)


def compare_all_settings(model: str,
all_args: List[List[str]],
all_envs: List[Optional[Dict[str, str]]],
max_wait_seconds: Optional[float] = None) -> None:
"""
    Launch the API server with several different sets of arguments/environments
    and compare the results of the API calls against those from the first set.

Args:
model: The model to test.
all_args: A list of argument lists to pass to the API server.
all_envs: A list of environment dictionaries to pass to the API server.
"""
trust_remote_code = "--trust-remote-code"
if trust_remote_code in arg1 or trust_remote_code in arg2:
if any(trust_remote_code in args for args in all_args):
tokenizer = AutoTokenizer.from_pretrained(model,
trust_remote_code=True)
else:
tokenizer = AutoTokenizer.from_pretrained(model)

prompt = "Hello, my name is"
token_ids = tokenizer(prompt)["input_ids"]
results = []
for args, env in ((arg1, env1), (arg2, env2)):
ref_results: List = []
for i, (args, env) in enumerate(zip(all_args, all_envs)):
compare_results: List = []
with RemoteOpenAIServer(model,
args,
env_dict=env,
@@ -202,10 +218,13 @@ def compare_two_settings(model: str,
models = client.models.list()
models = models.data
served_model = models[0]
results.append({
"test": "models_list",
"id": served_model.id,
"root": served_model.root,
(ref_results if i == 0 else compare_results).append({
"test":
"models_list",
"id":
served_model.id,
"root":
served_model.root,
})

# test with text prompt
@@ -214,11 +233,15 @@ def compare_two_settings(model: str,
max_tokens=5,
temperature=0.0)

results.append({
"test": "single_completion",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
(ref_results if i == 0 else compare_results).append({
"test":
"single_completion",
"text":
completion.choices[0].text,
"finish_reason":
completion.choices[0].finish_reason,
"usage":
completion.usage,
})

# test using token IDs
@@ -229,11 +252,15 @@ def compare_two_settings(model: str,
temperature=0.0,
)

results.append({
"test": "token_ids",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
(ref_results if i == 0 else compare_results).append({
"test":
"token_ids",
"text":
completion.choices[0].text,
"finish_reason":
completion.choices[0].finish_reason,
"usage":
completion.usage,
})

# test seeded random sampling
@@ -243,11 +270,15 @@ def compare_two_settings(model: str,
seed=33,
temperature=1.0)

results.append({
"test": "seeded_sampling",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
(ref_results if i == 0 else compare_results).append({
"test":
"seeded_sampling",
"text":
completion.choices[0].text,
"finish_reason":
completion.choices[0].finish_reason,
"usage":
completion.usage,
})

# test seeded random sampling with multiple prompts
@@ -257,7 +288,7 @@ def compare_two_settings(model: str,
seed=33,
temperature=1.0)

results.append({
(ref_results if i == 0 else compare_results).append({
"test":
"seeded_sampling",
"text": [choice.text for choice in completion.choices],
@@ -275,10 +306,13 @@ def compare_two_settings(model: str,
temperature=0.0,
)

results.append({
"test": "simple_list",
"text0": batch.choices[0].text,
"text1": batch.choices[1].text,
(ref_results if i == 0 else compare_results).append({
"test":
"simple_list",
"text0":
batch.choices[0].text,
"text1":
batch.choices[1].text,
})

# test streaming
@@ -294,18 +328,25 @@ def compare_two_settings(model: str,
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
results.append({
(ref_results if i == 0 else compare_results).append({
"test": "streaming",
"texts": texts,
})

n = len(results) // 2
arg1_results = results[:n]
arg2_results = results[n:]
for arg1_result, arg2_result in zip(arg1_results, arg2_results):
assert arg1_result == arg2_result, (
f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
f"{arg1_result=} != {arg2_result=}")
if i > 0:
# if any setting fails, raise an error early
ref_args = all_args[0]
ref_envs = all_envs[0]
compare_args = all_args[i]
compare_envs = all_envs[i]
for ref_result, compare_result in zip(ref_results,
compare_results):
assert ref_result == compare_result, (
f"Results for {model=} are not the same.\n"
f"{ref_args=} {ref_envs=}\n"
f"{compare_args=} {compare_envs=}\n"
f"{ref_result=}\n"
f"{compare_result=}\n")


def init_test_distributed_environment(
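
The refactored helper treats the first setting as the reference and compares each later setting against it as soon as that run finishes, so a divergence surfaces before the remaining servers are launched. A stripped-down sketch of that fail-fast comparison pattern, with the per-setting API calls abstracted into a caller-supplied function (names here are illustrative, not part of tests/utils.py):

```python
from typing import Callable, Dict, List, Optional


def compare_against_reference(
    all_envs: List[Optional[Dict[str, str]]],
    collect_results: Callable[[Optional[Dict[str, str]]], List[dict]],
) -> None:
    """Illustrative only: run 0 builds the reference; later runs fail fast."""
    ref_results: List[dict] = []
    for i, env in enumerate(all_envs):
        compare_results: List[dict] = []
        # The first run fills the reference list; later runs fill a fresh list.
        (ref_results if i == 0 else compare_results).extend(collect_results(env))
        if i > 0:
            for ref, cur in zip(ref_results, compare_results):
                assert ref == cur, (
                    f"setting {i} diverged from the reference: "
                    f"{ref!r} != {cur!r}")
```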