Commits (44)
4854110  set_current_metadata (youkaichao, Sep 29, 2024)
02929ef  add unified_flash_attention (youkaichao, Sep 29, 2024)
383b51c  expose attention_backend from attention metadata (youkaichao, Sep 29, 2024)
861a65e  init draft (youkaichao, Sep 29, 2024)
d751293  finish (youkaichao, Sep 29, 2024)
dc5e931  warning for overwritten config (youkaichao, Sep 29, 2024)
558ea39  unify flags (youkaichao, Sep 29, 2024)
1074d7a  fix code (youkaichao, Sep 29, 2024)
6f65ec5  store forward context (youkaichao, Sep 29, 2024)
e6c21c7  fix (youkaichao, Sep 29, 2024)
ae97d2c  fix (youkaichao, Sep 29, 2024)
2b4fe53  get symint (youkaichao, Sep 29, 2024)
a6f0e3b  fix bugs (youkaichao, Sep 29, 2024)
99a281e  fix the rest (youkaichao, Sep 29, 2024)
44328eb  fix tpu (youkaichao, Sep 29, 2024)
500430b  leave todo (youkaichao, Sep 29, 2024)
5b50c68  add tests (youkaichao, Sep 29, 2024)
55d54fe  run 3 tests (youkaichao, Sep 29, 2024)
954caf8  rename (youkaichao, Sep 29, 2024)
ee2100e  support pp (youkaichao, Sep 29, 2024)
b5fc0f1  move to decorators (youkaichao, Sep 29, 2024)
246e6e5  fix mro (youkaichao, Sep 29, 2024)
49aa7cc  add comments (youkaichao, Sep 29, 2024)
99144b3  fix mutates_args (youkaichao, Sep 29, 2024)
6ae09bd  fix forward context (youkaichao, Sep 29, 2024)
ec2191f  surface errors (youkaichao, Sep 29, 2024)
889794e  fix more (youkaichao, Sep 29, 2024)
ed80d67  fix spec decode (youkaichao, Sep 29, 2024)
2b0c543  complicated bug, thank you chatgpt (youkaichao, Sep 29, 2024)
ca79dd5  simplification, model runner set context, model does not (youkaichao, Sep 29, 2024)
fad55cb  fix tests (youkaichao, Sep 30, 2024)
e195841  add compare_all_settings (youkaichao, Sep 30, 2024)
fbd3231  change tests (youkaichao, Sep 30, 2024)
4781c14  repurpose smoke tests (youkaichao, Sep 30, 2024)
cbc9229  remove (youkaichao, Sep 30, 2024)
5970a6f  restore (youkaichao, Sep 30, 2024)
ca587a8  restore (youkaichao, Sep 30, 2024)
7ea321c  restore (youkaichao, Sep 30, 2024)
f3a5a5e  fix for pp (youkaichao, Sep 30, 2024)
a864475  add tests (youkaichao, Sep 30, 2024)
1d9aacd  rename (youkaichao, Sep 30, 2024)
f233087  update tests (youkaichao, Sep 30, 2024)
d2f1b97  prepare for tp test (youkaichao, Sep 30, 2024)
1b8ee5a  early error (youkaichao, Sep 30, 2024)
6 changes: 4 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -110,7 +110,9 @@ steps:
- vllm/core/
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

@@ -218,7 +220,7 @@ steps:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph_smoke.py
- pytest -v -s compile/test_basic_correctness.py

- label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies:
@@ -382,7 +384,7 @@ steps:
- tests/distributed/
- vllm/compilation
commands:
- pytest -v -s ./compile/test_full_graph_multi_gpu.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
28 changes: 28 additions & 0 deletions tests/compile/test_basic_correctness.py
@@ -0,0 +1,28 @@
from typing import Dict, List, Optional

import pytest

from vllm.utils import cuda_device_count_stateless

from ..utils import compare_all_settings
from .utils import TEST_MODELS_SMOKE


@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
@pytest.mark.parametrize("pp_size", [1, 2])
@pytest.mark.parametrize("tp_size", [1])
def test_compile_correctness(model_info, pp_size, tp_size):
    # this test is run under multiple suites, with different GPUs.
    # make sure we only run the test with the correct number of CUDA devices.
# don't use "<", as it will duplicate the tests.
if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.")
model = model_info[0]
model_args = model_info[1]
all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"]
+ ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3
all_envs: List[Optional[Dict[str, str]]] = [{
"VLLM_TORCH_COMPILE_LEVEL":
str(i)
} for i in range(3)]
compare_all_settings(model, all_args, all_envs)
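
For orientation, each parametrized case above launches several OpenAI-compatible servers that are identical except for VLLM_TORCH_COMPILE_LEVEL and checks that their outputs match via compare_all_settings (see tests/utils.py below). A minimal sketch of what one case roughly expands to; the concrete values are illustrative, not copied from a CI run, and the import path is assumed:

```python
from tests.utils import compare_all_settings  # assumed import path

# Illustrative expansion of one case: meta-llama/Meta-Llama-3-8B, pp=1, tp=1.
model = "meta-llama/Meta-Llama-3-8B"
args = ["--enforce-eager", "--max_model_len", "1024", "-pp", "1", "-tp", "1"]

# Three identical CLI settings; only the compile-level env var differs per run.
all_args = [args] * 3
all_envs = [{"VLLM_TORCH_COMPILE_LEVEL": str(level)} for level in range(3)]

compare_all_settings(model, all_args, all_envs)
```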
13 changes: 8 additions & 5 deletions tests/compile/test_full_graph.py
@@ -1,13 +1,16 @@
import pytest

from vllm.compilation.backends import vllm_backend

from ..utils import fork_new_process_for_each_test
from .utils import TEST_MODELS, check_full_graph_support


@pytest.mark.parametrize("model_info", TEST_MODELS)
@pytest.mark.parametrize("backend", ["eager", vllm_backend])
def test_full_graph(model_info, backend):
@pytest.mark.parametrize("optimization_level", [1, 2])
@fork_new_process_for_each_test
def test_full_graph(model_info, optimization_level):
model = model_info[0]
model_kwargs = model_info[1]
check_full_graph_support(model, model_kwargs, backend, tp_size=1)
check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1)
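
fork_new_process_for_each_test, imported from tests/utils.py, runs each parametrized case in its own process, presumably so the process-wide environment changes made by check_full_graph_support (see tests/compile/utils.py below) cannot leak between cases. A generic, Unix-only sketch of such a fork-per-test decorator (it relies on os.fork and is not the actual implementation in tests/utils.py):

```python
import os
from functools import wraps


def fork_per_test(test_fn):
    """Hypothetical sketch: run the wrapped test in a forked child process."""

    @wraps(test_fn)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child process: run the test, report success/failure via exit code.
            try:
                test_fn(*args, **kwargs)
                os._exit(0)
            except BaseException:
                os._exit(1)
        # Parent process: wait for the child and propagate its failure.
        _, status = os.waitpid(pid, 0)
        assert os.WEXITSTATUS(status) == 0, f"{test_fn.__name__} failed in child"

    return wrapper
```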
22 changes: 0 additions & 22 deletions tests/compile/test_full_graph_multi_gpu.py

This file was deleted.

13 changes: 0 additions & 13 deletions tests/compile/test_full_graph_smoke.py

This file was deleted.

22 changes: 10 additions & 12 deletions tests/compile/utils.py
@@ -4,14 +4,12 @@

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.plugins import set_torch_compile_backend
from vllm.utils import is_hip

TEST_MODELS_SMOKE = [
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
"quantization": "compressed-tensors"
}),
("meta-llama/Meta-Llama-3-8B", {}),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
["--quantization", "compressed-tensors"]),
("meta-llama/Meta-Llama-3-8B", []),
]

TEST_MODELS = [
@@ -68,20 +66,20 @@
}))


def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
def check_full_graph_support(model,
model_kwargs,
optimization_level,
tp_size=1):
# make sure these models can be captured in full graph mode
if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

# Inductor doesn't support fp8/gptq_marlin_24 yet.
quantization = model_kwargs.get("quantization")
if (quantization == "fp8" or quantization == "gptq_marlin"
or quantization == "gptq_marlin_24") and backend != "eager":
or quantization == "gptq_marlin_24") and optimization_level > 1:
return

set_torch_compile_backend(backend)

prompts = [
"Hello, my name is",
"The president of the United States is",
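
With this refactor, check_full_graph_support no longer receives a torch.compile backend via set_torch_compile_backend; the choice is expressed as an optimization level written to the VLLM_TORCH_COMPILE_LEVEL environment variable. A sketch of a call under the new signature; the model name and level are placeholders and the import path is assumed:

```python
from tests.compile.utils import check_full_graph_support  # assumed import path

# Placeholder values: any TEST_MODELS entry and a supported level would do.
check_full_graph_support("meta-llama/Meta-Llama-3-8B",
                         model_kwargs={},
                         optimization_level=2,
                         tp_size=1)
```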
111 changes: 76 additions & 35 deletions tests/utils.py
@@ -180,18 +180,34 @@ def compare_two_settings(model: str,
env1: The first set of environment variables to pass to the API server.
env2: The second set of environment variables to pass to the API server.
"""
compare_all_settings(model, [arg1, arg2], [env1, env2], max_wait_seconds)


def compare_all_settings(model: str,
all_args: List[List[str]],
all_envs: List[Optional[Dict[str, str]]],
max_wait_seconds: Optional[float] = None) -> None:
"""
    Launch the API server with several different sets of arguments/environments
    and compare the results of the API calls against those from the first set.

Args:
model: The model to test.
all_args: A list of argument lists to pass to the API server.
all_envs: A list of environment dictionaries to pass to the API server.
"""
trust_remote_code = "--trust-remote-code"
if trust_remote_code in arg1 or trust_remote_code in arg2:
if any(trust_remote_code in args for args in all_args):
tokenizer = AutoTokenizer.from_pretrained(model,
trust_remote_code=True)
else:
tokenizer = AutoTokenizer.from_pretrained(model)

prompt = "Hello, my name is"
token_ids = tokenizer(prompt)["input_ids"]
results = []
for args, env in ((arg1, env1), (arg2, env2)):
ref_results: List = []
for i, (args, env) in enumerate(zip(all_args, all_envs)):
compare_results: List = []
with RemoteOpenAIServer(model,
args,
env_dict=env,
@@ -202,10 +218,13 @@ def compare_two_settings(model: str,
models = client.models.list()
models = models.data
served_model = models[0]
results.append({
"test": "models_list",
"id": served_model.id,
"root": served_model.root,
(ref_results if i == 0 else compare_results).append({
"test":
"models_list",
"id":
served_model.id,
"root":
served_model.root,
})

# test with text prompt
@@ -214,11 +233,15 @@ def compare_two_settings(model: str,
max_tokens=5,
temperature=0.0)

results.append({
"test": "single_completion",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
(ref_results if i == 0 else compare_results).append({
"test":
"single_completion",
"text":
completion.choices[0].text,
"finish_reason":
completion.choices[0].finish_reason,
"usage":
completion.usage,
})

# test using token IDs
@@ -229,11 +252,15 @@ def compare_two_settings(model: str,
temperature=0.0,
)

results.append({
"test": "token_ids",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
(ref_results if i == 0 else compare_results).append({
"test":
"token_ids",
"text":
completion.choices[0].text,
"finish_reason":
completion.choices[0].finish_reason,
"usage":
completion.usage,
})

# test seeded random sampling
@@ -243,11 +270,15 @@ def compare_two_settings(model: str,
seed=33,
temperature=1.0)

results.append({
"test": "seeded_sampling",
"text": completion.choices[0].text,
"finish_reason": completion.choices[0].finish_reason,
"usage": completion.usage,
(ref_results if i == 0 else compare_results).append({
"test":
"seeded_sampling",
"text":
completion.choices[0].text,
"finish_reason":
completion.choices[0].finish_reason,
"usage":
completion.usage,
})

# test seeded random sampling with multiple prompts
@@ -257,7 +288,7 @@ def compare_two_settings(model: str,
seed=33,
temperature=1.0)

results.append({
(ref_results if i == 0 else compare_results).append({
"test":
"seeded_sampling",
"text": [choice.text for choice in completion.choices],
@@ -275,10 +306,13 @@ def compare_two_settings(model: str,
temperature=0.0,
)

results.append({
"test": "simple_list",
"text0": batch.choices[0].text,
"text1": batch.choices[1].text,
(ref_results if i == 0 else compare_results).append({
"test":
"simple_list",
"text0":
batch.choices[0].text,
"text1":
batch.choices[1].text,
})

# test streaming
@@ -294,18 +328,25 @@ def compare_two_settings(model: str,
assert len(chunk.choices) == 1
choice = chunk.choices[0]
texts[choice.index] += choice.text
results.append({
(ref_results if i == 0 else compare_results).append({
"test": "streaming",
"texts": texts,
})

n = len(results) // 2
arg1_results = results[:n]
arg2_results = results[n:]
for arg1_result, arg2_result in zip(arg1_results, arg2_results):
assert arg1_result == arg2_result, (
f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
f"{arg1_result=} != {arg2_result=}")
if i > 0:
# if any setting fails, raise an error early
ref_args = all_args[0]
ref_envs = all_envs[0]
compare_args = all_args[i]
compare_envs = all_envs[i]
for ref_result, compare_result in zip(ref_results,
compare_results):
assert ref_result == compare_result, (
f"Results for {model=} are not the same.\n"
f"{ref_args=} {ref_envs=}\n"
f"{compare_args=} {compare_envs=}\n"
f"{ref_result=}\n"
f"{compare_result=}\n")


def init_test_distributed_environment(
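
The refactored helper treats the first setting as the reference and compares each later setting against it as soon as that run finishes, so a divergence surfaces before the remaining servers are launched. A stripped-down sketch of that fail-fast comparison pattern, with the per-setting API calls abstracted into a caller-supplied function (names here are illustrative, not part of tests/utils.py):

```python
from typing import Callable, Dict, List, Optional


def compare_against_reference(
    all_envs: List[Optional[Dict[str, str]]],
    collect_results: Callable[[Optional[Dict[str, str]]], List[dict]],
) -> None:
    """Illustrative only: run 0 builds the reference; later runs fail fast."""
    ref_results: List[dict] = []
    for i, env in enumerate(all_envs):
        compare_results: List[dict] = []
        # The first run fills the reference list; later runs fill a fresh list.
        (ref_results if i == 0 else compare_results).extend(collect_results(env))
        if i > 0:
            for ref, cur in zip(ref_results, compare_results):
                assert ref == cur, (
                    f"setting {i} diverged from the reference: "
                    f"{ref!r} != {cur!r}")
```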