[Misc] Deprecation Warning when setting --engine-use-ray (vllm-project#7424)

wallashss · youkaichao · njhill · Alvant · commit 2ab11320c2bb · 2024-10-26T09:49:15.000+03:00
Signed-off-by: Wallas Santos &lt;wallashss@ibm.com&gt;
Co-authored-by: youkaichao &lt;youkaichao@gmail.com&gt;
Co-authored-by: Nick Hill &lt;nickhill@us.ibm.com&gt;
Co-authored-by: youkaichao &lt;youkaichao@126.com&gt;
Signed-off-by: Alvant &lt;alvasian@yandex.ru&gt;
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
@@ -1,3 +1,4 @@
+import os
 import subprocess
 import sys
 import time
@@ -35,11 +36,17 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
         "127.0.0.1", "--tokenizer-pool-size",
         str(tokenizer_pool_size)
     ]
+
+    # Copy the environment variables and append `VLLM_ALLOW_ENGINE_USE_RAY=1`
+    # to prevent `--engine-use-ray` raises an exception due to it deprecation
+    env_vars = os.environ.copy()
+    env_vars["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
+
     if engine_use_ray:
         commands.append("--engine-use-ray")
     if worker_use_ray:
         commands.append("--worker-use-ray")
-    uvicorn_process = subprocess.Popen(commands)
+    uvicorn_process = subprocess.Popen(commands, env=env_vars)
     yield
     uvicorn_process.terminate()
 
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
@@ -1,4 +1,5 @@
 import asyncio
+import os
 from dataclasses import dataclass
 
 import pytest
@@ -106,11 +107,16 @@ async def test_new_requests_event():
     assert engine.engine.add_request_calls == 3
     assert engine.engine.step_calls == old_step_calls + 1
 
+    # Allow deprecated engine_use_ray to not raise exception
+    os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
+
     engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
     assert engine.get_model_config() is not None
     assert engine.get_tokenizer() is not None
     assert engine.get_decoding_config() is not None
 
+    os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")
+
 
 def test_asyncio_run():
     wait_for_gpu_memory_to_clear(
diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
@@ -23,7 +23,11 @@ def server():
         str(chatml_jinja_path),
     ]
 
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # Allow `--engine-use-ray`, otherwise the launch of the server throw
+    # an error due to try to use a deprecated feature
+    env_dict = {"VLLM_ALLOW_ENGINE_USE_RAY": "1"}
+    with RemoteOpenAIServer(MODEL_NAME, args,
+                            env_dict=env_dict) as remote_server:
         yield remote_server
 
 
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
@@ -1,4 +1,5 @@
 import asyncio
+import os
 from itertools import cycle
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 
@@ -56,6 +57,11 @@ def __init__(
     ) -> None:
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
+
+        # Needed to engine_use_ray works as a deprecated feature,
+        # otherwise the following constructor will raise an exception
+        os.environ["VLLM_ALLOW_ENGINE_USE_RAY"] = "1"
+
         engine_args = AsyncEngineArgs(
             model=model,
             tokenizer=tokenizer,
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -923,7 +923,13 @@ def add_cli_args(parser: FlexibleArgumentParser,
         parser.add_argument('--engine-use-ray',
                             action='store_true',
                             help='Use Ray to start the LLM engine in a '
-                            'separate process as the server process.')
+                            'separate process as the server process.'
+                            '(DEPRECATED. This argument is deprecated '
+                            'and will be removed in a future update. '
+                            'Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to force '
+                            'use it. See '
+                            'https://github.com/vllm-project/vllm/issues/7045.'
+                            ')')
         parser.add_argument('--disable-log-requests',
                             action='store_true',
                             help='Disable logging requests.')
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
@@ -29,6 +29,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.usage.usage_lib import UsageContext
+from vllm.utils import print_warning_once
 
 logger = init_logger(__name__)
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
@@ -553,6 +554,20 @@ def __init__(self,
         self.log_requests = log_requests
         self.engine = self._init_engine(*args, **kwargs)
 
+        if self.engine_use_ray:
+            print_warning_once(
+                "DEPRECATED. `--engine-use-ray` is deprecated and will "
+                "be removed in a future update. "
+                "See https://github.com/vllm-project/vllm/issues/7045.")
+
+            if envs.VLLM_ALLOW_ENGINE_USE_RAY:
+                print_warning_once(
+                    "VLLM_ALLOW_ENGINE_USE_RAY is set, force engine use Ray")
+            else:
+                raise ValueError("`--engine-use-ray` is deprecated. "
+                                 "Set `VLLM_ALLOW_ENGINE_USE_RAY=1` to "
+                                 "force use it")
+
         self.background_loop: Optional[asyncio.Future] = None
         # We need to keep a reference to unshielded
         # task as well to prevent it from being garbage
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -55,6 +55,7 @@
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_TEST_FORCE_FP8_MARLIN: bool = False
+    VLLM_ALLOW_ENGINE_USE_RAY: bool = False
     VLLM_PLUGINS: Optional[List[str]] = None
 
 
@@ -364,6 +365,14 @@ def get_default_config_root():
     (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
      ("1", "true")),
 
+    # If set, allow running the engine as a separate ray actor,
+    # which is a deprecated feature soon to be removed.
+    # See https://github.com/vllm-project/vllm/issues/7045
+    "VLLM_ALLOW_ENGINE_USE_RAY":
+    lambda:
+    (os.environ.get("VLLM_ALLOW_ENGINE_USE_RAY", "0").strip().lower() in
+     ("1", "true")),
+
     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded