From 7faa2999a58e07895631ec49b4a98c2a0cc4626e Mon Sep 17 00:00:00 2001
From: roy
Date: Wed, 24 Apr 2024 23:08:39 +0800
Subject: [PATCH 1/8] fix

---
 vllm/engine/async_llm_engine.py               | 10 +++++++++-
 vllm/engine/llm_engine.py                     |  4 ++++
 vllm/entrypoints/openai/serving_chat.py       |  2 +-
 vllm/entrypoints/openai/serving_completion.py |  2 +-
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 3a2f7db67935..67b69c0f5884 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -7,7 +7,7 @@
 
 from transformers import PreTrainedTokenizer
 
-from vllm.config import ModelConfig
+from vllm.config import DecodingConfig, ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.ray_utils import initialize_ray_cluster, ray
@@ -697,6 +697,14 @@ async def get_model_config(self) -> ModelConfig:
         else:
             return self.engine.get_model_config()
 
+    async def get_decoding_config(self) -> DecodingConfig:
+        """Get the decoding configuration of the vLLM engine."""
+        if self.engine_use_ray:
+            return await self.engine.get_decoding_config.remote(  # type: ignore
+            )
+        else:
+            return self.engine.get_decoding_config()
+
     async def do_log_stats(self) -> None:
         if self.engine_use_ray:
             await self.engine.do_log_stats.remote()  # type: ignore
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 19e58fb1722c..1e207316c2f2 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -450,6 +450,10 @@ def get_model_config(self) -> ModelConfig:
         """Gets the model configuration."""
         return self.model_config
 
+    def get_decoding_config(self) -> DecodingConfig:
+        """Gets the decoding configuration."""
+        return self.decoding_config
+
     def get_num_unfinished_requests(self) -> int:
         """Gets the number of unfinished requests."""
         return self.scheduler.get_num_unfinished_seq_groups()
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 2ff335eb7107..41936f4ebd31 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -68,7 +68,7 @@ async def create_chat_completion(
                 request, prompt=prompt)
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
-            decoding_config = self.engine.engine.decoding_config
+            decoding_config = await self.engine.get_decoding_config()
             guided_decoding_backend = request.guided_decoding_backend \
                 or decoding_config.guided_decoding_backend
             guided_decode_logits_processor = (
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 211b2e0424c3..10645c874fff 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -88,7 +88,7 @@ async def create_completion(self, request: CompletionRequest,
         try:
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
-            decoding_config = self.engine.engine.decoding_config
+            decoding_config = await self.engine.get_decoding_config()
             guided_decoding_backend = request.guided_decoding_backend \
                 or decoding_config.guided_decoding_backend
             guided_decode_logit_processor = (

From 4f6d77742570e81d1375edb2f5acc30806790fac Mon Sep 17 00:00:00 2001
From: roy
Date: Wed, 24 Apr 2024 23:31:38 +0800
Subject: [PATCH 2/8] test

---
 tests/async_engine/test_async_llm_engine.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py
index cb125a7bfec3..b69cdc0a2140 100644
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -91,4 +91,6 @@ async def test_new_requests_event():
     assert engine.engine.step_calls == old_step_calls + 1
 
     engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
+    assert engine.get_model_config() is not None
     assert engine.get_tokenizer() is not None
+    assert engine.get_decoding_config() is not None
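The accessor added in PATCH 1 follows the same dispatch pattern as the
existing get_model_config: when the engine runs as a Ray actor
(engine_use_ray=True), the call has to go through the actor handle with
.remote(), and the resulting ObjectRef is awaited; otherwise the in-process
LLMEngine is called directly. A minimal sketch of that pattern, using a
hypothetical EngineProxy class rather than the real AsyncLLMEngine:

    class EngineProxy:
        """Simplified, illustrative stand-in for AsyncLLMEngine."""

        def __init__(self, engine, engine_use_ray: bool):
            # `engine` is either an in-process LLMEngine or a Ray actor handle.
            self.engine = engine
            self.engine_use_ray = engine_use_ray

        async def get_decoding_config(self):
            if self.engine_use_ray:
                # Ray actor call: .remote() returns an ObjectRef, which is
                # awaitable from inside an asyncio event loop.
                return await self.engine.get_decoding_config.remote()
            # In-process engine: a plain synchronous call.
            return self.engine.get_decoding_config()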
From 1b2e823481f570b392c18d4a5b3f0470f65e49ef Mon Sep 17 00:00:00 2001
From: roy
Date: Thu, 25 Apr 2024 21:15:41 +0800
Subject: [PATCH 3/8] add test

---
 tests/async_engine/test_api_server_ray.py | 154 ++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 tests/async_engine/test_api_server_ray.py

diff --git a/tests/async_engine/test_api_server_ray.py b/tests/async_engine/test_api_server_ray.py
new file mode 100644
index 000000000000..e761020b7b98
--- /dev/null
+++ b/tests/async_engine/test_api_server_ray.py
@@ -0,0 +1,154 @@
+# imports for guided decoding tests
+import os
+import subprocess
+import sys
+import time
+
+import openai  # use the official client for correctness check
+import pytest
+# using Ray for overall ease of process management, parallel requests,
+# and debugging.
+import ray
+import requests
+
+MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 600 seconds
+# any model with a chat template should work here
+MODEL_NAME = "facebook/opt-125m"
+
+@ray.remote(num_gpus=1)
+class ServerRunner:
+
+    def __init__(self, args):
+        env = os.environ.copy()
+        env["PYTHONUNBUFFERED"] = "1"
+        self.proc = subprocess.Popen(
+            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        self._wait_for_server()
+
+    def ready(self):
+        return True
+
+    def _wait_for_server(self):
+        # run health check
+        start = time.time()
+        while True:
+            try:
+                if requests.get(
+                        "http://localhost:8000/health").status_code == 200:
+                    break
+            except Exception as err:
+                if self.proc.poll() is not None:
+                    raise RuntimeError("Server exited unexpectedly.") from err
+
+                time.sleep(0.5)
+                if time.time() - start > MAX_SERVER_START_WAIT_S:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err
+
+    def __del__(self):
+        if hasattr(self, "proc"):
+            self.proc.terminate()
+
+@pytest.fixture(scope="session")
+def server():
+    ray.init()
+    server_runner = ServerRunner.remote([
+        "--model",
+        MODEL_NAME,
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--max-model-len",
+        "2048",
+        "--enforce-eager",
+        "--engine-use-ray"
+    ])
+    ray.get(server_runner.ready.remote())
+    yield server_runner
+    ray.shutdown()
+
+
+@pytest.fixture(scope="session")
+def client():
+    client = openai.AsyncOpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+    yield client
+
+
+@pytest.mark.asyncio
+async def test_check_models(server, client: openai.AsyncOpenAI):
+    models = await client.models.list()
+    models = models.data
+    served_model = models[0]
+    assert served_model.id == MODEL_NAME
+    assert all(model.root == MODEL_NAME for model in models)
+
+
+@pytest.mark.asyncio
+async def test_single_completion(server, client: openai.AsyncOpenAI):
+    completion = await client.completions.create(model=MODEL_NAME,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
+
+@pytest.mark.asyncio
+async def test_single_chat_session(server, client: openai.AsyncOpenAI):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role": "user",
+        "content": "what is 1+1?"
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert chat_completion.id is not None
+    assert chat_completion.choices is not None and len(
+        chat_completion.choices) == 1
+    assert chat_completion.choices[0].message is not None
+    assert chat_completion.choices[0].logprobs is not None
+    assert chat_completion.choices[0].logprobs.top_logprobs is not None
+    assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0

From 00c58ddb4199e1ff9e42de086e27017590ebe26e Mon Sep 17 00:00:00 2001
From: roy
Date: Thu, 25 Apr 2024 21:19:08 +0800
Subject: [PATCH 4/8] format

---
 tests/async_engine/test_api_server_ray.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/async_engine/test_api_server_ray.py b/tests/async_engine/test_api_server_ray.py
index e761020b7b98..4b97af88012b 100644
--- a/tests/async_engine/test_api_server_ray.py
+++ b/tests/async_engine/test_api_server_ray.py
@@ -15,6 +15,7 @@
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
 
+
 @ray.remote(num_gpus=1)
 class ServerRunner:
 
@@ -53,6 +54,7 @@ def __del__(self):
         if hasattr(self, "proc"):
             self.proc.terminate()
 
+
 @pytest.fixture(scope="session")
 def server():
     ray.init()
@@ -115,6 +117,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
     assert completion.choices[0].text is not None and len(
         completion.choices[0].text) >= 5
 
+
 @pytest.mark.asyncio
 async def test_single_chat_session(server, client: openai.AsyncOpenAI):
     messages = [{
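PATCHES 3 and 4 introduce the Ray-launched server test; its core mechanism is
the readiness loop in ServerRunner._wait_for_server, which polls the server's
/health endpoint until it answers 200 or the timeout expires. A condensed,
self-contained sketch of that loop (wait_for_health and its parameter names
are illustrative, not vLLM API):

    import time

    import requests

    def wait_for_health(url: str = "http://localhost:8000/health",
                        timeout_s: float = 600.0,
                        interval_s: float = 0.5) -> None:
        deadline = time.time() + timeout_s
        while time.time() < deadline:
            try:
                if requests.get(url).status_code == 200:
                    return  # server is up
            except requests.exceptions.ConnectionError:
                pass  # socket not accepting connections yet; keep polling
            time.sleep(interval_s)
        raise RuntimeError("Server failed to start in time.")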
From af498e7e623dea6de2bb5a53ce8bb74d50b2e917 Mon Sep 17 00:00:00 2001
From: roy
Date: Fri, 26 Apr 2024 20:27:31 +0800
Subject: [PATCH 5/8] change folder

---
 .../test_openapi_server_ray.py}               | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{async_engine/test_api_server_ray.py => entrypoints/test_openapi_server_ray.py} (100%)

diff --git a/tests/async_engine/test_api_server_ray.py b/tests/entrypoints/test_openapi_server_ray.py
similarity index 100%
rename from tests/async_engine/test_api_server_ray.py
rename to tests/entrypoints/test_openapi_server_ray.py

From 668362abea7195a538976c96d1a3555eac97bec8 Mon Sep 17 00:00:00 2001
From: roy
Date: Sat, 27 Apr 2024 17:50:54 +0800
Subject: [PATCH 6/8] fix test

---
 tests/entrypoints/test_openapi_server_ray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/entrypoints/test_openapi_server_ray.py b/tests/entrypoints/test_openapi_server_ray.py
index 4b97af88012b..78d8f3394a2e 100644
--- a/tests/entrypoints/test_openapi_server_ray.py
+++ b/tests/entrypoints/test_openapi_server_ray.py
@@ -57,7 +57,7 @@ def __del__(self):
 
 @pytest.fixture(scope="session")
 def server():
-    ray.init()
+    ray.init(ignore_reinit_error=True)
     server_runner = ServerRunner.remote([
         "--model",
         MODEL_NAME,
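PATCH 6 exists because ray.init() raises a RuntimeError if Ray has already
been initialized in the current process, which can happen when several test
modules share one pytest worker; ignore_reinit_error=True downgrades the
repeat call to a warning. A minimal illustration (assuming a local Ray
installation):

    import ray

    ray.init(ignore_reinit_error=True)  # starts (or connects to) Ray
    ray.init(ignore_reinit_error=True)  # repeat call: warning, not an error
    ray.shutdown()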
From 9b141e059a86501a3b94f9b82755f3cbe926647d Mon Sep 17 00:00:00 2001
From: roy
Date: Sat, 27 Apr 2024 18:43:00 +0800
Subject: [PATCH 7/8] change scope

---
 tests/entrypoints/test_openapi_server_ray.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/entrypoints/test_openapi_server_ray.py b/tests/entrypoints/test_openapi_server_ray.py
index 78d8f3394a2e..afa2bbf31262 100644
--- a/tests/entrypoints/test_openapi_server_ray.py
+++ b/tests/entrypoints/test_openapi_server_ray.py
@@ -55,7 +55,7 @@ def __del__(self):
         self.proc.terminate()
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
 def server():
     ray.init(ignore_reinit_error=True)
     server_runner = ServerRunner.remote([
@@ -74,7 +74,7 @@ def server():
     ray.shutdown()
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
 def client():
     client = openai.AsyncOpenAI(
         base_url="http://localhost:8000/v1",

From 7e80f5845052eb271ffb50eb83bd2dcb8ac5ee1c Mon Sep 17 00:00:00 2001
From: roy
Date: Sat, 27 Apr 2024 18:56:10 +0800
Subject: [PATCH 8/8] resolve ray conflict

---
 .../test_openapi_server_ray.py                | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename tests/{entrypoints => async_engine}/test_openapi_server_ray.py (98%)

diff --git a/tests/entrypoints/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
similarity index 98%
rename from tests/entrypoints/test_openapi_server_ray.py
rename to tests/async_engine/test_openapi_server_ray.py
index afa2bbf31262..4b97af88012b 100644
--- a/tests/entrypoints/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -55,9 +55,9 @@ def __del__(self):
         self.proc.terminate()
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="session")
 def server():
-    ray.init(ignore_reinit_error=True)
+    ray.init()
     server_runner = ServerRunner.remote([
         "--model",
         MODEL_NAME,
@@ -74,7 +74,7 @@ def server():
     ray.shutdown()
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="session")
 def client():
     client = openai.AsyncOpenAI(
         base_url="http://localhost:8000/v1",
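Taken together, the series lets the OpenAI-compatible endpoints work when the
engine runs as a Ray actor: instead of reaching into self.engine.engine
(which does not exist in Ray mode, where self.engine is an actor handle), the
serving layer awaits the new accessor and then resolves the guided decoding
backend. A sketch of that resolution, with `request` and `engine` standing in
for the real ChatCompletionRequest and AsyncLLMEngine objects:

    async def resolve_guided_backend(request, engine) -> str:
        # Per-request override wins; otherwise fall back to the engine-wide
        # default carried by DecodingConfig.
        decoding_config = await engine.get_decoding_config()
        return (request.guided_decoding_backend
                or decoding_config.guided_decoding_backend)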