diff --git a/format.sh b/format.sh
index aaec25a8aa0d..d110855f8c27 100755
--- a/format.sh
+++ b/format.sh
@@ -113,8 +113,11 @@ mypy vllm/logging --config-file pyproject.toml
 mypy vllm/model_executor --config-file pyproject.toml
 
+# If git diff returns a file that is in the skip list, the file may be checked anyway:
+# https://github.com/codespell-project/codespell/issues/1915
+# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem
 CODESPELL_EXCLUDES=(
-    '--skip' '*docs/source/_build/**,./tests/lora/data'
+    '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,tests/lora/data/**,build/**'
 )
 
 # check spelling of specified files
diff --git a/tests/async_engine/test_merge_async_iterators.py b/tests/async_engine/test_merge_async_iterators.py
deleted file mode 100644
index ea453526c77f..000000000000
--- a/tests/async_engine/test_merge_async_iterators.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import asyncio
-from typing import AsyncIterator, Tuple
-
-import pytest
-
-from vllm.utils import merge_async_iterators
-
-
-@pytest.mark.asyncio
-async def test_merge_async_iterators():
-
-    async def mock_async_iterator(idx: int) -> AsyncIterator[str]:
-        try:
-            while True:
-                yield f"item from iterator {idx}"
-                await asyncio.sleep(0.1)
-        except asyncio.CancelledError:
-            pass
-
-    iterators = [mock_async_iterator(i) for i in range(3)]
-    merged_iterator: AsyncIterator[Tuple[int, str]] = merge_async_iterators(
-        *iterators)
-
-    async def stream_output(generator: AsyncIterator[Tuple[int, str]]):
-        async for idx, output in generator:
-            print(f"idx: {idx}, output: {output}")
-
-    task = asyncio.create_task(stream_output(merged_iterator))
-    await asyncio.sleep(0.5)
-    task.cancel()
-    with pytest.raises(asyncio.CancelledError):
-        await task
-
-    for iterator in iterators:
-        try:
-            await asyncio.wait_for(anext(iterator), 1)
-        except StopAsyncIteration:
-            # All iterators should be cancelled and print this message.
-            print("Iterator was cancelled normally")
-        except (Exception, asyncio.CancelledError) as e:
-            raise AssertionError() from e
diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index 7a8d4b391561..dbd6acc2775a 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -4,16 +4,22 @@
 # and debugging.
 import ray
 
-from ..utils import ServerRunner
+from ..utils import VLLM_PATH, RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "facebook/opt-125m"
 
 
 @pytest.fixture(scope="module")
-def server():
-    ray.init()
-    server_runner = ServerRunner.remote([
+def ray_ctx():
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    yield
+    ray.shutdown()
+
+
+@pytest.fixture(scope="module")
+def server(ray_ctx):
+    yield RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
         # use half precision for speed and memory savings in CI environment
@@ -24,22 +30,15 @@ def server():
         "--enforce-eager",
         "--engine-use-ray"
     ])
-    ray.get(server_runner.ready.remote())
-    yield server_runner
-    ray.shutdown()
 
 
 @pytest.fixture(scope="module")
-def client():
-    client = openai.AsyncOpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
-    yield client
+def client(server):
+    yield server.get_async_client()
 
 
 @pytest.mark.asyncio
-async def test_check_models(server, client: openai.AsyncOpenAI):
+async def test_check_models(client: openai.AsyncOpenAI):
     models = await client.models.list()
     models = models.data
     served_model = models[0]
@@ -48,7 +47,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_single_completion(server, client: openai.AsyncOpenAI):
+async def test_single_completion(client: openai.AsyncOpenAI):
     completion = await client.completions.create(model=MODEL_NAME,
                                                   prompt="Hello, my name is",
                                                   max_tokens=5,
@@ -74,7 +73,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_single_chat_session(server, client: openai.AsyncOpenAI):
+async def test_single_chat_session(client: openai.AsyncOpenAI):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index 88cd4f98091f..ebab99bf2906 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -136,7 +136,6 @@ def test_append_slot_cow():
                       inputs={
                           "prompt": "one two three",
                           "prompt_token_ids": [1, 2, 3],
-                          "multi_modal_data": None
                       },
                       block_size=block_size)
 
@@ -311,7 +310,6 @@ def test_sliding_window_multi_seq():
                       inputs={
                           "prompt": "one two three",
                           "prompt_token_ids": [0, 1, 2],
-                          "multi_modal_data": None
                       },
                       block_size=block_size)
     seq_group = SequenceGroup(request_id="1",
diff --git a/tests/core/utils.py b/tests/core/utils.py
index 1c5724090b69..b7b93542ad7a 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -25,7 +25,6 @@ def create_dummy_prompt(
                       inputs={
                           "prompt": prompt_str,
                           "prompt_token_ids": prompt_tokens,
-                          "multi_modal_data": None,
                       },
                       block_size=block_size)
     seq_group = SequenceGroup(request_id=request_id,
@@ -57,11 +56,7 @@ def create_seq_group(
     for seq_id_offset, output_len in enumerate(seq_output_lens):
         seq = Sequence(
             seq_id=seq_id_start + seq_id_offset,
-            inputs={
-                "prompt": "",
-                "prompt_token_ids": prompt_token_ids,
-                "multi_modal_data": None,
-            },
+            inputs={"prompt_token_ids": prompt_token_ids},
             block_size=16,
         )
 
diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py
index 1d9c878ddde5..f795403e3d8a 100644
--- a/tests/engine/output_processor/test_stop_checker.py
+++ b/tests/engine/output_processor/test_stop_checker.py
@@ -15,11 +15,7 @@ def sequence_with_eos(text: str, eos_token: str,
     """
     seq = Sequence(
         seq_id=0,
-        inputs={
-            "prompt": "",
-            "prompt_token_ids": [],
-            "multi_modal_data": None,
-        },
inputs={"prompt_token_ids": []}, block_size=16, eos_token_id=eos_token_id, ) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 2463ccde2bc8..ea6bf1ad6621 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -15,7 +15,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer -from ..utils import ServerRunner +from ..utils import VLLM_PATH, RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -80,9 +80,15 @@ def zephyr_lora_files(): @pytest.fixture(scope="module") -def server(zephyr_lora_files): - ray.init() - server_runner = ServerRunner.remote([ +def ray_ctx(): + ray.init(runtime_env={"working_dir": VLLM_PATH}) + yield + ray.shutdown() + + +@pytest.fixture(scope="module") +def server(zephyr_lora_files, ray_ctx): + yield RemoteOpenAIServer([ "--model", MODEL_NAME, # use half precision for speed and memory savings in CI environment @@ -90,6 +96,8 @@ def server(zephyr_lora_files): "bfloat16", "--max-model-len", "8192", + "--gpu-memory-utilization", + "0.45", "--enforce-eager", "--gpu-memory-utilization", "0.75", @@ -105,16 +113,11 @@ def server(zephyr_lora_files): "--max-num-seqs", "128", ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() @pytest.fixture(scope="module") -def embedding_server(zephyr_lora_files): - ray.shutdown() - ray.init() - server_runner = ServerRunner.remote([ +def embedding_server(ray_ctx): + yield RemoteOpenAIServer([ "--model", EMBEDDING_MODEL_NAME, # use half precision for speed and memory savings in CI environment @@ -125,23 +128,23 @@ def embedding_server(zephyr_lora_files): "0.75", "--max-model-len", "8192", + "--gpu-memory-utilization", + "0.45", + "--enforce-eager", ]) - ray.get(server_runner.ready.remote()) - yield server_runner - ray.shutdown() @pytest.fixture(scope="module") -def client(): - client = openai.AsyncOpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", - ) - yield client +def client(server): + yield server.get_async_client() -@pytest.mark.asyncio -async def test_check_models(server, client: openai.AsyncOpenAI): +@pytest.fixture(scope="module") +def embedding_client(embedding_server): + yield embedding_server.get_async_client() + + +async def test_check_models(client: openai.AsyncOpenAI): models = await client.models.list() models = models.data served_model = models[0] @@ -158,8 +161,7 @@ async def test_check_models(server, client: openai.AsyncOpenAI): "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_single_completion(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): completion = await client.completions.create(model=model_name, prompt="Hello, my name is", max_tokens=5, @@ -190,8 +192,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) -async def test_zero_logprobs(server, client: openai.AsyncOpenAI, - model_name: str): +async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( model=MODEL_NAME, @@ -212,7 +213,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) -async def test_single_chat_session(server, client: openai.AsyncOpenAI, +async def test_single_chat_session(client: openai.AsyncOpenAI, 
                                    model_name: str):
     messages = [{
         "role": "system",
@@ -253,8 +254,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_too_many_logprobs(server, client: openai.AsyncOpenAI,
-                                 model_name: str):
+async def test_too_many_logprobs(client: openai.AsyncOpenAI, model_name: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -313,7 +313,7 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_completion_streaming(server, client: openai.AsyncOpenAI,
+async def test_completion_streaming(client: openai.AsyncOpenAI,
                                     model_name: str):
     prompt = "What is an LLM?"
 
@@ -351,8 +351,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_chat_streaming(server, client: openai.AsyncOpenAI,
-                              model_name: str):
+async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -402,8 +401,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
     "model_name",
     [MODEL_NAME, "zephyr-lora"],
 )
-async def test_batch_completions(server, client: openai.AsyncOpenAI,
-                                 model_name: str):
+async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
     # test simple list
     batch = await client.completions.create(
         model=model_name,
@@ -451,7 +449,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_logits_bias(server, client: openai.AsyncOpenAI):
+async def test_logits_bias(client: openai.AsyncOpenAI):
     prompt = "Hello, my name is"
     max_tokens = 5
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
@@ -501,7 +499,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_json_completion(client: openai.AsyncOpenAI,
                                       guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -524,7 +522,7 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                 guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -571,7 +569,7 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_regex_completion(client: openai.AsyncOpenAI,
                                        guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -592,7 +590,7 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_regex_chat(client: openai.AsyncOpenAI,
                                  guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -630,7 +628,7 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_completion(client: openai.AsyncOpenAI,
                                         guided_decoding_backend: str):
     completion = await client.completions.create(
         model=MODEL_NAME,
@@ -650,7 +648,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_chat(client: openai.AsyncOpenAI,
                                   guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -689,7 +687,7 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
+async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
                                           guided_decoding_backend: str):
     with pytest.raises(openai.BadRequestError):
         _ = await client.completions.create(
@@ -725,7 +723,7 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend",
                          ["outlines", "lm-format-enforcer"])
-async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
+async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
                                            guided_decoding_backend: str):
     messages = [{
         "role": "system",
@@ -754,7 +752,7 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
+async def test_response_format_json_object(client: openai.AsyncOpenAI):
     for _ in range(2):
         resp = await client.chat.completions.create(
             model=MODEL_NAME,
@@ -772,7 +770,7 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_extra_fields(server, client: openai.AsyncOpenAI):
+async def test_extra_fields(client: openai.AsyncOpenAI):
     with pytest.raises(BadRequestError) as exc_info:
         await client.chat.completions.create(
             model=MODEL_NAME,
@@ -788,7 +786,7 @@ async def test_extra_fields(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_complex_message_content(server, client: openai.AsyncOpenAI):
+async def test_complex_message_content(client: openai.AsyncOpenAI):
     resp = await client.chat.completions.create(
         model=MODEL_NAME,
         messages=[{
@@ -808,7 +806,7 @@ async def test_complex_message_content(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_custom_role(server, client: openai.AsyncOpenAI):
+async def test_custom_role(client: openai.AsyncOpenAI):
     # Not sure how the model handles custom roles so we just check that
     # both string and complex message content are handled in the same way
 
@@ -839,7 +837,7 @@ async def test_custom_role(server, client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
-async def test_guided_grammar(server, client: openai.AsyncOpenAI):
+async def test_guided_grammar(client: openai.AsyncOpenAI):
     simple_sql_grammar = """
 start: select_statement
 
@@ -879,7 +877,7 @@ async def test_guided_grammar(server, client: openai.AsyncOpenAI):
     "model_name",
     [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
-async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
+async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
                                        model_name: str):
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
     # test using text and token IDs
@@ -906,7 +904,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
 
 
 @pytest.mark.asyncio
-async def test_long_seed(server, client: openai.AsyncOpenAI):
+async def test_long_seed(client: openai.AsyncOpenAI):
     for seed in [
             torch.iinfo(torch.long).min - 1,
             torch.iinfo(torch.long).max + 1
@@ -930,14 +928,14 @@ async def test_long_seed(server, client: openai.AsyncOpenAI):
     "model_name",
     [EMBEDDING_MODEL_NAME],
 )
-async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
+async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
                                 model_name: str):
     input = [
         "The chef prepared a delicious meal.",
     ]
 
     # test single embedding
-    embeddings = await client.embeddings.create(
+    embeddings = await embedding_client.embeddings.create(
         model=model_name,
         input=input,
         encoding_format="float",
@@ -969,14 +967,14 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI,
     "model_name",
     [EMBEDDING_MODEL_NAME],
 )
-async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
+async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
                                model_name: str):
     # test List[str]
     inputs = [
         "The cat sat on the mat.", "A feline was resting on a rug.",
         "Stars twinkle brightly in the night sky."
     ]
-    embeddings = await client.embeddings.create(
+    embeddings = await embedding_client.embeddings.create(
         model=model_name,
         input=inputs,
         encoding_format="float",
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 1579d53a7fe2..5a60778bb186 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -6,7 +6,6 @@
 
 import openai
 import pytest
-import ray
 import torch
 
 from vllm import SamplingParams
@@ -18,7 +17,7 @@
                                                           open_stream,
                                                           serialize_vllm_model)
 
-from ..utils import ServerRunner
+from ..utils import RemoteOpenAIServer
 
 # yapf conflicts with isort for this docstring
 
@@ -206,18 +205,13 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
     openai_args = [
         "--model", model_ref, "--dtype", "float16", "--load-format",
         "tensorizer", "--model-loader-extra-config",
-        json.dumps(model_loader_extra_config), "--port", "8000"
+        json.dumps(model_loader_extra_config),
     ]
 
-    server = ServerRunner.remote(openai_args)
-
-    assert ray.get(server.ready.remote())
+    server = RemoteOpenAIServer(openai_args)
     print("Server ready.")
 
-    client = openai.OpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="token-abc123",
-    )
+    client = server.get_client()
     completion = client.completions.create(model=model_ref,
                                            prompt="Hello, my name is",
                                            max_tokens=5,
diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py
index 97864af88e40..0fbe3dae1ff0 100644
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -74,7 +74,6 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
                        inputs={
                            "prompt": prompt,
                            "prompt_token_ids": prompt_token_ids,
-                           "multi_modal_data": None,
                        },
                        block_size=block_size,
                        eos_token_id=tokenizer.tokenizer.eos_token_id,
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 54dc5c6f5bfb..cae7e3bf797b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,9 +1,64 @@
+import asyncio
+import sys
+from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Protocol,
+                    Tuple, TypeVar)
+
 import pytest
 
-from vllm.utils import deprecate_kwargs
+from vllm.utils import deprecate_kwargs, merge_async_iterators
 
 from .utils import error_on_warning
 
+if sys.version_info < (3, 10):
+    if TYPE_CHECKING:
+        _AwaitableT = TypeVar("_AwaitableT", bound=Awaitable[Any])
+        _AwaitableT_co = TypeVar("_AwaitableT_co",
+                                 bound=Awaitable[Any],
+                                 covariant=True)
+
+        class _SupportsSynchronousAnext(Protocol[_AwaitableT_co]):
+
+            def __anext__(self) -> _AwaitableT_co:
+                ...
+
+    def anext(i: _SupportsSynchronousAnext[_AwaitableT], /) -> _AwaitableT:
+        return i.__anext__()
+
+
+@pytest.mark.asyncio
+async def test_merge_async_iterators():
+
+    async def mock_async_iterator(idx: int) -> AsyncIterator[str]:
+        try:
+            while True:
+                yield f"item from iterator {idx}"
+                await asyncio.sleep(0.1)
+        except asyncio.CancelledError:
+            pass
+
+    iterators = [mock_async_iterator(i) for i in range(3)]
+    merged_iterator: AsyncIterator[Tuple[int, str]] = merge_async_iterators(
+        *iterators)
+
+    async def stream_output(generator: AsyncIterator[Tuple[int, str]]):
+        async for idx, output in generator:
+            print(f"idx: {idx}, output: {output}")
+
+    task = asyncio.create_task(stream_output(merged_iterator))
+    await asyncio.sleep(0.5)
+    task.cancel()
+    with pytest.raises(asyncio.CancelledError):
+        await task
+
+    for iterator in iterators:
+        try:
+            await asyncio.wait_for(anext(iterator), 1)
+        except StopAsyncIteration:
+            # All iterators should be cancelled and print this message.
+            print("Iterator was cancelled normally")
+        except (Exception, asyncio.CancelledError) as e:
+            raise AssertionError() from e
+
 
 def test_deprecate_kwargs_always():
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 1d4c74d6bd8d..8d019fe5f38c 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -126,7 +126,6 @@ def create_sequence(prompt_token_ids=None):
         inputs={
             "prompt": "",
             "prompt_token_ids": prompt_token_ids,
-            "multi_modal_data": None,
         },
         block_size=16,
     )
diff --git a/tests/utils.py b/tests/utils.py
index 329842911e15..530a67749fc6 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,56 +4,107 @@
 import time
 import warnings
 from contextlib import contextmanager
+from typing import List
 
+import openai
 import ray
 import requests
 
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment)
+from vllm.entrypoints.openai.cli_args import make_arg_parser
 from vllm.utils import get_open_port
 
 # Path to root of repository so that utilities can be imported by ray workers
 VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
 
 
-@ray.remote(num_gpus=1)
-class ServerRunner:
+class RemoteOpenAIServer:
+    DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
     MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
 
-    def __init__(self, args):
-        env = os.environ.copy()
-        env["PYTHONUNBUFFERED"] = "1"
-        self.proc = subprocess.Popen(
-            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
+    @ray.remote(num_gpus=1)
+    class _RemoteRunner:
+
+        def __init__(self, cli_args: List[str], *, wait_url: str,
+                     wait_timeout: float) -> None:
+            env = os.environ.copy()
+            env["PYTHONUNBUFFERED"] = "1"
+            self.proc = subprocess.Popen(
+                ["python3", "-m", "vllm.entrypoints.openai.api_server"] \
+                    + cli_args,
+                env=env,
+                stdout=sys.stdout,
+                stderr=sys.stderr,
+            )
+
+            self._wait_for_server(url=wait_url, timeout=wait_timeout)
+
+        def ready(self):
+            return True
+
+        def _wait_for_server(self, *, url: str, timeout: float):
+            # run health check
+            start = time.time()
+            while True:
+                try:
+                    if requests.get(url).status_code == 200:
+                        break
+                except Exception as err:
+                    if self.proc.poll() is not None:
+                        raise RuntimeError(
+                            "Server exited unexpectedly.") from err
+
+                    time.sleep(0.5)
+                    if time.time() - start > timeout:
+                        raise RuntimeError(
+                            "Server failed to start in time.") from err
+
+        def __del__(self):
+            if hasattr(self, "proc"):
+                self.proc.terminate()
+
+    def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
+        if auto_port:
+            if "-p" in cli_args or "--port" in cli_args:
+                raise ValueError("You have manually specified the port "
+                                 "when `auto_port=True`.")
+
+            cli_args = cli_args + ["--port", str(get_open_port())]
+
+        parser = make_arg_parser()
+        args = parser.parse_args(cli_args)
+        self.host = str(args.host or 'localhost')
+        self.port = int(args.port)
+
+        self._runner = self._RemoteRunner.remote(
+            cli_args,
+            wait_url=self.url_for("health"),
+            wait_timeout=self.MAX_SERVER_START_WAIT_S)
+
+        self._wait_until_ready()
+
+    @property
+    def url_root(self) -> str:
+        return f"http://{self.host}:{self.port}"
+
+    def url_for(self, *parts: str) -> str:
+        return self.url_root + "/" + "/".join(parts)
+
+    def _wait_until_ready(self) -> None:
+        ray.get(self._runner.ready.remote())
+
+    def get_client(self):
+        return openai.OpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
+        )
+
+    def get_async_client(self):
+        return openai.AsyncOpenAI(
+            base_url=self.url_for("v1"),
+            api_key=self.DUMMY_API_KEY,
         )
-        self._wait_for_server()
-
-    def ready(self):
-        return True
-
-    def _wait_for_server(self):
-        # run health check
-        start = time.time()
-        while True:
-            try:
-                if requests.get(
-                        "http://localhost:8000/health").status_code == 200:
-                    break
-            except Exception as err:
-                if self.proc.poll() is not None:
-                    raise RuntimeError("Server exited unexpectedly.") from err
-
-                time.sleep(0.5)
-                if time.time() - start > self.MAX_SERVER_START_WAIT_S:
-                    raise RuntimeError(
-                        "Server failed to start in time.") from err
-
-    def __del__(self):
-        if hasattr(self, "proc"):
-            self.proc.terminate()
 
 
 def init_test_distributed_environment(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 08bccf209b7c..c76960a5b2a6 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -6,7 +6,6 @@
 
 from transformers import GenerationConfig, PreTrainedTokenizer
 
-import vllm
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
                          LoRAConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig, SpeculativeConfig,
@@ -37,7 +36,7 @@
                                                      get_tokenizer_group)
 from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled,
                                   usage_message)
-from vllm.utils import Counter
+from vllm.utils import Counter, get_vllm_version
 
 logger = init_logger(__name__)
 _LOCAL_LOGGING_INTERVAL_SEC = 5
@@ -169,7 +168,7 @@ def __init__(
             "enforce_eager=%s, kv_cache_dtype=%s, "
             "quantization_param_path=%s, device_config=%s, "
             "decoding_config=%r, seed=%d, served_model_name=%s)",
-            vllm.__version__,
+            get_vllm_version(),
             model_config.model,
             speculative_config,
             model_config.tokenizer,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 97b35262329e..5aa82488e459 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -15,7 +15,6 @@
 from prometheus_client import make_asgi_app
 from starlette.routing import Mount
 
-import vllm
 import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -29,6 +28,7 @@
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
+from vllm.utils import get_vllm_version
 
 TIMEOUT_KEEP_ALIVE = 5  # seconds
 
@@ -93,7 +93,7 @@ async def show_available_models():
 
 @app.get("/version")
 async def show_version():
-    ver = {"version": vllm.__version__}
+    ver = {"version": get_vllm_version()}
     return JSONResponse(content=ver)
 
 
@@ -174,7 +174,7 @@ async def authentication(request: Request, call_next):
             raise ValueError(f"Invalid middleware {middleware}. "
                              f"Must be a function or a class.")
 
-    logger.info("vLLM API server version %s", vllm.__version__)
+    logger.info("vLLM API server version %s", get_vllm_version())
     logger.info("args: %s", args)
 
     if args.served_model_name is not None:
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index 731f4f4a4028..243a5fe49a5b 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -5,7 +5,6 @@
 
 import aiohttp
 
-import vllm
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (BatchRequestInput,
@@ -14,7 +13,7 @@
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import random_uuid
+from vllm.utils import get_vllm_version, random_uuid
 
 logger = init_logger(__name__)
 
@@ -135,7 +134,7 @@ async def main(args):
 
 if __name__ == "__main__":
     args = parse_args()
-    logger.info("vLLM API server version %s", vllm.__version__)
+    logger.info("vLLM API server version %s", get_vllm_version())
     logger.info("args: %s", args)
 
     asyncio.run(main(args))
diff --git a/vllm/inputs.py b/vllm/inputs.py
index f5d99b1b66b7..85c9cd84f5ed 100644
--- a/vllm/inputs.py
+++ b/vllm/inputs.py
@@ -126,5 +126,5 @@ class TextTokensPrompt(TypedDict):
 
 class LLMInputs(TypedDict):
     prompt_token_ids: List[int]
-    prompt: Optional[str]
-    multi_modal_data: Optional["MultiModalData"]
+    prompt: NotRequired[Optional[str]]
+    multi_modal_data: NotRequired[Optional["MultiModalData"]]
diff --git a/vllm/sequence.py b/vllm/sequence.py
index f8e9da6c7965..d9a9dea9a056 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -249,7 +249,7 @@ def __init__(
 
     @property
     def prompt(self) -> Optional[str]:
-        return self.inputs["prompt"]
+        return self.inputs.get("prompt")
 
     @property
     def prompt_token_ids(self) -> List[int]:
@@ -257,7 +257,7 @@ def prompt_token_ids(self) -> List[int]:
 
     @property
     def multi_modal_data(self) -> Optional["MultiModalData"]:
-        return self.inputs["multi_modal_data"]
+        return self.inputs.get("multi_modal_data")
 
     @property
     def lora_int_id(self) -> int:
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 40a954a29493..80da1a87a0b8 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -16,6 +16,7 @@
 import torch
 
 import vllm.envs as envs
+from vllm.utils import get_vllm_version
 
 _config_home = envs.VLLM_CONFIG_ROOT
 _USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
@@ -163,9 +164,8 @@ def _report_usage_once(self, model_architecture: str,
         ])
 
         # vLLM information
-        import vllm  # delayed import to prevent circular import
         self.context = usage_context.value
-        self.vllm_version = vllm.__version__
+        self.vllm_version = get_vllm_version()
         self.model_architecture = model_architecture
 
         # Metadata
diff --git a/vllm/utils.py b/vllm/utils.py
index 85e045cb3b76..edc4da11a4b7 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -5,6 +5,7 @@
 import os
 import socket
 import subprocess
+import sys
 import tempfile
 import threading
 import uuid
@@ -167,6 +168,13 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
+def get_vllm_version() -> str:
+    # Avoid circular import
+    import vllm
+
+    return vllm.__version__
+
+
 @lru_cache(maxsize=None)
 def get_vllm_instance_id():
     """
@@ -234,9 +242,11 @@ async def consumer():
                 yield item
         except (Exception, asyncio.CancelledError) as e:
             for task in _tasks:
-                # NOTE: Pass the error msg in cancel()
-                # when only Python 3.9+ is supported.
-                task.cancel()
+                if sys.version_info >= (3, 9):
+                    # msg parameter only supported in Python 3.9+
+                    task.cancel(e)
+                else:
+                    task.cancel()
             raise e
 
     await asyncio.gather(*_tasks)
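
Note: the sketch below is not part of the diff. It is a minimal illustration of how the new RemoteOpenAIServer helper from tests/utils.py is meant to be consumed by a test module, mirroring the fixtures in tests/async_engine/test_openapi_server_ray.py above; the model name and CLI flags are illustrative placeholders, not part of this change.

import openai
import pytest
import ray

from ..utils import VLLM_PATH, RemoteOpenAIServer

MODEL_NAME = "facebook/opt-125m"  # illustrative; any small chat-capable model works


@pytest.fixture(scope="module")
def ray_ctx():
    # Ship the repository to Ray workers so `tests.utils` is importable remotely.
    ray.init(runtime_env={"working_dir": VLLM_PATH})
    yield
    ray.shutdown()


@pytest.fixture(scope="module")
def server(ray_ctx):
    # RemoteOpenAIServer picks a free --port itself (auto_port=True) and
    # blocks on the /health endpoint before the fixture yields, so tests
    # never race against server startup.
    yield RemoteOpenAIServer([
        "--model",
        MODEL_NAME,
        "--enforce-eager",
    ])


@pytest.fixture(scope="module")
def client(server):
    # AsyncOpenAI client already pointed at the server's base URL and
    # configured with the dummy API key.
    yield server.get_async_client()


@pytest.mark.asyncio
async def test_served_model_is_listed(client: openai.AsyncOpenAI):
    models = await client.models.list()
    assert models.data[0].id == MODEL_NAME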