From 73ae9bf6c37341526e8faa7a55b3839acb354fe9 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 22 Feb 2025 08:34:53 -0500 Subject: [PATCH 1/4] add all lora functions Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_add_lora.py | 8 +- tests/lora/test_lora_functions.py | 138 ++++++++++++++++++++++ vllm/v1/engine/async_llm.py | 18 ++- vllm/v1/engine/core.py | 15 ++- vllm/v1/engine/core_client.py | 63 ++++++++-- vllm/v1/engine/llm_engine.py | 18 ++- vllm/v1/worker/gpu_worker.py | 11 +- vllm/v1/worker/lora_model_runner_mixin.py | 17 ++- 8 files changed, 266 insertions(+), 22 deletions(-) create mode 100644 tests/lora/test_lora_functions.py diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 2b421bfd9eb8..a31e309213d5 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -144,10 +144,10 @@ async def test_add_lora(): await requests_processing_time(llm, dummy_run_requests) # Run with warmup - for lr in warmup_run_requests: - await llm.add_lora(lr) - # Wait for the add_lora function to complete on the server side. - await asyncio.sleep(30) + add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests] + add_lora_results = await asyncio.gather(*add_lora_tasks) + # Test that all all_lora calls are successful + assert all(add_lora_results) time_with_add_lora = await requests_processing_time( llm, warmup_run_requests) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py new file mode 100644 index 000000000000..c388db7ce0fe --- /dev/null +++ b/tests/lora/test_lora_functions.py @@ -0,0 +1,138 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Script to test add_lora, remove_lora, pin_lora, list_loras functions. +""" + +from pathlib import Path +import pytest +from typing import List +import os + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.llm import LLM +from vllm.lora.request import LoRARequest + +from huggingface_hub import snapshot_download + +MODEL_PATH = "meta-llama/Llama-2-7b-hf" +LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" +LORA_RANK = 8 + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + +def make_lora_request(lora_id: int): + return LoRARequest(lora_name=f"{lora_id}", + lora_int_id=lora_id, + lora_path=LORA_MODULE_PATH) + + +def test_lora_functions_sync(): + + max_loras = 4 + # Create engine in eager-mode. Due to high max_loras, the CI can + # OOM during cuda-graph capture. + engine_args = EngineArgs( + model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, #avoid OOM + enforce_eager=True) + + llm = LLM.get_engine_class().from_engine_args(engine_args) + + def run_check(fn, args, expected: List): + fn(args) + assert set(llm.list_loras()) == set(expected) + + run_check(llm.add_lora, make_lora_request(1), [1]) + run_check(llm.add_lora, make_lora_request(2), [1, 2]) + + # Pin LoRA 1 and test that it is never removed on subsequent adds. 
+ run_check(llm.pin_lora, 1, [1, 2]) + run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) + run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4]) + run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4]) + run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4]) + run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7]) + run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7]) + run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7]) + run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10]) + + # Remove LoRA 1 and continue adding. + run_check(llm.remove_lora, 1, [8, 9, 10]) + run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11]) + run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11]) + run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11]) + + # Remove all LoRAs + run_check(llm.remove_lora, 13, [12, 10, 11]) + run_check(llm.remove_lora, 12, [10, 11]) + run_check(llm.remove_lora, 11, [10]) + run_check(llm.remove_lora, 10, []) + + +@pytest.mark.asyncio +async def test_lora_functions_async(): + + if os.getenv("VLLM_USE_V1") == "0": + pytest.skip( + reason=f"V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") + + # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` + # environment variable. reload vllm.enging.async_llm_engine as + # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the + # env var. + import importlib + + import vllm.engine.async_llm_engine + importlib.reload(vllm.engine.async_llm_engine) + from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) + + max_loras = 4 + engine_args = AsyncEngineArgs( + model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, + enforce_eager=True) + + async def run_check(fn, args, expected: List): + await fn(args) + assert set(await llm.list_loras()) == set(expected) + + async with build_async_engine_client_from_engine_args(engine_args) as llm: + await run_check(llm.add_lora, make_lora_request(1), [1]) + await run_check(llm.add_lora, make_lora_request(2), [1, 2]) + + # Pin LoRA 1 and test that it is never removed on subsequent adds. + await run_check(llm.pin_lora, 1, [1, 2]) + await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) + await run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4]) + await run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4]) + await run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4]) + await run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7]) + await run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7]) + await run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7]) + await run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10]) + + # Remove LoRA 1 and continue adding. 
+ await run_check(llm.remove_lora, 1, [8, 9, 10]) + await run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11]) + await run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11]) + await run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11]) + + # Remove all LoRAs + await run_check(llm.remove_lora, 13, [12, 10, 11]) + await run_check(llm.remove_lora, 12, [10, 11]) + await run_check(llm.remove_lora, 11, [10]) + await run_check(llm.remove_lora, 10, []) \ No newline at end of file diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 670454c283da..4253497b5100 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import AsyncGenerator, List, Mapping, Optional, Type, Union +from typing import AsyncGenerator, List, Mapping, Optional, Type, Union, Set import numpy as np @@ -367,9 +367,21 @@ async def sleep(self, level: int = 1) -> None: async def wake_up(self) -> None: await self.engine_core.wake_up_async() - async def add_lora(self, lora_request: LoRARequest) -> None: + async def add_lora(self, lora_request: LoRARequest) -> bool: """Load a new LoRA adapter into the engine for future requests.""" - await self.engine_core.add_lora_async(lora_request) + return await self.engine_core.add_lora_async(lora_request) + + async def remove_lora(self, lora_id: int) -> bool: + """Remove an already loaded LoRA adapter.""" + return await self.engine_core.remove_lora_async(lora_id) + + async def list_loras(self) -> Set[int]: + """List all registered adapters.""" + return await self.engine_core.list_loras_async() + + async def pin_lora(self, lora_id: int) -> bool: + """Prevent an adapter from being evicted.""" + return await self.engine_core.pin_lora_async(lora_id) @property def is_running(self) -> bool: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 85c97293af8b..7533463d8ee9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -7,7 +7,7 @@ from concurrent.futures import Future from inspect import isclass, signature from multiprocessing.connection import Connection -from typing import Any, List, Optional, Tuple, Type +from typing import Any, List, Optional, Tuple, Type, Set import msgspec import psutil @@ -222,8 +222,17 @@ def wake_up(self): def execute_dummy_batch(self): self.model_executor.collective_rpc("execute_dummy_batch") - def add_lora(self, lora_request: LoRARequest) -> None: - self.model_executor.add_lora(lora_request) + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.model_executor.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.model_executor.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_executor.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + return self.model_executor.pin_lora(lora_id) class EngineCoreProc(EngineCore): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 527aa72833ba..9e0b3bd9a117 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -9,7 +9,7 @@ from abc import ABC, abstractmethod from concurrent.futures import Future from threading import Thread -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Dict, List, Optional, Set, Type, Union import zmq import zmq.asyncio @@ -96,7 +96,16 @@ async def execute_dummy_batch_async(self) -> None: def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError - def 
add_lora(self, lora_request: LoRARequest) -> None: + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + def list_loras(self) -> Set[int]: + raise NotImplementedError + + def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError async def get_output_async(self) -> EngineCoreOutputs: @@ -120,7 +129,16 @@ async def wake_up_async(self) -> None: async def abort_requests_async(self, request_ids: List[str]) -> None: raise NotImplementedError - async def add_lora_async(self, lora_request: LoRARequest) -> None: + async def add_lora_async(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + async def remove_lora_async(self, lora_id: int) -> bool: + raise NotImplementedError + + async def list_loras_async(self) -> Set[int]: + raise NotImplementedError + + async def pin_lora_async(self, lora_id: int) -> bool: raise NotImplementedError @@ -165,8 +183,17 @@ def wake_up(self) -> None: def execute_dummy_batch(self) -> None: self.engine_core.execute_dummy_batch() - def add_lora(self, lora_request: LoRARequest) -> None: - self.engine_core.add_lora(lora_request) + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.engine_core.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.engine_core.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.engine_core.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + return self.engine_core.pin_lora(lora_id) class MPClient(EngineCoreClient): @@ -331,8 +358,17 @@ def profile(self, is_start: bool = True) -> None: def reset_prefix_cache(self) -> None: self._call_utility("reset_prefix_cache") - def add_lora(self, lora_request: LoRARequest) -> None: - self._call_utility("add_lora", lora_request) + def add_lora(self, lora_request: LoRARequest) -> bool: + return self._call_utility("add_lora", lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self._call_utility("remove_lora", lora_id) + + def list_loras(self) -> Set[int]: + return self._call_utility("list_loras") + + def pin_lora(self, lora_id: int) -> bool: + return self._call_utility("pin_lora", lora_id) def sleep(self, level: int = 1) -> None: self._call_utility("sleep", level) @@ -429,5 +465,14 @@ async def wake_up_async(self) -> None: async def execute_dummy_batch_async(self) -> None: await self._call_utility_async("execute_dummy_batch") - async def add_lora_async(self, lora_request: LoRARequest) -> None: - await self._call_utility_async("add_lora", lora_request) + async def add_lora_async(self, lora_request: LoRARequest) -> bool: + return await self._call_utility_async("add_lora", lora_request) + + async def remove_lora_async(self, lora_id: int) -> bool: + return await self._call_utility_async("remove_lora", lora_id) + + async def list_loras_async(self) -> Set[int]: + return await self._call_utility_async("list_loras") + + async def pin_lora_async(self, lora_id: int) -> bool: + return await self._call_utility_async("pin_lora", lora_id) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 33b1ddc0f6fe..67c3dc5f1a20 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Mapping, Optional, Type, Union +from typing import Dict, List, Mapping, Optional, Type, Union, Set from typing_extensions import TypeVar @@ -217,3 +217,19 @@ def get_tokenizer_group( 
f"found type: {type(tokenizer_group)}") return tokenizer_group + + def add_lora(self, lora_request: LoRARequest) -> bool: + """Load a new LoRA adapter into the engine for future requests.""" + return self.engine_core.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + """Remove an already loaded LoRA adapter.""" + return self.engine_core.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + """List all registered adapters.""" + return self.engine_core.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + """Prevent an adapter from being evicted.""" + return self.engine_core.pin_lora(lora_id) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d9a415aee528..fcdad577348a 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -2,7 +2,7 @@ """A GPU worker class.""" import gc import os -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional, Set import torch import torch.distributed @@ -240,6 +240,15 @@ def execute_dummy_batch(self) -> None: def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) + def remove_lora(self, lora_id: int) -> bool: + return self.model_runner.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_runner.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) + def check_health(self) -> None: # worker will always be healthy as long as it's running. return diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 053897da0aa7..731e758e6e74 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -131,4 +131,19 @@ def maybe_profile_with_lora(self, lora_config: LoRAConfig, def add_lora(self, lora_request: LoRARequest) -> bool: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) \ No newline at end of file + return self.lora_manager.add_adapter(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_adapter(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_adapter(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_adapters() \ No newline at end of file From b8a406e4474b76408709c17a13735fba4dc2fdd9 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sun, 23 Feb 2025 11:05:06 -0500 Subject: [PATCH 2/4] fix comments Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_lora_functions.py | 49 +++++++++++++++---------------- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- 4 files changed, 27 insertions(+), 28 deletions(-) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index c388db7ce0fe..1309848868b4 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -3,21 +3,20 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions. 
""" -from pathlib import Path -import pytest -from typing import List import os +from typing import List + +import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.llm import LLM from vllm.lora.request import LoRARequest -from huggingface_hub import snapshot_download - MODEL_PATH = "meta-llama/Llama-2-7b-hf" LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" LORA_RANK = 8 + @pytest.fixture(autouse=True) def v1(run_with_both_engines_lora): # Simple autouse wrapper to run both engines for each test @@ -25,6 +24,7 @@ def v1(run_with_both_engines_lora): # test in a package pass + def make_lora_request(lora_id: int): return LoRARequest(lora_name=f"{lora_id}", lora_int_id=lora_id, @@ -36,14 +36,13 @@ def test_lora_functions_sync(): max_loras = 4 # Create engine in eager-mode. Due to high max_loras, the CI can # OOM during cuda-graph capture. - engine_args = EngineArgs( - model=MODEL_PATH, - enable_lora=True, - max_loras=max_loras, - max_lora_rank=LORA_RANK, - max_model_len=128, - gpu_memory_utilization=0.8, #avoid OOM - enforce_eager=True) + engine_args = EngineArgs(model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, + enforce_eager=True) llm = LLM.get_engine_class().from_engine_args(engine_args) @@ -56,7 +55,7 @@ def run_check(fn, args, expected: List): # Pin LoRA 1 and test that it is never removed on subsequent adds. run_check(llm.pin_lora, 1, [1, 2]) - run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) + run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4]) run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4]) run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4]) @@ -83,7 +82,8 @@ async def test_lora_functions_async(): if os.getenv("VLLM_USE_V1") == "0": pytest.skip( - reason=f"V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") + reason= + "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` # environment variable. reload vllm.enging.async_llm_engine as @@ -97,14 +97,13 @@ async def test_lora_functions_async(): build_async_engine_client_from_engine_args) max_loras = 4 - engine_args = AsyncEngineArgs( - model=MODEL_PATH, - enable_lora=True, - max_loras=max_loras, - max_lora_rank=LORA_RANK, - max_model_len=128, - gpu_memory_utilization=0.8, - enforce_eager=True) + engine_args = AsyncEngineArgs(model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, + enforce_eager=True) async def run_check(fn, args, expected: List): await fn(args) @@ -116,7 +115,7 @@ async def run_check(fn, args, expected: List): # Pin LoRA 1 and test that it is never removed on subsequent adds. 
await run_check(llm.pin_lora, 1, [1, 2]) - await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) + await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) await run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4]) await run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4]) await run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4]) @@ -135,4 +134,4 @@ async def run_check(fn, args, expected: List): await run_check(llm.remove_lora, 13, [12, 10, 11]) await run_check(llm.remove_lora, 12, [10, 11]) await run_check(llm.remove_lora, 11, [10]) - await run_check(llm.remove_lora, 10, []) \ No newline at end of file + await run_check(llm.remove_lora, 10, []) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4253497b5100..6577a21c15ee 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import AsyncGenerator, List, Mapping, Optional, Type, Union, Set +from typing import AsyncGenerator, List, Mapping, Optional, Set, Type, Union import numpy as np diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 7533463d8ee9..041896f1c7cc 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -7,7 +7,7 @@ from concurrent.futures import Future from inspect import isclass, signature from multiprocessing.connection import Connection -from typing import Any, List, Optional, Tuple, Type, Set +from typing import Any, List, Optional, Set, Tuple, Type import msgspec import psutil diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 67c3dc5f1a20..0812dcd65a27 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Mapping, Optional, Type, Union, Set +from typing import Dict, List, Mapping, Optional, Set, Type, Union from typing_extensions import TypeVar From c3102c4b7c9017283839e7f25a2ccc19d74d1df8 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sun, 23 Feb 2025 13:00:02 -0500 Subject: [PATCH 3/4] remove whitespace Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_add_lora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index a31e309213d5..788717804987 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -145,7 +145,7 @@ async def test_add_lora(): # Run with warmup add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests] - add_lora_results = await asyncio.gather(*add_lora_tasks) + add_lora_results = await asyncio.gather(*add_lora_tasks) # Test that all all_lora calls are successful assert all(add_lora_results) time_with_add_lora = await requests_processing_time( From a020418d73dcd54885c8f38660c8e2d4c8c528c5 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 25 Feb 2025 01:02:29 -0500 Subject: [PATCH 4/4] fix tests Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_add_lora.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 788717804987..70b058b201d6 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -7,6 +7,7 @@ import pytest from huggingface_hub import snapshot_download +import vllm.envs as env from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs import TextPrompt from vllm.lora.request import LoRARequest @@ -146,8 +147,12 @@ async def 
test_add_lora():
     # Run with warmup
     add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
     add_lora_results = await asyncio.gather(*add_lora_tasks)
-    # Test that all all_lora calls are successful
-    assert all(add_lora_results)
+    if env.VLLM_USE_V1:
+        # Test that all add_lora calls are successful.
+        assert all(add_lora_results)
+    else:
+        # No way to check V0 engine results as the calls just return None.
+        pass
     time_with_add_lora = await requests_processing_time(
         llm, warmup_run_requests)
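
Usage sketch (illustrative; not part of the diffs above): a minimal example of driving the LoRA management API introduced by this series through the synchronous engine, mirroring tests/lora/test_lora_functions.py. The model and adapter IDs are the same Hugging Face test artifacts used in the tests and are assumed to be reachable; return values follow the bool/Set[int] signatures added here.

    from vllm.engine.arg_utils import EngineArgs
    from vllm.entrypoints.llm import LLM
    from vllm.lora.request import LoRARequest

    # Same test model/adapter as tests/lora/test_lora_functions.py (assumed available).
    engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf",
                             enable_lora=True,
                             max_loras=4,
                             max_lora_rank=8,
                             max_model_len=128,
                             enforce_eager=True)
    llm = LLM.get_engine_class().from_engine_args(engine_args)

    # add_lora returns True on success; list_loras reports the resident adapter IDs.
    assert llm.add_lora(LoRARequest(lora_name="1", lora_int_id=1,
                                    lora_path="yard1/llama-2-7b-sql-lora-test"))
    assert set(llm.list_loras()) == {1}

    llm.pin_lora(1)     # protect adapter 1 from LRU eviction on later adds
    llm.remove_lora(1)  # explicitly unload it again
    assert set(llm.list_loras()) == set()

The AsyncLLM client exposes the same four operations as coroutines (add_lora, remove_lora, list_loras, pin_lora), backed by the add_lora_async/remove_lora_async/list_loras_async/pin_lora_async utility calls on the engine-core client.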