From 73ae9bf6c37341526e8faa7a55b3839acb354fe9 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 22 Feb 2025 08:34:53 -0500 Subject: [PATCH 1/4] add all lora functions Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_add_lora.py | 8 +- tests/lora/test_lora_functions.py | 138 ++++++++++++++++++++++ vllm/v1/engine/async_llm.py | 18 ++- vllm/v1/engine/core.py | 15 ++- vllm/v1/engine/core_client.py | 63 ++++++++-- vllm/v1/engine/llm_engine.py | 18 ++- vllm/v1/worker/gpu_worker.py | 11 +- vllm/v1/worker/lora_model_runner_mixin.py | 17 ++- 8 files changed, 266 insertions(+), 22 deletions(-) create mode 100644 tests/lora/test_lora_functions.py diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 2b421bfd9eb8..a31e309213d5 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -144,10 +144,10 @@ async def test_add_lora(): await requests_processing_time(llm, dummy_run_requests) # Run with warmup - for lr in warmup_run_requests: - await llm.add_lora(lr) - # Wait for the add_lora function to complete on the server side. - await asyncio.sleep(30) + add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests] + add_lora_results = await asyncio.gather(*add_lora_tasks) + # Test that all all_lora calls are successful + assert all(add_lora_results) time_with_add_lora = await requests_processing_time( llm, warmup_run_requests) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py new file mode 100644 index 000000000000..c388db7ce0fe --- /dev/null +++ b/tests/lora/test_lora_functions.py @@ -0,0 +1,138 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Script to test add_lora, remove_lora, pin_lora, list_loras functions. +""" + +from pathlib import Path +import pytest +from typing import List +import os + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.llm import LLM +from vllm.lora.request import LoRARequest + +from huggingface_hub import snapshot_download + +MODEL_PATH = "meta-llama/Llama-2-7b-hf" +LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" +LORA_RANK = 8 + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + +def make_lora_request(lora_id: int): + return LoRARequest(lora_name=f"{lora_id}", + lora_int_id=lora_id, + lora_path=LORA_MODULE_PATH) + + +def test_lora_functions_sync(): + + max_loras = 4 + # Create engine in eager-mode. Due to high max_loras, the CI can + # OOM during cuda-graph capture. + engine_args = EngineArgs( + model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, #avoid OOM + enforce_eager=True) + + llm = LLM.get_engine_class().from_engine_args(engine_args) + + def run_check(fn, args, expected: List): + fn(args) + assert set(llm.list_loras()) == set(expected) + + run_check(llm.add_lora, make_lora_request(1), [1]) + run_check(llm.add_lora, make_lora_request(2), [1, 2]) + + # Pin LoRA 1 and test that it is never removed on subsequent adds. 
+ run_check(llm.pin_lora, 1, [1, 2]) + run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) + run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4]) + run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4]) + run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4]) + run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7]) + run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7]) + run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7]) + run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10]) + + # Remove LoRA 1 and continue adding. + run_check(llm.remove_lora, 1, [8, 9, 10]) + run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11]) + run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11]) + run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11]) + + # Remove all LoRAs + run_check(llm.remove_lora, 13, [12, 10, 11]) + run_check(llm.remove_lora, 12, [10, 11]) + run_check(llm.remove_lora, 11, [10]) + run_check(llm.remove_lora, 10, []) + + +@pytest.mark.asyncio +async def test_lora_functions_async(): + + if os.getenv("VLLM_USE_V1") == "0": + pytest.skip( + reason=f"V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") + + # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` + # environment variable. reload vllm.enging.async_llm_engine as + # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the + # env var. + import importlib + + import vllm.engine.async_llm_engine + importlib.reload(vllm.engine.async_llm_engine) + from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) + + max_loras = 4 + engine_args = AsyncEngineArgs( + model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, + enforce_eager=True) + + async def run_check(fn, args, expected: List): + await fn(args) + assert set(await llm.list_loras()) == set(expected) + + async with build_async_engine_client_from_engine_args(engine_args) as llm: + await run_check(llm.add_lora, make_lora_request(1), [1]) + await run_check(llm.add_lora, make_lora_request(2), [1, 2]) + + # Pin LoRA 1 and test that it is never removed on subsequent adds. + await run_check(llm.pin_lora, 1, [1, 2]) + await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) + await run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4]) + await run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4]) + await run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4]) + await run_check(llm.add_lora, make_lora_request(7), [1, 5, 6, 7]) + await run_check(llm.add_lora, make_lora_request(8), [1, 8, 6, 7]) + await run_check(llm.add_lora, make_lora_request(9), [1, 8, 9, 7]) + await run_check(llm.add_lora, make_lora_request(10), [1, 8, 9, 10]) + + # Remove LoRA 1 and continue adding. 
+ await run_check(llm.remove_lora, 1, [8, 9, 10]) + await run_check(llm.add_lora, make_lora_request(11), [8, 9, 10, 11]) + await run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11]) + await run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11]) + + # Remove all LoRAs + await run_check(llm.remove_lora, 13, [12, 10, 11]) + await run_check(llm.remove_lora, 12, [10, 11]) + await run_check(llm.remove_lora, 11, [10]) + await run_check(llm.remove_lora, 10, []) \ No newline at end of file diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 670454c283da..4253497b5100 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import AsyncGenerator, List, Mapping, Optional, Type, Union +from typing import AsyncGenerator, List, Mapping, Optional, Type, Union, Set import numpy as np @@ -367,9 +367,21 @@ async def sleep(self, level: int = 1) -> None: async def wake_up(self) -> None: await self.engine_core.wake_up_async() - async def add_lora(self, lora_request: LoRARequest) -> None: + async def add_lora(self, lora_request: LoRARequest) -> bool: """Load a new LoRA adapter into the engine for future requests.""" - await self.engine_core.add_lora_async(lora_request) + return await self.engine_core.add_lora_async(lora_request) + + async def remove_lora(self, lora_id: int) -> bool: + """Remove an already loaded LoRA adapter.""" + return await self.engine_core.remove_lora_async(lora_id) + + async def list_loras(self) -> Set[int]: + """List all registered adapters.""" + return await self.engine_core.list_loras_async() + + async def pin_lora(self, lora_id: int) -> bool: + """Prevent an adapter from being evicted.""" + return await self.engine_core.pin_lora_async(lora_id) @property def is_running(self) -> bool: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 85c97293af8b..7533463d8ee9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -7,7 +7,7 @@ from concurrent.futures import Future from inspect import isclass, signature from multiprocessing.connection import Connection -from typing import Any, List, Optional, Tuple, Type +from typing import Any, List, Optional, Tuple, Type, Set import msgspec import psutil @@ -222,8 +222,17 @@ def wake_up(self): def execute_dummy_batch(self): self.model_executor.collective_rpc("execute_dummy_batch") - def add_lora(self, lora_request: LoRARequest) -> None: - self.model_executor.add_lora(lora_request) + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.model_executor.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.model_executor.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_executor.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + return self.model_executor.pin_lora(lora_id) class EngineCoreProc(EngineCore): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 527aa72833ba..9e0b3bd9a117 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -9,7 +9,7 @@ from abc import ABC, abstractmethod from concurrent.futures import Future from threading import Thread -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Dict, List, Optional, Set, Type, Union import zmq import zmq.asyncio @@ -96,7 +96,16 @@ async def execute_dummy_batch_async(self) -> None: def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError - def 
add_lora(self, lora_request: LoRARequest) -> None: + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + def list_loras(self) -> Set[int]: + raise NotImplementedError + + def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError async def get_output_async(self) -> EngineCoreOutputs: @@ -120,7 +129,16 @@ async def wake_up_async(self) -> None: async def abort_requests_async(self, request_ids: List[str]) -> None: raise NotImplementedError - async def add_lora_async(self, lora_request: LoRARequest) -> None: + async def add_lora_async(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + async def remove_lora_async(self, lora_id: int) -> bool: + raise NotImplementedError + + async def list_loras_async(self) -> Set[int]: + raise NotImplementedError + + async def pin_lora_async(self, lora_id: int) -> bool: raise NotImplementedError @@ -165,8 +183,17 @@ def wake_up(self) -> None: def execute_dummy_batch(self) -> None: self.engine_core.execute_dummy_batch() - def add_lora(self, lora_request: LoRARequest) -> None: - self.engine_core.add_lora(lora_request) + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.engine_core.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.engine_core.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.engine_core.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + return self.engine_core.pin_lora(lora_id) class MPClient(EngineCoreClient): @@ -331,8 +358,17 @@ def profile(self, is_start: bool = True) -> None: def reset_prefix_cache(self) -> None: self._call_utility("reset_prefix_cache") - def add_lora(self, lora_request: LoRARequest) -> None: - self._call_utility("add_lora", lora_request) + def add_lora(self, lora_request: LoRARequest) -> bool: + return self._call_utility("add_lora", lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self._call_utility("remove_lora", lora_id) + + def list_loras(self) -> Set[int]: + return self._call_utility("list_loras") + + def pin_lora(self, lora_id: int) -> bool: + return self._call_utility("pin_lora", lora_id) def sleep(self, level: int = 1) -> None: self._call_utility("sleep", level) @@ -429,5 +465,14 @@ async def wake_up_async(self) -> None: async def execute_dummy_batch_async(self) -> None: await self._call_utility_async("execute_dummy_batch") - async def add_lora_async(self, lora_request: LoRARequest) -> None: - await self._call_utility_async("add_lora", lora_request) + async def add_lora_async(self, lora_request: LoRARequest) -> bool: + return await self._call_utility_async("add_lora", lora_request) + + async def remove_lora_async(self, lora_id: int) -> bool: + return await self._call_utility_async("remove_lora", lora_id) + + async def list_loras_async(self) -> Set[int]: + return await self._call_utility_async("list_loras") + + async def pin_lora_async(self, lora_id: int) -> bool: + return await self._call_utility_async("pin_lora", lora_id) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 33b1ddc0f6fe..67c3dc5f1a20 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Mapping, Optional, Type, Union +from typing import Dict, List, Mapping, Optional, Type, Union, Set from typing_extensions import TypeVar @@ -217,3 +217,19 @@ def get_tokenizer_group( 
f"found type: {type(tokenizer_group)}") return tokenizer_group + + def add_lora(self, lora_request: LoRARequest) -> bool: + """Load a new LoRA adapter into the engine for future requests.""" + return self.engine_core.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + """Remove an already loaded LoRA adapter.""" + return self.engine_core.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + """List all registered adapters.""" + return self.engine_core.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + """Prevent an adapter from being evicted.""" + return self.engine_core.pin_lora(lora_id) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d9a415aee528..fcdad577348a 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -2,7 +2,7 @@ """A GPU worker class.""" import gc import os -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Optional, Set import torch import torch.distributed @@ -240,6 +240,15 @@ def execute_dummy_batch(self) -> None: def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) + def remove_lora(self, lora_id: int) -> bool: + return self.model_runner.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_runner.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) + def check_health(self) -> None: # worker will always be healthy as long as it's running. return diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 053897da0aa7..731e758e6e74 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -131,4 +131,19 @@ def maybe_profile_with_lora(self, lora_config: LoRAConfig, def add_lora(self, lora_request: LoRARequest) -> bool: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) \ No newline at end of file + return self.lora_manager.add_adapter(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_adapter(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_adapter(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_adapters() \ No newline at end of file From b8a406e4474b76408709c17a13735fba4dc2fdd9 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sun, 23 Feb 2025 11:05:06 -0500 Subject: [PATCH 2/4] fix comments Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_lora_functions.py | 49 +++++++++++++++---------------- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- 4 files changed, 27 insertions(+), 28 deletions(-) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index c388db7ce0fe..1309848868b4 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -3,21 +3,20 @@ Script to test add_lora, remove_lora, pin_lora, list_loras functions. 
""" -from pathlib import Path -import pytest -from typing import List import os +from typing import List + +import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.llm import LLM from vllm.lora.request import LoRARequest -from huggingface_hub import snapshot_download - MODEL_PATH = "meta-llama/Llama-2-7b-hf" LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" LORA_RANK = 8 + @pytest.fixture(autouse=True) def v1(run_with_both_engines_lora): # Simple autouse wrapper to run both engines for each test @@ -25,6 +24,7 @@ def v1(run_with_both_engines_lora): # test in a package pass + def make_lora_request(lora_id: int): return LoRARequest(lora_name=f"{lora_id}", lora_int_id=lora_id, @@ -36,14 +36,13 @@ def test_lora_functions_sync(): max_loras = 4 # Create engine in eager-mode. Due to high max_loras, the CI can # OOM during cuda-graph capture. - engine_args = EngineArgs( - model=MODEL_PATH, - enable_lora=True, - max_loras=max_loras, - max_lora_rank=LORA_RANK, - max_model_len=128, - gpu_memory_utilization=0.8, #avoid OOM - enforce_eager=True) + engine_args = EngineArgs(model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, + enforce_eager=True) llm = LLM.get_engine_class().from_engine_args(engine_args) @@ -56,7 +55,7 @@ def run_check(fn, args, expected: List): # Pin LoRA 1 and test that it is never removed on subsequent adds. run_check(llm.pin_lora, 1, [1, 2]) - run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) + run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4]) run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4]) run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4]) @@ -83,7 +82,8 @@ async def test_lora_functions_async(): if os.getenv("VLLM_USE_V1") == "0": pytest.skip( - reason=f"V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") + reason= + "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` # environment variable. reload vllm.enging.async_llm_engine as @@ -97,14 +97,13 @@ async def test_lora_functions_async(): build_async_engine_client_from_engine_args) max_loras = 4 - engine_args = AsyncEngineArgs( - model=MODEL_PATH, - enable_lora=True, - max_loras=max_loras, - max_lora_rank=LORA_RANK, - max_model_len=128, - gpu_memory_utilization=0.8, - enforce_eager=True) + engine_args = AsyncEngineArgs(model=MODEL_PATH, + enable_lora=True, + max_loras=max_loras, + max_lora_rank=LORA_RANK, + max_model_len=128, + gpu_memory_utilization=0.8, + enforce_eager=True) async def run_check(fn, args, expected: List): await fn(args) @@ -116,7 +115,7 @@ async def run_check(fn, args, expected: List): # Pin LoRA 1 and test that it is never removed on subsequent adds. 
await run_check(llm.pin_lora, 1, [1, 2]) - await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) + await run_check(llm.add_lora, make_lora_request(3), [1, 2, 3]) await run_check(llm.add_lora, make_lora_request(4), [1, 2, 3, 4]) await run_check(llm.add_lora, make_lora_request(5), [1, 5, 3, 4]) await run_check(llm.add_lora, make_lora_request(6), [1, 5, 6, 4]) @@ -135,4 +134,4 @@ async def run_check(fn, args, expected: List): await run_check(llm.remove_lora, 13, [12, 10, 11]) await run_check(llm.remove_lora, 12, [10, 11]) await run_check(llm.remove_lora, 11, [10]) - await run_check(llm.remove_lora, 10, []) \ No newline at end of file + await run_check(llm.remove_lora, 10, []) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4253497b5100..6577a21c15ee 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import AsyncGenerator, List, Mapping, Optional, Type, Union, Set +from typing import AsyncGenerator, List, Mapping, Optional, Set, Type, Union import numpy as np diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 7533463d8ee9..041896f1c7cc 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -7,7 +7,7 @@ from concurrent.futures import Future from inspect import isclass, signature from multiprocessing.connection import Connection -from typing import Any, List, Optional, Tuple, Type, Set +from typing import Any, List, Optional, Set, Tuple, Type import msgspec import psutil diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 67c3dc5f1a20..0812dcd65a27 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Mapping, Optional, Type, Union, Set +from typing import Dict, List, Mapping, Optional, Set, Type, Union from typing_extensions import TypeVar From c3102c4b7c9017283839e7f25a2ccc19d74d1df8 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sun, 23 Feb 2025 13:00:02 -0500 Subject: [PATCH 3/4] remove whitespace Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_add_lora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index a31e309213d5..788717804987 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -145,7 +145,7 @@ async def test_add_lora(): # Run with warmup add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests] - add_lora_results = await asyncio.gather(*add_lora_tasks) + add_lora_results = await asyncio.gather(*add_lora_tasks) # Test that all all_lora calls are successful assert all(add_lora_results) time_with_add_lora = await requests_processing_time( From a020418d73dcd54885c8f38660c8e2d4c8c528c5 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 25 Feb 2025 01:02:29 -0500 Subject: [PATCH 4/4] fix tests Signed-off-by: Varun Sundar Rabindranath --- tests/lora/test_add_lora.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 788717804987..70b058b201d6 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -7,6 +7,7 @@ import pytest from huggingface_hub import snapshot_download +import vllm.envs as env from vllm.engine.arg_utils import AsyncEngineArgs from vllm.inputs import TextPrompt from vllm.lora.request import LoRARequest @@ -146,8 +147,12 @@ async def 
test_add_lora():
     # Run with warmup
     add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
     add_lora_results = await asyncio.gather(*add_lora_tasks)
-    # Test that all all_lora calls are successful
-    assert all(add_lora_results)
+    if env.VLLM_USE_V1:
+        # Test that all add_lora calls are successful.
+        assert all(add_lora_results)
+    else:
+        # No way to check V0 engine results as the calls just return None.
+        pass
     time_with_add_lora = await requests_processing_time(
         llm, warmup_run_requests)
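
Usage sketch (illustrative; not part of the diffs above): a minimal example of driving the LoRA management API introduced by this series through the synchronous engine, mirroring tests/lora/test_lora_functions.py. The model and adapter IDs are the same Hugging Face test artifacts used in the tests and are assumed to be reachable; return values follow the bool/Set[int] signatures added here.

    from vllm.engine.arg_utils import EngineArgs
    from vllm.entrypoints.llm import LLM
    from vllm.lora.request import LoRARequest

    # Same test model/adapter as tests/lora/test_lora_functions.py (assumed available).
    engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf",
                             enable_lora=True,
                             max_loras=4,
                             max_lora_rank=8,
                             max_model_len=128,
                             enforce_eager=True)
    llm = LLM.get_engine_class().from_engine_args(engine_args)

    # add_lora returns True on success; list_loras reports the resident adapter IDs.
    assert llm.add_lora(LoRARequest(lora_name="1", lora_int_id=1,
                                    lora_path="yard1/llama-2-7b-sql-lora-test"))
    assert set(llm.list_loras()) == {1}

    llm.pin_lora(1)     # protect adapter 1 from LRU eviction on later adds
    llm.remove_lora(1)  # explicitly unload it again
    assert set(llm.list_loras()) == set()

The AsyncLLM client exposes the same four operations as coroutines (add_lora, remove_lora, list_loras, pin_lora), backed by the add_lora_async/remove_lora_async/list_loras_async/pin_lora_async utility calls on the engine-core client.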