
Commit 9919b4f

Varun Sundar Rabindranath (varun-sundar-rabindranath) authored and committed
[Misc] Enable V1 LoRA by default (vllm-project#15320)
Signed-off-by: Varun Sundar Rabindranath <[email protected]>
Co-authored-by: Varun Sundar Rabindranath <[email protected]>
1 parent b20981d commit 9919b4f

File tree

12 files changed: 125 additions & 87 deletions

tests/entrypoints/openai/test_chat.py

Lines changed: 57 additions & 3 deletions
@@ -24,7 +24,23 @@


 @pytest.fixture(scope="module")
-def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
+def monkeypatch_module():
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module", params=[False, True])
+def server(
+        request,
+        monkeypatch_module,
+        zephyr_lora_files,  #noqa: F811
+        zephyr_lora_added_tokens_files):  # noqa: F811
+
+    use_v1 = request.param
+    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -49,6 +65,13 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
         yield remote_server


+@pytest.fixture
+def is_v1_server(server):
+    import os
+    assert os.environ['VLLM_USE_V1'] in ['0', '1']
+    return os.environ['VLLM_USE_V1'] == '1'
+
+
 @pytest_asyncio.fixture
 async def client(server):
     async with server.get_async_client() as async_client:
@@ -471,8 +494,13 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
+                                  is_v1_server: bool,
                                   guided_decoding_backend: str,
                                   sample_guided_choice):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -511,9 +539,13 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_guided_json_chat(client: openai.AsyncOpenAI,
+async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
                                 guided_decoding_backend: str,
                                 sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported in V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -559,7 +591,12 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_regex_chat(client: openai.AsyncOpenAI,
+                                 is_v1_server: bool,
                                  guided_decoding_backend: str, sample_regex):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -617,8 +654,13 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
+                                           is_v1_server: bool,
                                            guided_decoding_backend: str,
                                            sample_guided_choice):
+
+    if is_v1_server and guided_decoding_backend != 'xgrammar':
+        pytest.skip("Only xgrammar backend is supported with V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -648,9 +690,13 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
-async def test_named_tool_use(client: openai.AsyncOpenAI,
+async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
                               guided_decoding_backend: str,
                               sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported on V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -742,6 +788,10 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
                                                    sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported on V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"
@@ -787,6 +837,10 @@ async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
                                                   sample_json_schema):
+
+    if is_v1_server:
+        pytest.skip("sample_json_schema has features unsupported on V1")
+
     messages = [{
         "role": "system",
         "content": "you are a helpful assistant"

tests/lora/test_baichuan.py

Lines changed: 8 additions & 8 deletions
@@ -11,6 +11,14 @@
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -40,14 +48,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 def test_baichuan_lora(baichuan_lora_files):
     llm = vllm.LLM(MODEL_PATH,
                    max_model_len=1024,
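
The autouse `v1` fixture added above (and in the other LoRA test files below) requests a `run_with_both_engines_lora` fixture that is defined elsewhere in the test suite and is not shown in this diff. As a rough sketch only, and the real definition may differ, such a conftest-level fixture could look like this:

```python
# Hypothetical conftest.py sketch; not taken from this commit.
import pytest


@pytest.fixture(params=[True, False], ids=["v1-engine", "v0-engine"])
def run_with_both_engines_lora(request, monkeypatch):
    # Flip the engine selector before the test builds its vllm.LLM, so the
    # same test body exercises both the V0 and V1 code paths.
    monkeypatch.setenv("VLLM_USE_V1", "1" if request.param else "0")
    yield request.param
```

Requesting it from an autouse fixture, as the diff does, applies the parametrization to every test in the module without touching individual test signatures.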

tests/lora/test_chatglm3_tp.py

Lines changed: 8 additions & 8 deletions
@@ -18,6 +18,14 @@
 ]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(query="How many singers do we have?"),
@@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @create_new_process_for_each_test()
 def test_chatglm3_lora(chatglm3_lora_files):
     llm = vllm.LLM(MODEL_PATH,

tests/lora/test_gemma.py

Lines changed: 8 additions & 8 deletions
@@ -9,6 +9,14 @@
 MODEL_PATH = "google/gemma-7b"


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         "Quote: Imagination is",
@@ -31,14 +39,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 # The V1 lora test for this model requires more than 24GB.
 @pytest.mark.skip_v1
 @pytest.mark.xfail(current_platform.is_rocm(),

tests/lora/test_layers.py

Lines changed: 0 additions & 5 deletions
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0

-import importlib
 import random
 from copy import deepcopy
 from dataclasses import dataclass
@@ -82,10 +81,6 @@ def v1(run_with_both_engines_lora):
     # This can be promoted up to conftest.py to run for every
     # test in a package

-    # Reload punica_gpu as the kernels used are tied to engine type.
-    from vllm.lora.punica_wrapper import punica_gpu
-    importlib.reload(punica_gpu)
-
     # Release any memory we might be holding on to. CI runs OOMs otherwise.
     from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
                                                 _LORA_B_PTR_DICT)

tests/lora/test_llama_tp.py

Lines changed: 8 additions & 12 deletions
@@ -28,6 +28,14 @@
 ]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
@@ -71,16 +79,6 @@ def generate_and_test(llm, sql_lora_files):
     print("removing lora")


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
-# V1 Test: Failing due to numerics on V1.
-@pytest.mark.skip_v1
 @create_new_process_for_each_test()
 def test_llama_lora(sql_lora_files):

@@ -126,8 +124,6 @@ def get_num_gpu_blocks_no_lora():
         "less when using lora than when not using lora")


-# V1 Test: Failing due to numerics on V1.
-@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
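
This file also drops two `@pytest.mark.skip_v1` markers, re-enabling `test_llama_lora` and `test_llama_lora_tp4` on the V1 engine. The machinery that honors `skip_v1` lives outside this diff; purely as an assumed illustration, a conftest hook implementing such a marker could look roughly like this:

```python
# Assumed sketch of a collection hook for a custom `skip_v1` marker;
# the real vLLM implementation is not shown in this commit and may differ.
import os

import pytest


def pytest_collection_modifyitems(config, items):
    if os.environ.get("VLLM_USE_V1", "1") != "1":
        return  # V0 run: nothing to skip
    skip_v1 = pytest.mark.skip(reason="not yet supported on the V1 engine")
    for item in items:
        if "skip_v1" in item.keywords:
            item.add_marker(skip_v1)
```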

tests/lora/test_lora_manager.py

Lines changed: 11 additions & 3 deletions
@@ -7,7 +7,6 @@
 from safetensors.torch import load_file
 from torch import nn

-from vllm import envs
 from vllm.config import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
@@ -33,6 +32,17 @@
 ] if current_platform.is_cuda_alike() else ["cpu"])


+@pytest.fixture(scope="function", autouse=True)
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
+    """
+    Some tests depend on V0 internals. Since both V0 and V1 use the same
+    LoRAModelManager it is okay to just test V0.
+    """
+    with monkeypatch.context() as m:
+        m.setenv('VLLM_USE_V1', '0')
+        yield
+
+
 @pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
     tensors = load_file(
@@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
     assert manager.device == device


-@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
 @pytest.mark.parametrize("device", DEVICES)
 def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                           sql_lora_files, device):
@@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                           device)


-@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
 @pytest.mark.parametrize("device", DEVICES)
 def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
                                 sql_lora_files, device):
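
The new `use_v0_only` fixture pins these tests to V0 via `monkeypatch.context()`, which restores the original environment when each test finishes. A small standalone illustration of that mechanism follows (not part of this commit):

```python
# Standalone illustration of monkeypatch.context(); names are assumed.
import os

import pytest


def test_pinned_to_v0(monkeypatch: pytest.MonkeyPatch):
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "0")
        assert os.environ["VLLM_USE_V1"] == "0"
    # Leaving the context restores whatever value (or absence) was there before.
```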

tests/lora/test_phi.py

Lines changed: 8 additions & 8 deletions
@@ -10,6 +10,14 @@
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
@@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
 @pytest.mark.skip_v1

tests/lora/test_quant_model.py

Lines changed: 8 additions & 8 deletions
@@ -37,6 +37,14 @@ class ModelWithQuantization:
 ]


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM,
               lora_path: str,
               lora_id: int,
@@ -69,14 +77,6 @@ def format_prompt_tuples(prompt):
     return generated_texts


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines_lora):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", [1])
 def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
