8 changes: 1 addition & 7 deletions tests/detokenizer/test_stop_checker.py
@@ -1,10 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from unittest.mock import MagicMock
-
 import pytest
-from transformers import PreTrainedTokenizer
 
 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.inputs import token_inputs
@@ -54,10 +51,7 @@ def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
     - When the EOS token should be ignored, and the sequence continues
     """
 
-    tokenizer = MagicMock(spec=PreTrainedTokenizer)
-    get_tokenizer_for_seq = MagicMock(return_value=tokenizer)
-    stop_checker = StopChecker(max_model_len=1024,
-                               get_tokenizer_for_seq=get_tokenizer_for_seq)
+    stop_checker = StopChecker(max_model_len=1024)
 
     seq = sequence_with_eos(
         text=text_wo_eos,
7 changes: 2 additions & 5 deletions tests/engine/test_stop_checker.py
@@ -58,16 +58,13 @@ def deepseek_r1_qwen_tokenizer():
 
 @pytest.fixture
 def stop_checker():
-    return StopChecker(max_model_len=10,
-                       get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer)
+    return StopChecker(max_model_len=10)
 
 
 @pytest.fixture
 def stop_checker_with_reasoner():
     reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer)
-    return StopChecker(max_model_len=10,
-                       get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer,
-                       reasoner=reasoner)
+    return StopChecker(max_model_len=10, reasoner=reasoner)
 
 
 def test_eos_token_stopping(stop_checker):
22 changes: 0 additions & 22 deletions tests/entrypoints/conftest.py
@@ -208,25 +208,3 @@ def zephyr_lora_files():
     """Download zephyr LoRA files once per test session."""
     from huggingface_hub import snapshot_download
     return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
-
-
-@pytest.fixture(scope="session")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    """Create zephyr LoRA files with added tokens once per test session."""
-    import shutil
-    from tempfile import TemporaryDirectory
-
-    from transformers import AutoTokenizer
-
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
9 changes: 2 additions & 7 deletions tests/entrypoints/openai/test_chat.py
@@ -29,11 +29,7 @@ def monkeypatch_module():
 
 
 @pytest.fixture(scope="module", params=[False, True])
-def server(
-        request,
-        monkeypatch_module,
-        zephyr_lora_files,  #noqa: F811
-        zephyr_lora_added_tokens_files):  # noqa: F811
+def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
 
     use_v1 = request.param
     monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
@@ -49,7 +45,6 @@ def server(
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -79,7 +74,7 @@ async def client(server):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [{
23 changes: 4 additions & 19 deletions tests/entrypoints/openai/test_completion.py
@@ -27,7 +27,7 @@
 
 
 @pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
+def default_server_args(zephyr_lora_files):
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -41,7 +41,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -87,7 +86,7 @@ async def client(server):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
@@ -115,20 +114,6 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     assert completion.choices[0].prompt_logprobs is None
 
 
-@pytest.mark.asyncio
-async def test_added_lora_tokens(client: openai.AsyncOpenAI):
-    # test using token IDs
-    completion = await client.completions.create(
-        model="zephyr-lora2",
-        prompt=[0, 0, 32000, 32001, 32002],
-        echo=True,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    # Added tokens should appear in tokenized prompt
-    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
-
-
 @pytest.mark.asyncio
 async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
     # test using token IDs
@@ -147,7 +132,7 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -713,7 +698,7 @@ async def test_guided_grammar(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 @pytest.mark.parametrize("logprobs_arg", [1, 0])
 async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
5 changes: 1 addition & 4 deletions (file path not captured)
@@ -21,10 +21,7 @@
 
 
 @pytest.fixture(scope="module")
-def default_server_args(
-    zephyr_lora_files,
-    zephyr_lora_added_tokens_files,
-) -> list[str]:
+def default_server_args() -> list[str]:
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
10 changes: 1 addition & 9 deletions tests/entrypoints/openai/test_lora_adapters.py
@@ -67,12 +67,6 @@ def server_with_lora_modules_json(request, monkeypatch_module,
         "base_model_name": MODEL_NAME
     }
 
-    lora_module_2 = {
-        "name": "zephyr-lora2",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -84,7 +78,6 @@ def server_with_lora_modules_json(request, monkeypatch_module,
         "--enable-lora",
         "--lora-modules",
         json.dumps(lora_module_1),
-        json.dumps(lora_module_2),
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -121,7 +114,6 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI,
                for lora_model in lora_models)
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"
 
 
 @pytest.mark.asyncio
@@ -209,7 +201,7 @@ async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
 @pytest.mark.asyncio
 async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
                                       zephyr_lora_files):
-    """Validate that many loras can be dynamically registered and inferenced
+    """Validate that many loras can be dynamically registered and inferenced
     with concurrently"""
 
     # This test file configures the server with --max-cpu-loras=2 and this test
2 changes: 0 additions & 2 deletions tests/entrypoints/openai/test_models.py
@@ -26,7 +26,6 @@ def server(zephyr_lora_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -56,4 +55,3 @@ async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
     assert all(lora_model.root == zephyr_lora_files
                for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"
28 changes: 10 additions & 18 deletions tests/entrypoints/openai/test_tokenization.py
@@ -14,7 +14,7 @@
 
 
 @pytest.fixture(scope="module")
-def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
+def server():
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -24,12 +24,6 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
         "--enforce-eager",
         "--max-num-seqs",
         "128",
-        # lora config
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
-        "--max-lora-rank",
-        "64",
         "--enable-tokenizer-info-endpoint",
     ]
 
@@ -38,10 +32,8 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
 
 
 @pytest.fixture(scope="module")
-def tokenizer_name(model_name: str,
-                   zephyr_lora_added_tokens_files: str):  # noqa: F811
-    return zephyr_lora_added_tokens_files if (
-        model_name == "zephyr-lora2") else model_name
+def tokenizer_name(model_name: str):
+    return model_name
 
 
 @pytest_asyncio.fixture
@@ -53,7 +45,7 @@ async def client(server):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_completions(
@@ -86,7 +78,7 @@ async def test_tokenize_completions(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_chat(
@@ -148,7 +140,7 @@ async def test_tokenize_chat(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_chat_with_tools(
@@ -225,7 +217,7 @@ async def test_tokenize_chat_with_tools(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name, tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_with_return_token_strs(
@@ -260,7 +252,7 @@ async def test_tokenize_with_return_token_strs(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_detokenize(
@@ -287,7 +279,7 @@ async def test_detokenize(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenizer_info_basic(
@@ -384,4 +376,4 @@ async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
     if chat_template:
         assert isinstance(chat_template,
                           str), ("Chat template should be a string")
-        assert chat_template.strip(), "Chat template should not be empty"
+        assert chat_template.strip(), "Chat template should not be empty"
2 changes: 2 additions & 0 deletions (file path not captured)
@@ -18,6 +18,8 @@
     "--enable-lora",
     "--lora-modules",
     f"{LORA_MODEL}={LORA_MODEL}",
+    "--tokenizer",
+    f"{LORA_MODEL}",
 ]
 
 TOOLS = [{