
Commit a065959

zhuohan123 authored and xuebwang-amd committed
[Core] Remove tokenizer group in vLLM (vllm-project#24078)
Signed-off-by: Zhuohan Li <[email protected]>
Signed-off-by: xuebwang-amd <[email protected]>
1 parent e339281 commit a065959


49 files changed: +276 -934 lines changed

tests/detokenizer/test_stop_checker.py

Lines changed: 1 addition & 7 deletions
@@ -1,10 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from unittest.mock import MagicMock
-
 import pytest
-from transformers import PreTrainedTokenizer

 from vllm.engine.output_processor.stop_checker import StopChecker
 from vllm.inputs import token_inputs
@@ -54,10 +51,7 @@ def test_stop_on_eos_token(text_wo_eos: str, eos_token: str, eos_token_id: int,
     - When the EOS token should be ignored, and the sequence continues
     """

-    tokenizer = MagicMock(spec=PreTrainedTokenizer)
-    get_tokenizer_for_seq = MagicMock(return_value=tokenizer)
-    stop_checker = StopChecker(max_model_len=1024,
-                               get_tokenizer_for_seq=get_tokenizer_for_seq)
+    stop_checker = StopChecker(max_model_len=1024)

     seq = sequence_with_eos(
         text=text_wo_eos,
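What this hunk amounts to for callers, as a minimal sketch (import path and constructor call taken from the diff above; the rest of the test is omitted): the mock tokenizer-group lookup is no longer needed, and StopChecker is built from the length bound alone.

from vllm.engine.output_processor.stop_checker import StopChecker

# Before this commit the test had to fabricate a per-sequence tokenizer lookup:
#   tokenizer = MagicMock(spec=PreTrainedTokenizer)
#   get_tokenizer_for_seq = MagicMock(return_value=tokenizer)
#   stop_checker = StopChecker(max_model_len=1024,
#                              get_tokenizer_for_seq=get_tokenizer_for_seq)
# With the tokenizer group removed, only the model-length bound remains:
stop_checker = StopChecker(max_model_len=1024)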

tests/engine/test_stop_checker.py

Lines changed: 2 additions & 5 deletions
@@ -58,16 +58,13 @@ def deepseek_r1_qwen_tokenizer():

 @pytest.fixture
 def stop_checker():
-    return StopChecker(max_model_len=10,
-                       get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer)
+    return StopChecker(max_model_len=10)


 @pytest.fixture
 def stop_checker_with_reasoner():
     reasoner = MockReasoningParser(deepseek_r1_qwen_tokenizer)
-    return StopChecker(max_model_len=10,
-                       get_tokenizer_for_seq=deepseek_r1_qwen_tokenizer,
-                       reasoner=reasoner)
+    return StopChecker(max_model_len=10, reasoner=reasoner)


 def test_eos_token_stopping(stop_checker):

tests/entrypoints/conftest.py

Lines changed: 0 additions & 22 deletions
@@ -208,25 +208,3 @@ def zephyr_lora_files():
     """Download zephyr LoRA files once per test session."""
     from huggingface_hub import snapshot_download
     return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
-
-
-@pytest.fixture(scope="session")
-def zephyr_lora_added_tokens_files(zephyr_lora_files):
-    """Create zephyr LoRA files with added tokens once per test session."""
-    import shutil
-    from tempfile import TemporaryDirectory
-
-    from transformers import AutoTokenizer
-
-    tmp_dir = TemporaryDirectory()
-    tmp_model_dir = f"{tmp_dir.name}/zephyr"
-    shutil.copytree(zephyr_lora_files, tmp_model_dir)
-    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
-    # Copy tokenizer to adapter and add some unique tokens
-    # 32000, 32001, 32002
-    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
-                                 special_tokens=True)
-    assert added == 3
-    tokenizer.save_pretrained(tmp_model_dir)
-    yield tmp_model_dir
-    tmp_dir.cleanup()
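For context on the rest of this commit: the deleted fixture is what produced the zephyr-lora2 adapter (a copy of the zephyr LoRA bundled with a tokenizer that has tokens 32000-32002 added), which the other test files stop referencing below. Reconstructed from the removed lines as a standalone helper; the function name here is illustrative, not part of vLLM.

import shutil
from tempfile import TemporaryDirectory

from transformers import AutoTokenizer


def build_added_tokens_adapter(zephyr_lora_files: str) -> TemporaryDirectory:
    """Copy the zephyr LoRA adapter and bundle a tokenizer with extra tokens."""
    tmp_dir = TemporaryDirectory()
    tmp_model_dir = f"{tmp_dir.name}/zephyr"
    shutil.copytree(zephyr_lora_files, tmp_model_dir)
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
    # The three added tokens land at ids 32000, 32001, 32002.
    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
                                 special_tokens=True)
    assert added == 3
    tokenizer.save_pretrained(tmp_model_dir)
    return tmp_dir  # the caller owns cleanup via tmp_dir.cleanup()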

tests/entrypoints/openai/test_chat.py

Lines changed: 2 additions & 7 deletions
@@ -29,11 +29,7 @@ def monkeypatch_module():


 @pytest.fixture(scope="module", params=[False, True])
-def server(
-        request,
-        monkeypatch_module,
-        zephyr_lora_files,  #noqa: F811
-        zephyr_lora_added_tokens_files):  # noqa: F811
+def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811

     use_v1 = request.param
     monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
@@ -49,7 +45,6 @@ def server(
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -79,7 +74,7 @@ async def client(server):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str):
     messages = [{

tests/entrypoints/openai/test_completion.py

Lines changed: 4 additions & 19 deletions
@@ -27,7 +27,7 @@


 @pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
+def default_server_args(zephyr_lora_files):
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -41,7 +41,6 @@ def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -87,7 +86,7 @@ async def client(server):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
@@ -115,20 +114,6 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     assert completion.choices[0].prompt_logprobs is None


-@pytest.mark.asyncio
-async def test_added_lora_tokens(client: openai.AsyncOpenAI):
-    # test using token IDs
-    completion = await client.completions.create(
-        model="zephyr-lora2",
-        prompt=[0, 0, 32000, 32001, 32002],
-        echo=True,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    # Added tokens should appear in tokenized prompt
-    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")
-
-
 @pytest.mark.asyncio
 async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
     # test using token IDs
@@ -147,7 +132,7 @@ async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -713,7 +698,7 @@ async def test_guided_grammar(client: openai.AsyncOpenAI,
 @pytest.mark.parametrize(
     # first test base model, then test loras
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora"],
 )
 @pytest.mark.parametrize("logprobs_arg", [1, 0])
 async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
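The deleted test_added_lora_tokens was the consumer of the removed conftest fixture on the completions API; its sibling test_added_lora_tokens_base_model (still present below the hunk) keeps exercising the same token ids against the base model. Reconstructed from the removed lines as a standalone coroutine for reference only, since the zephyr-lora2 adapter it targets no longer exists after this commit:

import openai


async def check_added_lora_tokens(client: openai.AsyncOpenAI) -> None:
    # Prompt is given as token ids; 32000-32002 are the tokens the removed
    # conftest fixture added to the zephyr-lora2 adapter's tokenizer.
    completion = await client.completions.create(
        model="zephyr-lora2",
        prompt=[0, 0, 32000, 32001, 32002],
        echo=True,
        max_tokens=5,
        temperature=0.0,
    )
    # With echo=True the detokenized prompt is returned first, so the added
    # tokens should appear verbatim at the start of the text.
    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")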

tests/entrypoints/openai/test_completion_with_prompt_embeds.py

Lines changed: 1 addition & 4 deletions
@@ -21,10 +21,7 @@


 @pytest.fixture(scope="module")
-def default_server_args(
-    zephyr_lora_files,
-    zephyr_lora_added_tokens_files,
-) -> list[str]:
+def default_server_args() -> list[str]:
     return [
         # use half precision for speed and memory savings in CI environment
         "--dtype",

tests/entrypoints/openai/test_lora_adapters.py

Lines changed: 1 addition & 9 deletions
@@ -67,12 +67,6 @@ def server_with_lora_modules_json(request, monkeypatch_module,
         "base_model_name": MODEL_NAME
     }

-    lora_module_2 = {
-        "name": "zephyr-lora2",
-        "path": zephyr_lora_files,
-        "base_model_name": MODEL_NAME
-    }
-
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -84,7 +78,6 @@ def server_with_lora_modules_json(request, monkeypatch_module,
         "--enable-lora",
         "--lora-modules",
         json.dumps(lora_module_1),
-        json.dumps(lora_module_2),
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -121,7 +114,6 @@ async def test_static_lora_lineage(client: openai.AsyncOpenAI,
                for lora_model in lora_models)
     assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"


 @pytest.mark.asyncio
@@ -209,7 +201,7 @@ async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
 @pytest.mark.asyncio
 async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
                                       zephyr_lora_files):
-    """Validate that many loras can be dynamically registered and inferenced 
+    """Validate that many loras can be dynamically registered and inferenced
     with concurrently"""

     # This test file configures the server with --max-cpu-loras=2 and this test
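As a usage note, --lora-modules in this file takes JSON objects rather than name=path pairs, and with lora_module_2 removed only one entry is passed. A minimal sketch of how that argument is assembled (field names as in the diff; the MODEL_NAME and zephyr_lora_files values shown here are assumed placeholders, and the real arg list also carries the dtype/eager flags omitted below):

import json

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"         # assumed module constant
zephyr_lora_files = "/path/to/zephyr-7b-beta-lora"  # supplied by the conftest fixture

lora_module_1 = {
    "name": "zephyr-lora",
    "path": zephyr_lora_files,
    "base_model_name": MODEL_NAME,
}

args = [
    "--enable-lora",
    "--lora-modules",
    json.dumps(lora_module_1),  # one JSON object per adapter
    "--max-lora-rank",
    "64",
    "--max-cpu-loras",
    "2",  # matches the --max-cpu-loras=2 noted in the comment above
]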

tests/entrypoints/openai/test_models.py

Lines changed: 0 additions & 2 deletions
@@ -26,7 +26,6 @@ def server(zephyr_lora_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
@@ -56,4 +55,3 @@ async def test_check_models(client: openai.AsyncOpenAI, zephyr_lora_files):
     assert all(lora_model.root == zephyr_lora_files
                for lora_model in lora_models)
     assert lora_models[0].id == "zephyr-lora"
-    assert lora_models[1].id == "zephyr-lora2"

tests/entrypoints/openai/test_tokenization.py

Lines changed: 10 additions & 18 deletions
@@ -14,7 +14,7 @@


 @pytest.fixture(scope="module")
-def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
+def server():
     args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
@@ -24,12 +24,6 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
         "--enforce-eager",
         "--max-num-seqs",
         "128",
-        # lora config
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
-        "--max-lora-rank",
-        "64",
         "--enable-tokenizer-info-endpoint",
     ]

@@ -38,10 +32,8 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811


 @pytest.fixture(scope="module")
-def tokenizer_name(model_name: str,
-                   zephyr_lora_added_tokens_files: str):  # noqa: F811
-    return zephyr_lora_added_tokens_files if (
-        model_name == "zephyr-lora2") else model_name
+def tokenizer_name(model_name: str):
+    return model_name


 @pytest_asyncio.fixture
@@ -53,7 +45,7 @@ async def client(server):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_completions(
@@ -86,7 +78,7 @@ async def test_tokenize_completions(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_chat(
@@ -148,7 +140,7 @@ async def test_tokenize_chat(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_chat_with_tools(
@@ -225,7 +217,7 @@ async def test_tokenize_chat_with_tools(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name, tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenize_with_return_token_strs(
@@ -260,7 +252,7 @@ async def test_tokenize_with_return_token_strs(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_detokenize(
@@ -287,7 +279,7 @@ async def test_detokenize(
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name,tokenizer_name",
-    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    [(MODEL_NAME, MODEL_NAME)],
     indirect=["tokenizer_name"],
 )
 async def test_tokenizer_info_basic(
@@ -384,4 +376,4 @@ async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
     if chat_template:
         assert isinstance(chat_template,
                           str), ("Chat template should be a string")
-        assert chat_template.strip(), "Chat template should not be empty"
+        assert chat_template.strip(), "Chat template should not be empty"
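With the zephyr-lora2 case gone, tokenizer_name reduces to an identity fixture and every parametrization collapses to the base model. For readers unfamiliar with the indirect= mechanism these tests lean on, here is a small self-contained sketch; fixture and test names are illustrative, not from the file, and the MODEL_NAME value is an assumption.

import pytest

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"  # assumed value of the module constant


@pytest.fixture
def tokenizer_name(request):
    # With indirect=["tokenizer_name"], the parametrized value is routed
    # through this fixture and arrives as request.param.
    return request.param


@pytest.mark.parametrize(
    "model_name,tokenizer_name",
    [(MODEL_NAME, MODEL_NAME)],
    indirect=["tokenizer_name"],
)
def test_names_match(model_name, tokenizer_name):
    # model_name is passed to the test directly; tokenizer_name went
    # through the fixture above.
    assert tokenizer_name == model_name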

tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,8 @@
     "--enable-lora",
     "--lora-modules",
     f"{LORA_MODEL}={LORA_MODEL}",
+    "--tokenizer",
+    f"{LORA_MODEL}",
 ]

 TOOLS = [{
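The only change to this file is additive: the hermes tool-parser server is now started with an explicit --tokenizer pointing at the LoRA model, presumably because per-adapter tokenizers are no longer resolved through a tokenizer group. A sketch of the resulting argument list; LORA_MODEL is defined earlier in the test module and is left as a placeholder here, and the list name is illustrative.

LORA_MODEL = "<lora-adapter-id>"  # placeholder; real value defined at the top of the file

ARGS = [
    "--enable-lora",
    "--lora-modules",
    f"{LORA_MODEL}={LORA_MODEL}",
    # New in this commit: load the tokenizer directly from the LoRA adapter.
    "--tokenizer",
    f"{LORA_MODEL}",
]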
