32 commits
5dee54d
Add support for encoder embedding models
maxdebayser Jun 23, 2025
7eb9d28
Fix CUDA graphs for BERT models
maxdebayser Jul 1, 2025
67691e0
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 1, 2025
d3099a9
Fix cuda graph initialization of token type ids
maxdebayser Jul 1, 2025
613ff3b
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 2, 2025
20c41e4
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 2, 2025
ba86026
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 8, 2025
b4f5ead
Fix missing args
maxdebayser Jul 9, 2025
c4060d1
relax assertion
maxdebayser Jul 9, 2025
01d2a65
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 9, 2025
80930d8
fix missing arg
maxdebayser Jul 9, 2025
d881f0a
fix missing arg
maxdebayser Jul 10, 2025
90a25d0
remove model from unsupported list
maxdebayser Jul 10, 2025
6686550
fix missing arg
maxdebayser Jul 10, 2025
cc76777
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 10, 2025
136c9b3
fix tests
maxdebayser Jul 10, 2025
b232491
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 14, 2025
cf5e6b8
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 16, 2025
e19c738
fix tests
maxdebayser Jul 16, 2025
e255f30
fix tests
maxdebayser Jul 16, 2025
ee5950c
add missing arg
maxdebayser Jul 16, 2025
78a2e57
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 16, 2025
a5cfc84
add missing arg
maxdebayser Jul 16, 2025
63fd783
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 16, 2025
f58692c
Merge branch 'main' into v1_embeddings_full
maxdebayser Jul 20, 2025
eea55fb
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 25, 2025
f2d8e18
Merge branch 'v1_embeddings_full' of github.com:maxdebayser/vllm into…
maxdebayser Jul 25, 2025
12ae080
revert attn changes to simplify merge
maxdebayser Jul 28, 2025
f29da32
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 28, 2025
f0c67f6
fix case of models without tokenizer
maxdebayser Jul 28, 2025
b62a51a
Merge branch 'upstream_main' into v1_embeddings_full
maxdebayser Jul 29, 2025
cc970ab
simplify score code
maxdebayser Jul 29, 2025
9 changes: 9 additions & 0 deletions tests/models/language/pooling/test_scoring.py
@@ -23,6 +23,15 @@
"The capital of Germany is Berlin.",
]


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass


DTYPE = "half"


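Note: the autouse fixture above is self-contained in test_scoring.py for now; as its comment says, it could be promoted to a package-level conftest.py. A possible sketch of that promotion (file location assumed; `run_with_both_engines` is assumed to come from vLLM's shared test fixtures):

```python
# tests/models/language/pooling/conftest.py  (hypothetical location)
import pytest


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Re-runs every test in this package against both the V0 and V1 engines.
    # `run_with_both_engines` is assumed to be provided by vLLM's shared test
    # fixtures; this wrapper only makes it autouse for the whole package.
    pass
```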
21 changes: 11 additions & 10 deletions tests/tokenization/test_detokenize.py
@@ -61,16 +61,17 @@ def _run_incremental_decode(tokenizer,
skip_special_tokens=skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
)
request = EngineCoreRequest("",
prompt_token_ids,
None,
None,
None,
params,
None,
None,
0.0,
None,
request = EngineCoreRequest(request_id="",
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
sampling_params=params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None)

1 change: 1 addition & 0 deletions tests/v1/core/test_kv_cache_utils.py
@@ -40,6 +40,7 @@ def make_request(request_id,
return Request(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
1 change: 1 addition & 0 deletions tests/v1/core/test_prefix_caching.py
@@ -35,6 +35,7 @@ def make_request(request_id,
return Request(
request_id=request_id,
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
2 changes: 2 additions & 0 deletions tests/v1/core/test_scheduler.py
@@ -1330,6 +1330,7 @@ def create_requests_with_priority(
request = Request(
request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
token_type_ids=None,
sampling_params=sampling_params,
pooling_params=None,
multi_modal_inputs=mm_inputs,
@@ -1816,6 +1817,7 @@ def test_schedule_skip_tokenizer_init_structured_output_request():
request = Request(
request_id="0",
prompt_token_ids=[0, 1],
token_type_ids=None,
multi_modal_inputs=None,
multi_modal_hashes=None,
multi_modal_placeholders=None,
1 change: 1 addition & 0 deletions tests/v1/core/utils.py
@@ -138,6 +138,7 @@ def create_requests(
request = Request(
request_id=f"{i}",
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
sampling_params=sampling_params,
pooling_params=None,
multi_modal_inputs=mm_inputs,
1 change: 1 addition & 0 deletions tests/v1/engine/test_engine_core.py
@@ -35,6 +35,7 @@ def make_request() -> EngineCoreRequest:
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
prompt_token_ids=PROMPT_TOKENS,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
1 change: 1 addition & 0 deletions tests/v1/engine/test_engine_core_client.py
@@ -51,6 +51,7 @@ def make_request(
return EngineCoreRequest(
request_id=str(uuid.uuid4()),
prompt_token_ids=prompt_tokens_ids,
token_type_ids=None,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
1 change: 1 addition & 0 deletions tests/v1/engine/test_fast_incdec_prefix_err.py
@@ -31,6 +31,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
None,
None,
None,
None,
params,
None,
None,
5 changes: 5 additions & 0 deletions tests/v1/engine/test_output_processor.py
@@ -52,6 +52,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
requests = [
EngineCoreRequest(request_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -401,6 +402,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
requests = [
EngineCoreRequest(request_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -566,6 +568,7 @@ def test_stop_token(include_stop_str_in_output: bool,
request = EngineCoreRequest(
request_id=request_id,
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -665,6 +668,7 @@ def test_stop_string(include_stop_str_in_output: bool,
EngineCoreRequest(
request_id=request_id_list[idx],
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
@@ -781,6 +785,7 @@ def test_iteration_stats(dummy_test_vectors):
EngineCoreRequest(
request_id=f"request-{idx}",
prompt_token_ids=prompt_tokens,
token_type_ids=None,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
1 change: 1 addition & 0 deletions tests/v1/kv_connector/unit/utils.py
@@ -152,6 +152,7 @@ def create_request(
req = Request(
request_id=f"id-{request_id}",
prompt_token_ids=prompt_token_ids,
token_type_ids=None,
sampling_params=sampling_params,
pooling_params=None,
multi_modal_inputs=None,
1 change: 1 addition & 0 deletions tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -64,6 +64,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
NewRequestData(
req_id=req_id,
prompt_token_ids=[1, 2, 3],
token_type_ids=None,
mm_inputs=[],
mm_hashes=[],
mm_positions=[],
4 changes: 4 additions & 0 deletions tests/v1/worker/test_gpu_input_batch.py
@@ -194,13 +194,17 @@ def _construct_cached_request_state(req_id_suffix: int):
np.random.randint(0, VOCAB_SIZE)
for _ in range(np.random.randint(0, MAX_PROMPT_SIZE))
]
token_type_ids = [
np.random.randint(0, 2) for _ in range(len(prompt_token_ids))
]
output_token_ids = [
np.random.randint(0, VOCAB_SIZE)
for _ in range(np.random.randint(0, NUM_OUTPUT_TOKENS))
]
return CachedRequestState(
req_id=f"req_id_{req_id_suffix}",
prompt_token_ids=prompt_token_ids,
token_type_ids=token_type_ids,
sampling_params=_create_sampling_params(),
pooling_params=None,
mm_inputs=[],
1 change: 1 addition & 0 deletions tests/v1/worker/test_gpu_model_runner.py
@@ -120,6 +120,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
NewRequestData(
req_id=req_id,
prompt_token_ids=[1, 2, 3],
token_type_ids=None,
mm_inputs=[],
mm_hashes=[],
mm_positions=[],
38 changes: 11 additions & 27 deletions vllm/entrypoints/llm.py
@@ -1269,34 +1269,18 @@ def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if model_config.is_multimodal_model:
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)
model_config = self.llm_engine.model_config

parsed_prompts.append(engine_prompt)
else:
for q, t in input_pairs:
if model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(
text=q, # type: ignore[arg-type]
text_pair=t, # type: ignore[arg-type]
**tokenization_kwargs)
else:
# `llm as reranker` models defaults to not using pad_token.
prompt_inputs = tokenizer(
text=q + t, # type: ignore[operator]
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
)

parsed_prompts.append(engine_prompt)

self._validate_and_add_requests(
prompts=parsed_prompts,
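Note: with the branch above removed, `LLM._cross_encoding_score` builds every input pair through `get_score_prompt`, regardless of whether the model is multimodal. A usage sketch (model name assumed, not taken from this diff):

```python
# Scoring a query against candidate documents with a cross-encoder.
from vllm import LLM

llm = LLM(model="cross-encoder/ms-marco-MiniLM-L-6-v2", task="score")  # assumed example model
outputs = llm.score(
    "What is the capital of France?",
    ["The capital of France is Paris.",
     "The capital of Germany is Berlin."],
)
for output in outputs:
    print(output.outputs.score)
```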
63 changes: 13 additions & 50 deletions vllm/entrypoints/openai/serving_score.py
@@ -188,56 +188,19 @@ async def _cross_encoding_score(

input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]

if self.model_config.is_multimodal_model:

preprocess_async = make_async(self._preprocess_score,
executor=self._tokenizer_executor)

preprocessed_prompts = await asyncio.gather(
*(preprocess_async(request=request,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
data_1=t1,
data_2=t2) for t1, t2 in input_pairs))

for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt)

else:
tokenize_async = make_async(tokenizer.__call__,
executor=self._tokenizer_executor)
use_pad_token = self.model_config.use_pad_token

if use_pad_token:
# cross_encoder models defaults to using pad_token.
tokenized_prompts = await asyncio.gather(*(
tokenize_async(
text=t1, # type: ignore[arg-type]
text_pair=t2, # type: ignore[arg-type]
**tokenization_kwargs) for t1, t2 in input_pairs))
else:
# `llm as reranker` models defaults to not using pad_token.
tokenized_prompts = await asyncio.gather(*(
tokenize_async(
text=t1 + # type: ignore[operator]
t2,
**tokenization_kwargs) for t1, t2 in input_pairs))

for prompt_inputs, (t1, t2) in zip(tokenized_prompts, input_pairs):
sep_token = tokenizer.sep_token if (tokenizer.sep_token
and use_pad_token) else ''
request_prompt = f"{t1}{sep_token}{t2}"

input_ids = prompt_inputs["input_ids"]
text_token_prompt = \
self._validate_input(request, input_ids, request_prompt)
engine_prompt = TokensPrompt(
prompt_token_ids=text_token_prompt["prompt_token_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))

request_prompts.append(request_prompt)
engine_prompts.append(engine_prompt)
preprocess_async = make_async(self._preprocess_score,
executor=self._tokenizer_executor)

preprocessed_prompts = await asyncio.gather(
*(preprocess_async(request=request,
tokenizer=tokenizer,
tokenization_kwargs=tokenization_kwargs,
data_1=t1,
data_2=t2) for t1, t2 in input_pairs))

for full_prompt, engine_prompt in preprocessed_prompts:
request_prompts.append(full_prompt)
engine_prompts.append(engine_prompt)

# Schedule the request and get the result generator.
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
21 changes: 18 additions & 3 deletions vllm/entrypoints/score_utils.py
@@ -184,13 +184,28 @@ def get_score_prompt(
model_config,
tokenizer,
)
from vllm.model_executor.model_loader import get_model_cls

full_prompt = apply_score_template(model_config, prompt_1, prompt_2)

prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
model = get_model_cls(model_config)
if supports_score_template(model):
full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
elif model_config.use_pad_token:
# cross_encoder models defaults to using pad_token.
prompt_inputs = tokenizer(text=prompt_1,
text_pair=prompt_2,
**tokenization_kwargs)
full_prompt = tokenizer.decode(prompt_inputs["input_ids"])
else:
# `llm as reranker` models defaults to not using pad_token.
full_prompt = prompt_1 + prompt_2
prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs)

engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["input_ids"])

if (token_type_ids := prompt_inputs.get("token_type_ids")) is not None:
engine_prompt["token_type_ids"] = token_type_ids

post_process_tokens(model_config, engine_prompt)

if mm_data is not None:
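Note: `get_score_prompt` now owns all three tokenization paths (score template, pad-token cross-encoders, and pad-token-free "LLM as reranker" models), which is what lets both entrypoints above call it uniformly. A minimal calling sketch, assuming a loaded model config and tokenizer:

```python
from vllm.entrypoints.score_utils import get_score_prompt

full_prompt, engine_prompt = get_score_prompt(
    model_config=model_config,      # assumed: the ModelConfig of a loaded model
    data_1="What is the capital of France?",
    data_2="The capital of France is Paris.",
    tokenizer=tokenizer,            # assumed: the model's tokenizer
    tokenization_kwargs={},
)
# engine_prompt is a TokensPrompt; for pad-token cross-encoders it also
# carries "token_type_ids" so the engine can rebuild the segment ids.
```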
6 changes: 3 additions & 3 deletions vllm/model_executor/models/bert.py
@@ -28,7 +28,7 @@
from vllm.sequence import IntermediateTensors
from vllm.tasks import PoolingTask

from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only
from .interfaces import SupportsCrossEncoding, SupportsQuant
from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix


@@ -508,8 +508,8 @@ def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler:
})


class BertForSequenceClassification(nn.Module, SupportsV0Only,
SupportsCrossEncoding, SupportsQuant):
class BertForSequenceClassification(nn.Module, SupportsCrossEncoding,
SupportsQuant):
"""A model that uses Bert to provide embedding functionalities.

This class encapsulates the BertModel and provides an interface for
5 changes: 2 additions & 3 deletions vllm/model_executor/models/roberta.py
@@ -20,7 +20,7 @@
from vllm.sequence import IntermediateTensors

from .bert_with_rope import BertWithRope, JinaRobertaModel
from .interfaces import SupportsCrossEncoding, SupportsV0Only
from .interfaces import SupportsCrossEncoding


class RobertaEmbedding(nn.Module):
@@ -153,8 +153,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
return loader.load_weights(weights_list, mapper=mapper)


class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding,
SupportsV0Only):
class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding):
"""A model that uses Roberta to provide embedding functionalities.

This class encapsulates the BertModel and provides an interface for
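Note: dropping `SupportsV0Only` from `BertForSequenceClassification` and `RobertaForSequenceClassification` is what allows these cross-encoders to run on the V1 engine. A quick way to exercise that path (model name and env toggle assumed, not taken from this diff):

```python
import os

os.environ["VLLM_USE_V1"] = "1"  # opt in to the V1 engine explicitly

from vllm import LLM

llm = LLM(model="BAAI/bge-reranker-base", task="score")  # assumed RoBERTa-based reranker
print(llm.score("what is panda?",
                "The giant panda is a bear native to China.")[0].outputs.score)
```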
2 changes: 2 additions & 0 deletions vllm/v1/core/sched/output.py
@@ -24,6 +24,7 @@ class NewRequestData:

req_id: str
prompt_token_ids: list[int]
token_type_ids: Optional[list[int]]
mm_inputs: list[MultiModalKwargs]
mm_hashes: list[str]
mm_positions: list[PlaceholderRange]
@@ -42,6 +43,7 @@ def from_request(
return cls(
req_id=request.request_id,
prompt_token_ids=request.prompt_token_ids,
token_type_ids=request.token_type_ids,
mm_inputs=request.mm_inputs,
mm_hashes=request.mm_hashes,
mm_positions=request.mm_positions,
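Note: `NewRequestData` carries `token_type_ids` so that BERT-style segment ids survive scheduling and reach the model runner together with the prompt tokens. For a sentence pair they look like this (sketch using a Hugging Face tokenizer, assumed for illustration):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
enc = tok(text="What is the capital of France?",
          text_pair="The capital of France is Paris.")
print(enc["input_ids"])       # [CLS] query tokens [SEP] document tokens [SEP]
print(enc["token_type_ids"])  # 0s for the first segment, 1s for the second
```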