Commit e8afd10

Merge branch 'main' into mm-fields
2 parents 4cac998 + 3f3e92e commit e8afd10

20 files changed: +293 −237 lines changed

Dockerfile.cpu

Lines changed: 3 additions & 3 deletions

@@ -26,20 +26,20 @@ RUN pip install intel_extension_for_pytorch==2.5.0

 WORKDIR /workspace

+COPY requirements-build.txt requirements-build.txt
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt

 FROM cpu-test-1 AS build

 WORKDIR /workspace/vllm

+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cpu.txt requirements-cpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
     pip install -v -r requirements-cpu.txt

 COPY . .

docs/source/models/supported_models.md

Lines changed: 12 additions & 10 deletions

@@ -28,7 +28,7 @@ llm = LLM(model=..., task="generate") # Name or path of your model
 output = llm.generate("Hello, my name is")
 print(output)

-# For pooling models (task={embed,classify,reward}) only
+# For pooling models (task={embed,classify,reward,score}) only
 llm = LLM(model=..., task="embed") # Name or path of your model
 output = llm.encode("Hello, my name is")
 print(output)
@@ -59,7 +59,7 @@ llm = LLM(model=..., revision=..., task=..., trust_remote_code=True)
 output = llm.generate("Hello, my name is")
 print(output)

-# For pooling models (task={embed,classify,reward}) only
+# For pooling models (task={embed,classify,reward,score}) only
 output = llm.encode("Hello, my name is")
 print(output)
 ```
@@ -369,14 +369,6 @@ you should explicitly specify the task type to ensure that the model is used in

 #### Text Embedding (`--task embed`)

-Any text generation model can be converted into an embedding model by passing {code}`--task embed`.
-
-```{note}
-To get the best results, you should use pooling models that are specifically trained as such.
-```
-
-The following table lists those that are tested in vLLM.
-
 ```{eval-rst}
 .. list-table::
   :widths: 25 25 50 5 5
@@ -437,6 +429,10 @@ On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`
 despite being described otherwise on its model card.
 ```

+If your model is not in the above list, we will try to automatically convert the model using
+:func:`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
+of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
+
 #### Reward Modeling (`--task reward`)

 ```{eval-rst}
@@ -461,6 +457,9 @@ despite being described otherwise on its model card.
   - ✅︎
 ```

+If your model is not in the above list, we will try to automatically convert the model using
+:func:`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
+
 ```{important}
 For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
 e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
@@ -490,6 +489,9 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1
   - ✅︎
 ```

+If your model is not in the above list, we will try to automatically convert the model using
+:func:`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+
 #### Sentence Pair Scoring (`--task score`)

 ```{eval-rst}
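The doc additions above describe automatic adapter conversion for models outside the tested list. A minimal sketch of what this means in practice, mirroring the documentation's own snippets; the model name is illustrative only, and the conversion happens implicitly when a pooling task is requested:

```python
# Minimal sketch of the automatic conversion described above, assuming a
# working vLLM install; the model name is illustrative only.
from vllm import LLM

# Requesting task="embed" for a generation-only checkpoint triggers
# as_embedding_model under the hood: by default the embedding is the
# normalized hidden state of the last prompt token.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", task="embed")
output = llm.encode("Hello, my name is")
print(output)
```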

requirements-openvino.txt

Lines changed: 2 additions & 2 deletions

@@ -4,5 +4,5 @@
 torch == 2.5.1 # should be aligned with "common" vLLM torch version
 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention

-optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
-optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
+optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
+optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version

setup.py

Lines changed: 7 additions & 3 deletions

@@ -455,9 +455,13 @@ def get_gaudi_sw_version():


 def get_vllm_version() -> str:
-    version = get_version(
-        write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
-    )
+    # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
+    try:
+        version = get_version(
+            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+        )
+    except LookupError:
+        version = "0.0.0"

     sep = "+" if "+" not in version else "."  # dev versions might contain +
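For context, `get_version` here comes from setuptools-scm, which derives the version from Git metadata and raises `LookupError` when none is discoverable (for example, a source tree without `.git`). A standalone sketch of the fallback pattern, assuming setuptools-scm is installed:

```python
# Standalone sketch of the fallback, assuming setuptools_scm is installed.
from setuptools_scm import get_version

try:
    # Succeeds only when SCM metadata (Git tags) is discoverable.
    version = get_version(write_to="vllm/_version.py")
except LookupError:
    # No .git directory or tags, e.g. building from a plain source tarball.
    version = "0.0.0"
print(version)
```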

tests/models/embedding/language/test_cls_models.py

Lines changed: 1 addition & 4 deletions

@@ -1,7 +1,4 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This test only tests small models. Big models such as 7B should be tested from
-test_big_models.py because it could use a larger instance to run tests.
+"""Compare the classification outputs of HF and vLLM models.

 Run `pytest tests/models/test_cls_models.py`.
 """

tests/models/embedding/language/test_scoring.py

Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
-"""Compare the embedding outputs of HF and vLLM models.
+"""Compare the scoring outputs of HF and vLLM models.

-Run `pytest tests/models/embedding/language/test_embedding.py`.
+Run `pytest tests/models/embedding/language/test_scoring.py`.
 """
 import math

tests/models/test_registry.py

Lines changed: 7 additions & 4 deletions

@@ -6,7 +6,9 @@
 from vllm.model_executor.models import (is_pooling_model,
                                         is_text_generation_model,
                                         supports_multimodal)
-from vllm.model_executor.models.adapters import as_embedding_model
+from vllm.model_executor.models.adapters import (as_classification_model,
+                                                 as_embedding_model,
+                                                 as_reward_model)
 from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
                                                  _SPECULATIVE_DECODING_MODELS,
                                                  _TEXT_GENERATION_MODELS,
@@ -29,9 +31,10 @@ def test_registry_imports(model_arch):
             or model_arch in _MULTIMODAL_MODELS):
         assert is_text_generation_model(model_cls)

-    # All vLLM models should be convertible to an embedding model
-    embed_model = as_embedding_model(model_cls)
-    assert is_pooling_model(embed_model)
+    # All vLLM models should be convertible to a pooling model
+    assert is_pooling_model(as_classification_model(model_cls))
+    assert is_pooling_model(as_embedding_model(model_cls))
+    assert is_pooling_model(as_reward_model(model_cls))

     if model_arch in _MULTIMODAL_MODELS:
         assert supports_multimodal(model_cls)
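The updated assertion can be exercised ad hoc for a single architecture. A hedged sketch using only names visible in this diff; the architecture string is illustrative and must be registered in vLLM:

```python
# Ad-hoc version of the updated test for one architecture; the architecture
# name is illustrative and must exist in vLLM's model registry.
from vllm.model_executor.models import ModelRegistry, is_pooling_model
from vllm.model_executor.models.adapters import (as_classification_model,
                                                 as_embedding_model,
                                                 as_reward_model)

model_cls, _ = ModelRegistry.resolve_model_cls(["LlamaForCausalLM"])
for adapter in (as_classification_model, as_embedding_model, as_reward_model):
    # Each adapter should wrap the generative class into a pooling model.
    assert is_pooling_model(adapter(model_cls))
```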

tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py

Lines changed: 3 additions & 2 deletions

@@ -13,6 +13,7 @@


 class MyGemma2Embedding(nn.Module):
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -62,8 +63,8 @@ def pooler(
         return self._pooler(hidden_states, pooling_metadata)

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
-        weights = hf_to_vllm_mapper.apply(weights)
+
+        weights = self.hf_to_vllm_mapper.apply(weights)
         weights = ((name, data) for name, data in weights
                    if not name.startswith("lm_head."))
         return self.model.load_weights(weights)
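For readers unfamiliar with `WeightsMapper`: with `orig_to_new_prefix={"model.": ""}` it strips the `model.` prefix from HF checkpoint names so they match the wrapped vLLM module. A small sketch of the renaming; the import path is assumed, and the tensors are dummies standing in for checkpoint weights:

```python
# Sketch of the prefix renaming; the import path is assumed and the
# tensors are dummies standing in for real checkpoint weights.
import torch
from vllm.model_executor.models.utils import WeightsMapper

mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
weights = [("model.embed_tokens.weight", torch.zeros(4)),
           ("lm_head.weight", torch.zeros(4))]
# "model." is stripped; names without the prefix pass through unchanged.
print([name for name, _ in mapper.apply(weights)])
# Expected: ['embed_tokens.weight', 'lm_head.weight']
```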

vllm/attention/layer.py

Lines changed: 1 addition & 0 deletions

@@ -191,6 +191,7 @@ def __init__(
             kv_cache_dtype=None,
             block_size=16,
             is_attention_free=False)
+        attn_backend = backend_name_to_enum(attn_backend.get_name())
         if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
             attn_backend = _Backend.XFORMERS
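The added line matters because `get_attn_backend` returns a backend class whose name is a string, while the membership test below it compares `_Backend` enum members. A self-contained illustration of the mismatch it fixes, using stand-in types rather than vLLM's real classes:

```python
# Self-contained illustration with stand-in types; not vLLM's real classes.
from enum import Enum, auto

class _Backend(Enum):          # stand-in for vLLM's backend enum
    FLASH_ATTN = auto()
    XFORMERS = auto()

class FlashAttentionBackend:   # stand-in for the class the selector returns
    @staticmethod
    def get_name() -> str:
        return "FLASH_ATTN"

# Comparing the backend class against enum members is always False ...
assert FlashAttentionBackend not in {_Backend.FLASH_ATTN}
# ... so the name must be mapped back to the enum before the check.
assert _Backend[FlashAttentionBackend.get_name()] is _Backend.FLASH_ATTN
```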

vllm/model_executor/model_loader/utils.py

Lines changed: 8 additions & 2 deletions

@@ -7,7 +7,9 @@

 from vllm.config import ModelConfig
 from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.models.adapters import as_embedding_model
+from vllm.model_executor.models.adapters import (as_classification_model,
+                                                 as_embedding_model,
+                                                 as_reward_model)


 @contextlib.contextmanager
@@ -35,8 +37,12 @@ def get_model_architecture(
         architectures = ["QuantMixtralForCausalLM"]

     model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
-    if model_config.runner_type == "pooling":
+    if model_config.task == "embed":
         model_cls = as_embedding_model(model_cls)
+    elif model_config.task == "classify":
+        model_cls = as_classification_model(model_cls)
+    elif model_config.task == "reward":
+        model_cls = as_reward_model(model_cls)

     return model_cls, arch
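From the user's side, the new branches mean each pooling task selects its matching adapter instead of everything funneling through `as_embedding_model`. A minimal sketch mirroring the documentation snippets above; the model name is illustrative only:

```python
# Minimal sketch of task-based conversion from the user's side; the model
# name is illustrative only.
from vllm import LLM

# task="classify" now routes through as_classification_model: for models
# outside the supported list, class probabilities come from the softmaxed
# hidden state of the last token by default.
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
print(llm.encode("vLLM is wonderful!"))
```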
