Commit e8afd10

Merge branch 'main' into mm-fields
2 parents 4cac998 + 3f3e92e commit e8afd10

20 files changed: +293 −237 lines changed

Dockerfile.cpu

Lines changed: 3 additions & 3 deletions

@@ -26,20 +26,20 @@ RUN pip install intel_extension_for_pytorch==2.5.0

 WORKDIR /workspace

+COPY requirements-build.txt requirements-build.txt
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt

 FROM cpu-test-1 AS build

 WORKDIR /workspace/vllm

+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cpu.txt requirements-cpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
-    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
     pip install -v -r requirements-cpu.txt

 COPY . .

docs/source/models/supported_models.md

Lines changed: 12 additions & 10 deletions

@@ -28,7 +28,7 @@ llm = LLM(model=..., task="generate") # Name or path of your model
 output = llm.generate("Hello, my name is")
 print(output)

-# For pooling models (task={embed,classify,reward}) only
+# For pooling models (task={embed,classify,reward,score}) only
 llm = LLM(model=..., task="embed") # Name or path of your model
 output = llm.encode("Hello, my name is")
 print(output)
@@ -59,7 +59,7 @@ llm = LLM(model=..., revision=..., task=..., trust_remote_code=True)
 output = llm.generate("Hello, my name is")
 print(output)

-# For pooling models (task={embed,classify,reward}) only
+# For pooling models (task={embed,classify,reward,score}) only
 output = llm.encode("Hello, my name is")
 print(output)
 ```
@@ -369,14 +369,6 @@ you should explicitly specify the task type to ensure that the model is used in

 #### Text Embedding (`--task embed`)

-Any text generation model can be converted into an embedding model by passing {code}`--task embed`.
-
-```{note}
-To get the best results, you should use pooling models that are specifically trained as such.
-```
-
-The following table lists those that are tested in vLLM.
-
 ```{eval-rst}
 .. list-table::
   :widths: 25 25 50 5 5
@@ -437,6 +429,10 @@ On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`
 despite being described otherwise on its model card.
 ```

+If your model is not in the above list, we will try to automatically convert the model using
+:func:`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
+of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
+
 #### Reward Modeling (`--task reward`)

 ```{eval-rst}
@@ -461,6 +457,9 @@ despite being described otherwise on its model card.
   - ✅︎
 ```

+If your model is not in the above list, we will try to automatically convert the model using
+:func:`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
+
 ```{important}
 For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
 e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
@@ -490,6 +489,9 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1
   - ✅︎
 ```

+If your model is not in the above list, we will try to automatically convert the model using
+:func:`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+
 #### Sentence Pair Scoring (`--task score`)

 ```{eval-rst}
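The doc additions above describe automatic adapter conversion for models outside the tested list. A minimal sketch of what this means in practice, mirroring the documentation's own snippets; the model name is illustrative only, and the conversion happens implicitly when a pooling task is requested:

```python
# Minimal sketch of the automatic conversion described above, assuming a
# working vLLM install; the model name is illustrative only.
from vllm import LLM

# Requesting task="embed" for a generation-only checkpoint triggers
# as_embedding_model under the hood: by default the embedding is the
# normalized hidden state of the last prompt token.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", task="embed")
output = llm.encode("Hello, my name is")
print(output)
```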

requirements-openvino.txt

Lines changed: 2 additions & 2 deletions

@@ -4,5 +4,5 @@
 torch == 2.5.1 # should be aligned with "common" vLLM torch version
 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention

-optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
-optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
+optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
+optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version

setup.py

Lines changed: 7 additions & 3 deletions

@@ -455,9 +455,13 @@ def get_gaudi_sw_version():


 def get_vllm_version() -> str:
-    version = get_version(
-        write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
-    )
+    # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
+    try:
+        version = get_version(
+            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+        )
+    except LookupError:
+        version = "0.0.0"

     sep = "+" if "+" not in version else "."  # dev versions might contain +
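For context, `get_version` here comes from setuptools-scm, which derives the version from Git metadata and raises `LookupError` when none is discoverable (for example, a source tree without `.git`). A standalone sketch of the fallback pattern, assuming setuptools-scm is installed:

```python
# Standalone sketch of the fallback, assuming setuptools_scm is installed.
from setuptools_scm import get_version

try:
    # Succeeds only when SCM metadata (Git tags) is discoverable.
    version = get_version(write_to="vllm/_version.py")
except LookupError:
    # No .git directory or tags, e.g. building from a plain source tarball.
    version = "0.0.0"
print(version)
```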

tests/models/embedding/language/test_cls_models.py

Lines changed: 1 addition & 4 deletions

@@ -1,7 +1,4 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This test only tests small models. Big models such as 7B should be tested from
-test_big_models.py because it could use a larger instance to run tests.
+"""Compare the classification outputs of HF and vLLM models.

 Run `pytest tests/models/test_cls_models.py`.
 """

tests/models/embedding/language/test_scoring.py

Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
-"""Compare the embedding outputs of HF and vLLM models.
+"""Compare the scoring outputs of HF and vLLM models.

-Run `pytest tests/models/embedding/language/test_embedding.py`.
+Run `pytest tests/models/embedding/language/test_scoring.py`.
 """
 import math

tests/models/test_registry.py

Lines changed: 7 additions & 4 deletions

@@ -6,7 +6,9 @@
 from vllm.model_executor.models import (is_pooling_model,
                                         is_text_generation_model,
                                         supports_multimodal)
-from vllm.model_executor.models.adapters import as_embedding_model
+from vllm.model_executor.models.adapters import (as_classification_model,
+                                                 as_embedding_model,
+                                                 as_reward_model)
 from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
                                                  _SPECULATIVE_DECODING_MODELS,
                                                  _TEXT_GENERATION_MODELS,
@@ -29,9 +31,10 @@ def test_registry_imports(model_arch):
             or model_arch in _MULTIMODAL_MODELS):
         assert is_text_generation_model(model_cls)

-    # All vLLM models should be convertible to an embedding model
-    embed_model = as_embedding_model(model_cls)
-    assert is_pooling_model(embed_model)
+    # All vLLM models should be convertible to a pooling model
+    assert is_pooling_model(as_classification_model(model_cls))
+    assert is_pooling_model(as_embedding_model(model_cls))
+    assert is_pooling_model(as_reward_model(model_cls))

     if model_arch in _MULTIMODAL_MODELS:
         assert supports_multimodal(model_cls)
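The updated assertion can be exercised ad hoc for a single architecture. A hedged sketch using only names visible in this diff; the architecture string is illustrative and must be registered in vLLM:

```python
# Ad-hoc version of the updated test for one architecture; the architecture
# name is illustrative and must exist in vLLM's model registry.
from vllm.model_executor.models import ModelRegistry, is_pooling_model
from vllm.model_executor.models.adapters import (as_classification_model,
                                                 as_embedding_model,
                                                 as_reward_model)

model_cls, _ = ModelRegistry.resolve_model_cls(["LlamaForCausalLM"])
for adapter in (as_classification_model, as_embedding_model, as_reward_model):
    # Each adapter should wrap the generative class into a pooling model.
    assert is_pooling_model(adapter(model_cls))
```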

tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py

Lines changed: 3 additions & 2 deletions

@@ -13,6 +13,7 @@


 class MyGemma2Embedding(nn.Module):
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})

     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
@@ -62,8 +63,8 @@ def pooler(
         return self._pooler(hidden_states, pooling_metadata)

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
-        weights = hf_to_vllm_mapper.apply(weights)
+
+        weights = self.hf_to_vllm_mapper.apply(weights)
         weights = ((name, data) for name, data in weights
                    if not name.startswith("lm_head."))
         return self.model.load_weights(weights)
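For readers unfamiliar with `WeightsMapper`: with `orig_to_new_prefix={"model.": ""}` it strips the `model.` prefix from HF checkpoint names so they match the wrapped vLLM module. A small sketch of the renaming; the import path is assumed, and the tensors are dummies standing in for checkpoint weights:

```python
# Sketch of the prefix renaming; the import path is assumed and the
# tensors are dummies standing in for real checkpoint weights.
import torch
from vllm.model_executor.models.utils import WeightsMapper

mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
weights = [("model.embed_tokens.weight", torch.zeros(4)),
           ("lm_head.weight", torch.zeros(4))]
# "model." is stripped; names without the prefix pass through unchanged.
print([name for name, _ in mapper.apply(weights)])
# Expected: ['embed_tokens.weight', 'lm_head.weight']
```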

vllm/attention/layer.py

Lines changed: 1 addition & 0 deletions

@@ -191,6 +191,7 @@ def __init__(
             kv_cache_dtype=None,
             block_size=16,
             is_attention_free=False)
+        attn_backend = backend_name_to_enum(attn_backend.get_name())
         if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
             attn_backend = _Backend.XFORMERS
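The added line matters because `get_attn_backend` returns a backend class whose name is a string, while the membership test below it compares `_Backend` enum members. A self-contained illustration of the mismatch it fixes, using stand-in types rather than vLLM's real classes:

```python
# Self-contained illustration with stand-in types; not vLLM's real classes.
from enum import Enum, auto

class _Backend(Enum):          # stand-in for vLLM's backend enum
    FLASH_ATTN = auto()
    XFORMERS = auto()

class FlashAttentionBackend:   # stand-in for the class the selector returns
    @staticmethod
    def get_name() -> str:
        return "FLASH_ATTN"

# Comparing the backend class against enum members is always False ...
assert FlashAttentionBackend not in {_Backend.FLASH_ATTN}
# ... so the name must be mapped back to the enum before the check.
assert _Backend[FlashAttentionBackend.get_name()] is _Backend.FLASH_ATTN
```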

vllm/model_executor/model_loader/utils.py

Lines changed: 8 additions & 2 deletions

@@ -7,7 +7,9 @@

 from vllm.config import ModelConfig
 from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.models.adapters import as_embedding_model
+from vllm.model_executor.models.adapters import (as_classification_model,
+                                                 as_embedding_model,
+                                                 as_reward_model)


 @contextlib.contextmanager
@@ -35,8 +37,12 @@ def get_model_architecture(
         architectures = ["QuantMixtralForCausalLM"]

     model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
-    if model_config.runner_type == "pooling":
+    if model_config.task == "embed":
         model_cls = as_embedding_model(model_cls)
+    elif model_config.task == "classify":
+        model_cls = as_classification_model(model_cls)
+    elif model_config.task == "reward":
+        model_cls = as_reward_model(model_cls)

     return model_cls, arch
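From the user's side, the new branches mean each pooling task selects its matching adapter instead of everything funneling through `as_embedding_model`. A minimal sketch mirroring the documentation snippets above; the model name is illustrative only:

```python
# Minimal sketch of task-based conversion from the user's side; the model
# name is illustrative only.
from vllm import LLM

# task="classify" now routes through as_classification_model: for models
# outside the supported list, class probabilities come from the softmaxed
# hidden state of the last token by default.
llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
print(llm.encode("vLLM is wonderful!"))
```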
