xiao-llm
diff --git a/tests/entrypoints/llm/test_classify.py b/tests/entrypoints/llm/test_classify.py
Lines changed: 67 additions & 0 deletions
diff --git a/tests/entrypoints/llm/test_embedding.py b/tests/entrypoints/llm/test_embedding.py
Lines changed: 56 additions & 0 deletions
diff --git a/tests/entrypoints/llm/test_reward.py b/tests/entrypoints/llm/test_reward.py
Lines changed: 66 additions & 0 deletions
diff --git a/tests/entrypoints/llm/test_score.py b/tests/entrypoints/llm/test_score.py
Lines changed: 69 additions & 0 deletions
diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py
Lines changed: 31 additions & 0 deletions
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
Lines changed: 34 additions & 0 deletions
diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import weakref
+
+import pytest
+import torch
+
+from vllm import LLM, PoolingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+
+from ...models.utils import softmax
+
+MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
+
+prompts = ["The chef prepared a delicious meal."]
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """Autouse shim that requests ``run_with_both_engines`` for every test.
+
+    Intentionally empty; may be promoted to conftest.py to cover a package.
+    """
+
+
+@pytest.fixture(scope="module")
+def llm():
+    """Module-scoped ``LLM`` for MODEL_NAME, handed out as a weak proxy."""
+    engine_kwargs = dict(max_num_batched_tokens=32768,
+                         tensor_parallel_size=1,
+                         gpu_memory_utilization=0.75,
+                         enforce_eager=True,
+                         seed=0)
+    engine = LLM(model=MODEL_NAME, **engine_kwargs)
+
+    with engine.deprecate_legacy_api():
+        # pytest caches the fixture value; yielding only a weak proxy keeps
+        # that cache from pinning the engine, so ``del`` can release it.
+        yield weakref.proxy(engine)
+        del engine
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.skip_global_cleanup
+def test_pooling_params(llm: LLM):
+    """Compare ``classify`` outputs for activation unset, on and off."""
+
+    def classify_probs(flag):
+        params = PoolingParams(activation=flag)
+        results = llm.classify(prompts, pooling_params=params, use_tqdm=False)
+        return torch.tensor([res.outputs.probs for res in results])
+
+    # Requests are issued in tuple order: unset, enabled, disabled.
+    unset, enabled, disabled = map(classify_probs, (None, True, False))
+
+    # Leaving the flag unset must behave like switching it on.
+    same_as_default = torch.allclose(unset, enabled, atol=1e-2)
+    assert same_as_default, "Default should use activation."
+    # Switching it off must visibly change the output ...
+    unchanged = torch.allclose(enabled, disabled, atol=1e-2)
+    assert not unchanged, "wo_activation should not use activation."
+    # ... and the imported ``softmax`` helper must recover the "on" result.
+    recovered = torch.allclose(softmax(disabled), enabled, atol=1e-2)
+    assert recovered, (
+        "w_activation should be close to activation(wo_activation).")
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import weakref
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm import LLM, PoolingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+
+MODEL_NAME = "intfloat/multilingual-e5-small"
+
+prompts = ["The chef prepared a delicious meal."]
+
+
+@pytest.fixture(scope="module")
+def llm():
+    """Module-scoped ``LLM`` for MODEL_NAME, handed out as a weak proxy."""
+    engine_kwargs = dict(max_num_batched_tokens=32768,
+                         tensor_parallel_size=1,
+                         gpu_memory_utilization=0.75,
+                         enforce_eager=True,
+                         seed=0)
+    engine = LLM(model=MODEL_NAME, **engine_kwargs)
+
+    with engine.deprecate_legacy_api():
+        # pytest caches the fixture value; yielding only a weak proxy keeps
+        # that cache from pinning the engine, so ``del`` can release it.
+        yield weakref.proxy(engine)
+        del engine
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.skip_global_cleanup
+def test_pooling_params(llm: LLM):
+    """Compare ``embed`` outputs for normalize unset, on and off."""
+
+    def embed_vectors(flag):
+        params = PoolingParams(normalize=flag)
+        results = llm.embed(prompts, pooling_params=params, use_tqdm=False)
+        return torch.tensor([res.outputs.embedding for res in results])
+
+    # Requests are issued in tuple order: unset, enabled, disabled.
+    unset, enabled, disabled = map(embed_vectors, (None, True, False))
+
+    same_as_default = torch.allclose(unset, enabled, atol=1e-2)
+    assert same_as_default, "Default should use normal."
+    unchanged = torch.allclose(enabled, disabled, atol=1e-2)
+    assert not unchanged, "wo_normal should not use normal."
+    # L2-normalising the raw vectors by hand must recover the "on" result.
+    renormalized = F.normalize(disabled, p=2, dim=-1)
+    recovered = torch.allclose(enabled, renormalized, atol=1e-2)
+    assert recovered, "w_normal should be close to normal(wo_normal)."
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import weakref
+
+import pytest
+import torch
+
+from vllm import LLM, PoolingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+
+from ...models.utils import softmax
+
+MODEL_NAME = "internlm/internlm2-1_8b-reward"
+
+prompts = ["The chef prepared a delicious meal."]
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """Autouse shim that requests ``run_with_both_engines`` for every test.
+
+    Intentionally empty; may be promoted to conftest.py to cover a package.
+    """
+
+
+@pytest.fixture(scope="module")
+def llm():
+    """Module-scoped ``LLM`` for MODEL_NAME, handed out as a weak proxy."""
+    engine_kwargs = dict(max_num_batched_tokens=32768,
+                         tensor_parallel_size=1,
+                         gpu_memory_utilization=0.75,
+                         enforce_eager=True,
+                         trust_remote_code=True,
+                         seed=0)
+    engine = LLM(model=MODEL_NAME, **engine_kwargs)
+
+    with engine.deprecate_legacy_api():
+        # pytest caches the fixture value; yielding only a weak proxy keeps
+        # that cache from pinning the engine, so ``del`` can release it.
+        yield weakref.proxy(engine)
+        del engine
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.skip_global_cleanup
+def test_pooling_params(llm: LLM):
+    """Compare ``reward`` outputs for softmax unset, on and off."""
+
+    def reward_data(flag):
+        # Named ``flag`` so the imported ``softmax`` helper is not shadowed.
+        params = PoolingParams(softmax=flag)
+        results = llm.reward(prompts, pooling_params=params, use_tqdm=False)
+        return torch.cat([res.outputs.data for res in results])
+
+    # Requests are issued in tuple order: unset, enabled, disabled.
+    unset, enabled, disabled = map(reward_data, (None, True, False))
+
+    same_as_default = torch.allclose(unset, enabled, atol=1e-2)
+    assert same_as_default, "Default should use softmax."
+    unchanged = torch.allclose(enabled, disabled, atol=1e-2)
+    assert not unchanged, "wo_softmax should not use softmax."
+    # Applying ``softmax`` to the raw data must recover the "on" result.
+    recovered = torch.allclose(softmax(disabled), enabled, atol=1e-2)
+    assert recovered, "w_softmax should be close to softmax(wo_softmax)."
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import weakref
+
+import pytest
+import torch
+
+from vllm import LLM, PoolingParams
+from vllm.distributed import cleanup_dist_env_and_memory
+
+from ...models.utils import softmax
+
+MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    """Autouse shim that requests ``run_with_both_engines`` for every test.
+
+    Intentionally empty; may be promoted to conftest.py to cover a package.
+    """
+
+
+@pytest.fixture(scope="module")
+def llm():
+    """Module-scoped ``LLM`` for MODEL_NAME, handed out as a weak proxy."""
+    engine_kwargs = dict(max_num_batched_tokens=32768,
+                         tensor_parallel_size=1,
+                         gpu_memory_utilization=0.75,
+                         enforce_eager=True,
+                         seed=0)
+    engine = LLM(model=MODEL_NAME, **engine_kwargs)
+
+    with engine.deprecate_legacy_api():
+        # pytest caches the fixture value; yielding only a weak proxy keeps
+        # that cache from pinning the engine, so ``del`` can release it.
+        yield weakref.proxy(engine)
+        del engine
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.skip_global_cleanup
+def test_pooling_params(llm: LLM):
+    """Compare ``score`` outputs for activation unset, on and off."""
+
+    query = "What is the capital of France?"
+    answer = "The capital of France is Paris."
+
+    def pair_scores(flag):
+        params = PoolingParams(activation=flag)
+        results = llm.score(query, answer, pooling_params=params,
+                            use_tqdm=False)
+        return torch.tensor([res.outputs.score for res in results])
+
+    # Requests are issued in tuple order: unset, enabled, disabled.
+    unset, enabled, disabled = map(pair_scores, (None, True, False))
+
+    # Leaving the flag unset must behave like switching it on.
+    same_as_default = torch.allclose(unset, enabled, atol=1e-2)
+    assert same_as_default, "Default should use activation."
+    # Switching it off must visibly change the score ...
+    unchanged = torch.allclose(enabled, disabled, atol=1e-2)
+    assert not unchanged, "wo_activation should not use activation."
+    # ... and ``softmax`` of the raw score must recover the "on" result.
+    recovered = torch.allclose(softmax(disabled), enabled, atol=1e-2)
+    assert recovered, (
+        "w_activation should be close to activation(wo_activation).")
@@ -3,6 +3,8 @@
 
 import pytest
 import requests
+import torch
+import torch.nn.functional as F
 
 from vllm.entrypoints.openai.protocol import ClassificationResponse
 
@@ -181,3 +183,32 @@ async def test_invocations(server: RemoteOpenAIServer):
         assert classification_data.keys() == invocation_data.keys()
         assert classification_data["probs"] == pytest.approx(
             invocation_data["probs"], rel=0.01)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_activation(server: RemoteOpenAIServer, model_name: str):
+    """Check the ``activation`` request field of the ``classify`` endpoint."""
+    input_text = ["This product was excellent and exceeded my expectations"]
+
+    async def get_outputs(activation):
+        payload = {"model": model_name, "input": input_text,
+                   "activation": activation}
+        response = requests.post(server.url_for("classify"), json=payload)
+        # Surface HTTP errors here rather than as a KeyError on "data".
+        response.raise_for_status()
+        return torch.tensor([x["probs"] for x in response.json()["data"]])
+
+    default = await get_outputs(activation=None)
+    w_activation = await get_outputs(activation=True)
+    wo_activation = await get_outputs(activation=False)
+
+    # Leaving the field unset must behave like activation=True.
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation.")
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation.")
+    # Softmax over the un-activated output must reproduce the activated one.
+    expected = F.softmax(wo_activation, dim=-1)
+    assert torch.allclose(expected, w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation).")
@@ -8,6 +8,8 @@
 import pytest
 import pytest_asyncio
 import requests
+import torch
+import torch.nn.functional as F
 
 from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -369,3 +371,35 @@ async def test_invocations_conversation(server: RemoteOpenAIServer):
                                embeddings_1_lst=[invocation_data["embedding"]],
                                name_0="chat",
                                name_1="invocation")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_normalize(server: RemoteOpenAIServer, model_name: str):
+    """Check the ``normalize`` request field of the embeddings endpoint."""
+    input_text = ["The chef prepared a delicious meal."]
+
+    async def get_outputs(normalize):
+        payload = {
+            # Use the parametrized name, not the module constant directly.
+            "model": model_name,
+            "input": input_text,
+            "encoding_format": "float",
+            "normalize": normalize,
+        }
+        response = requests.post(server.url_for("v1/embeddings"), json=payload)
+        # Surface HTTP errors here rather than as a KeyError on "data".
+        response.raise_for_status()
+        return torch.tensor([x["embedding"] for x in response.json()["data"]])
+
+    default = await get_outputs(normalize=None)
+    w_normal = await get_outputs(normalize=True)
+    wo_normal = await get_outputs(normalize=False)
+
+    assert torch.allclose(default, w_normal, atol=1e-2), (
+        "Default should use normal.")
+    assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
+        "wo_normal should not use normal.")
+    expected = F.normalize(wo_normal, p=2, dim=-1)
+    assert torch.allclose(w_normal, expected, atol=1e-2), (
+        "w_normal should be close to normal(wo_normal).")
@@ -3,6 +3,8 @@
 
 import pytest
 import requests
+import torch
+import torch.nn.functional as F
 
 from vllm.entrypoints.openai.protocol import RerankResponse
 
@@ -125,3 +127,39 @@ def test_invocations(server: RemoteOpenAIServer):
         assert rerank_result.keys() == invocations_result.keys()
         assert rerank_result["relevance_score"] == pytest.approx(
             invocations_result["relevance_score"], rel=0.01)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_activation(server: RemoteOpenAIServer, model_name: str):
+    """Check the ``activation`` request field of the ``rerank`` endpoint."""
+    query = "What is the capital of France?"
+    documents = [
+        "The capital of Brazil is Brasilia.",
+        "The capital of France is Paris.",
+    ]
+
+    async def get_outputs(activation):
+        payload = {
+            "model": model_name,
+            "query": query,
+            "documents": documents,
+            "activation": activation,
+        }
+        response = requests.post(server.url_for("rerank"), json=payload)
+        # Surface HTTP errors here rather than as a KeyError on "results".
+        response.raise_for_status()
+        results = response.json()["results"]
+        return torch.tensor([x["relevance_score"] for x in results])
+
+    default = await get_outputs(activation=None)
+    w_activation = await get_outputs(activation=True)
+    wo_activation = await get_outputs(activation=False)
+
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation.")
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation.")
+    # Sigmoid of the un-activated scores must reproduce the activated ones.
+    assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation).")