From 0d23917edbe6a3934ad31a402980c1d6bb81bfdf Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 23 May 2025 12:48:58 -0700 Subject: [PATCH 01/15] rebase Signed-off-by: Linkun Chen --- .../model_config/qwen_3_8B_fp8_1replica.yaml | 19 ++++++++++++++++ .../configs/serve_qwen3_8B_fp8_1p1d.yaml | 7 ++++++ release/release_tests.yaml | 22 +++++++++++++++++++ 3 files changed, 48 insertions(+) create mode 100644 release/llm_tests/serve/configs/model_config/qwen_3_8B_fp8_1replica.yaml create mode 100644 release/llm_tests/serve/configs/serve_qwen3_8B_fp8_1p1d.yaml diff --git a/release/llm_tests/serve/configs/model_config/qwen_3_8B_fp8_1replica.yaml b/release/llm_tests/serve/configs/model_config/qwen_3_8B_fp8_1replica.yaml new file mode 100644 index 00000000000..e8d57ef74d8 --- /dev/null +++ b/release/llm_tests/serve/configs/model_config/qwen_3_8B_fp8_1replica.yaml @@ -0,0 +1,19 @@ +model_loading_config: + model_id: Qwen/Qwen3-8B-FP8 + model_source: Qwen/Qwen3-8B-FP8 + +accelerator_type: L40S + +# Test V1 at the same time +runtime_env: + env_vars: + VLLM_USE_V1: "1" + +engine_kwargs: + # Need eager mode to suppress https://github.com/vllm-project/vllm/issues/18244 + enforce_eager: True + data_parallel_size: 1 + tensor_parallel_size: 1 + +deployment_config: + num_replicas: 1 diff --git a/release/llm_tests/serve/configs/serve_qwen3_8B_fp8_1p1d.yaml b/release/llm_tests/serve/configs/serve_qwen3_8B_fp8_1p1d.yaml new file mode 100644 index 00000000000..b7e3f3bc2ba --- /dev/null +++ b/release/llm_tests/serve/configs/serve_qwen3_8B_fp8_1p1d.yaml @@ -0,0 +1,7 @@ +applications: + - args: + prefill_config: ./configs/model_config/qwen_3_8B_fp8_1replica.yaml + decode_config: ./configs/model_config/qwen_3_8B_fp8_1replica.yaml + import_path: ray.llm._internal.serve.deployments.prefill_decode_disagg.prefill_decode_disagg:build_app + name: llm-endpoint + route_prefix: / diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 8296c3c3117..8573a783267 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -4304,6 +4304,28 @@ long_running: false script: pytest -vs test_llm_serve_integration.py +- name: llm_serve_qwen_3_8B_fp8_1p1d + frequency: nightly + python: "3.11" + group: llm-serve + team: llm + working_dir: llm_tests/serve + + cluster: + byod: + type: llm-cu124 + # TODO(lk-chen): remove once we bump vllm to 0.9.0 + post_build_script: byod_llm_pd_disagg_test.sh + cluster_compute: llm_auto_select_worker.yaml + # NOTE: Important for getting the correct secrets + cloud_id: cld_wy5a6nhazplvu32526ams61d98 + project_id: prj_lhlrf1u5yv8qz9qg3xzw8fkiiq + + run: + timeout: 3600 + long_running: false + script: python run_llm_serve_release_tests.py --serve-config-file configs/serve_qwen3_8B_fp8_1p1d.yaml --skip-hf-token true + ############## # LLM Batch From 7cfa1ba63c2ffdbb0ca2bdbed3f7330c8fe643b1 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 23 May 2025 12:52:28 -0700 Subject: [PATCH 02/15] add requiremnts Signed-off-by: Linkun Chen --- release/ray_release/byod/byod_llm_pd_disagg_test.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 release/ray_release/byod/byod_llm_pd_disagg_test.sh diff --git a/release/ray_release/byod/byod_llm_pd_disagg_test.sh b/release/ray_release/byod/byod_llm_pd_disagg_test.sh new file mode 100644 index 00000000000..fbd7b3d2421 --- /dev/null +++ b/release/ray_release/byod/byod_llm_pd_disagg_test.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# This script is used to build an extra layer on top of the base llm image +# to install vllm at specific version that includes necessary changes for +# PD-disaggregated serving. This script also installs necessary packages. + +set -exo pipefail + +# https://github.com/vllm-project/vllm/pull/17751 (Nixl Integration. May 12) +pip3 install "vllm@https://wheels.vllm.ai/d19110204c03e9b77ed957fc70c1262ff370f5e2/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" From 1aeb5b45c2e5d7f9bfde05d484314b4a5762d512 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 23 May 2025 12:52:28 -0700 Subject: [PATCH 03/15] fix Signed-off-by: Linkun Chen --- .../serve/deployments/llm/vllm/vllm_engine.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py index 57a6915deec..fa654e823ab 100644 --- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py +++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py @@ -213,12 +213,12 @@ def __init__( # We need to overwrite the engine_id to make it unique across replicas. # "engine_id" is added in vllm 0.9.0, so do existance check. - if "engine_id" in kv_transfer_config.model_fields: - engine_id = getattr(kv_transfer_config, "engine_id", uuid.uuid4()) - host = vllm.envs.NIXL_SIDE_CHANNEL_HOST - port = vllm.envs.NIXL_SIDE_CHANNEL_PORT - kv_transfer_config.engine_id = "-".join([engine_id, host, port]) - else: + try: + engine_id = getattr(kv_transfer_config, "engine_id", str(uuid.uuid4())) + host = getattr(vllm.envs, "VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost") + port = getattr(vllm.envs, "VLLM_NIXL_SIDE_CHANNEL_PORT", 5557) + kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)]) + except ValueError: # TODO(lk-chen): Raise error once vllm 0.9.0 is pinned to rayllm logger.warning( "engine_id is not supported in vllm < 0.9.0, NIXL-backed kv transfer " From 0bdc0012d968b3c8333c406885b11cd4c99857b4 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 23 May 2025 18:18:09 -0700 Subject: [PATCH 04/15] update Docker Signed-off-by: Linkun Chen --- docker/ray-llm/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/ray-llm/Dockerfile b/docker/ray-llm/Dockerfile index eaf155da7ac..a3014d69765 100644 --- a/docker/ray-llm/Dockerfile +++ b/docker/ray-llm/Dockerfile @@ -75,6 +75,8 @@ sudo apt-get install -y kmod pkg-config librdmacm-dev ( echo "Installing UCX" + # Needed by UCX + sudo apt-get install -y librdmacm-dev cd "${TEMP_DIR}" wget "https://github.com/openucx/ucx/releases/download/v1.18.0/ucx-1.18.0.tar.gz" -q tar xzf ucx-1.18.0.tar.gz; rm ucx-1.18.0.tar.gz From 206c8f984f35e6db39c16bf22abc6078863c336b Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 23 May 2025 20:55:36 -0700 Subject: [PATCH 05/15] +x Signed-off-by: Linkun Chen --- release/ray_release/byod/byod_llm_pd_disagg_test.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) mode change 100644 => 100755 release/ray_release/byod/byod_llm_pd_disagg_test.sh diff --git a/release/ray_release/byod/byod_llm_pd_disagg_test.sh b/release/ray_release/byod/byod_llm_pd_disagg_test.sh old mode 100644 new mode 100755 index fbd7b3d2421..708fafdf7dd --- a/release/ray_release/byod/byod_llm_pd_disagg_test.sh +++ b/release/ray_release/byod/byod_llm_pd_disagg_test.sh @@ -1,9 +1,10 @@ #!/bin/bash # This script is used to build an extra layer on top of the base llm image # to install vllm at specific version that includes necessary changes for -# PD-disaggregated serving. This script also installs necessary packages. +# PD-disaggregated serving. set -exo pipefail # https://github.com/vllm-project/vllm/pull/17751 (Nixl Integration. May 12) -pip3 install "vllm@https://wheels.vllm.ai/d19110204c03e9b77ed957fc70c1262ff370f5e2/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" +pip3 install --no-cache-dir \ + "vllm@https://wheels.vllm.ai/d19110204c03e9b77ed957fc70c1262ff370f5e2/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" From 33b1015441b4a92b3f38fdfbeaeab8cd4aae3d5d Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 23 May 2025 22:07:36 -0700 Subject: [PATCH 06/15] filename change in another PR Signed-off-by: Linkun Chen --- release/release_tests.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 8573a783267..49c9dc2d5a6 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -4324,7 +4324,7 @@ run: timeout: 3600 long_running: false - script: python run_llm_serve_release_tests.py --serve-config-file configs/serve_qwen3_8B_fp8_1p1d.yaml --skip-hf-token true + script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_qwen3_8B_fp8_1p1d.yaml --skip-hf-token true ############## From 5defd57544c3b592f025cb8cc35d5ff5b10153b4 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Fri, 23 May 2025 22:19:05 -0700 Subject: [PATCH 07/15] fix for new config schema Signed-off-by: Linkun Chen --- release/llm_tests/serve/benchmark/common.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/release/llm_tests/serve/benchmark/common.py b/release/llm_tests/serve/benchmark/common.py index 3c1bac146d0..a9a0109a921 100644 --- a/release/llm_tests/serve/benchmark/common.py +++ b/release/llm_tests/serve/benchmark/common.py @@ -3,7 +3,7 @@ import json import logging import os -from typing import Dict, List, Any +from typing import Any, Dict, List from urllib.parse import urlparse import boto3 @@ -17,13 +17,20 @@ ) -def get_llm_config(serve_config_file: List[Dict]) -> List[Any]: +def get_llm_config(serve_config_file: str) -> Dict[str, Any]: """Get the first llm_config from serve config file.""" with open(serve_config_file, "r") as f: loaded_llm_config = yaml.safe_load(f) - applications = loaded_llm_config["applications"] - config = applications[0]["args"]["llm_configs"][0] + application = loaded_llm_config["applications"][0] + assert "args" in application, f"Application must contain an 'args' key, got {application}" + if "llm_configs" in application["args"]: + config = application["args"]["llm_configs"][0] + elif "prefill_config" in application["args"]: + config = application["args"]["prefill_config"] + else: + raise ValueError(f"Unrecognized serve config schema: {application['args']}") + if isinstance(config, dict): return config From 5a6d4fc48f9803bd82287fe1ba3040cc259167d4 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Sat, 24 May 2025 11:12:45 -0700 Subject: [PATCH 08/15] remove 8B, use 0.6B Signed-off-by: Linkun Chen --- release/llm_tests/serve/benchmark/common.py | 4 +++- .../model_config/qwen_3_0dot6B_1replica.yaml | 4 ++-- .../model_config/qwen_3_8B_fp8_1replica.yaml | 19 ------------------- .../configs/serve_qwen3_8B_fp8_1p1d.yaml | 7 ------- release/release_tests.yaml | 2 +- 5 files changed, 6 insertions(+), 30 deletions(-) delete mode 100644 release/llm_tests/serve/configs/model_config/qwen_3_8B_fp8_1replica.yaml delete mode 100644 release/llm_tests/serve/configs/serve_qwen3_8B_fp8_1p1d.yaml diff --git a/release/llm_tests/serve/benchmark/common.py b/release/llm_tests/serve/benchmark/common.py index a9a0109a921..fb42bc08201 100644 --- a/release/llm_tests/serve/benchmark/common.py +++ b/release/llm_tests/serve/benchmark/common.py @@ -23,7 +23,9 @@ def get_llm_config(serve_config_file: str) -> Dict[str, Any]: loaded_llm_config = yaml.safe_load(f) application = loaded_llm_config["applications"][0] - assert "args" in application, f"Application must contain an 'args' key, got {application}" + assert ( + "args" in application + ), f"Application must contain an 'args' key, got {application}" if "llm_configs" in application["args"]: config = application["args"]["llm_configs"][0] elif "prefill_config" in application["args"]: diff --git a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml index 2474aa83963..4d32d267ab4 100644 --- a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml +++ b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml @@ -1,6 +1,6 @@ model_loading_config: - model_id: Qwen/Qwen3-0.6B - model_source: Qwen/Qwen3-0.6B + model_id: Qwen3-0.6B + model_source: s3://air-example-data/rayllm-ossci/qwen3-0.6b accelerator_type: L40S diff --git a/release/llm_tests/serve/configs/model_config/qwen_3_8B_fp8_1replica.yaml b/release/llm_tests/serve/configs/model_config/qwen_3_8B_fp8_1replica.yaml deleted file mode 100644 index e8d57ef74d8..00000000000 --- a/release/llm_tests/serve/configs/model_config/qwen_3_8B_fp8_1replica.yaml +++ /dev/null @@ -1,19 +0,0 @@ -model_loading_config: - model_id: Qwen/Qwen3-8B-FP8 - model_source: Qwen/Qwen3-8B-FP8 - -accelerator_type: L40S - -# Test V1 at the same time -runtime_env: - env_vars: - VLLM_USE_V1: "1" - -engine_kwargs: - # Need eager mode to suppress https://github.com/vllm-project/vllm/issues/18244 - enforce_eager: True - data_parallel_size: 1 - tensor_parallel_size: 1 - -deployment_config: - num_replicas: 1 diff --git a/release/llm_tests/serve/configs/serve_qwen3_8B_fp8_1p1d.yaml b/release/llm_tests/serve/configs/serve_qwen3_8B_fp8_1p1d.yaml deleted file mode 100644 index b7e3f3bc2ba..00000000000 --- a/release/llm_tests/serve/configs/serve_qwen3_8B_fp8_1p1d.yaml +++ /dev/null @@ -1,7 +0,0 @@ -applications: - - args: - prefill_config: ./configs/model_config/qwen_3_8B_fp8_1replica.yaml - decode_config: ./configs/model_config/qwen_3_8B_fp8_1replica.yaml - import_path: ray.llm._internal.serve.deployments.prefill_decode_disagg.prefill_decode_disagg:build_app - name: llm-endpoint - route_prefix: / diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 49c9dc2d5a6..ebea7087598 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -4324,7 +4324,7 @@ run: timeout: 3600 long_running: false - script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_qwen3_8B_fp8_1p1d.yaml --skip-hf-token true + script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_qwen3_0dot6B_1p1d.yaml --skip-hf-token true ############## From b3c170841b06fc59bfb8518cd21a4ae40d018a01 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Sun, 25 May 2025 01:04:57 -0700 Subject: [PATCH 09/15] remove nv driver Signed-off-by: Linkun Chen --- .../serve/configs/model_config/qwen_3_0dot6B_1replica.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml index 4d32d267ab4..63a13cbd80e 100644 --- a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml +++ b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml @@ -1,6 +1,7 @@ model_loading_config: model_id: Qwen3-0.6B - model_source: s3://air-example-data/rayllm-ossci/qwen3-0.6b + model_source: + bucket_uri: s3://air-example-data/rayllm-ossci/qwen3-0.6b accelerator_type: L40S From 1c4b71ed475649320718f42491cee1b99d0bcaf0 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Wed, 28 May 2025 10:16:54 -0700 Subject: [PATCH 10/15] fix test for qwen Signed-off-by: Linkun Chen --- .../serve/configs/model_config/qwen_3_0dot6B_1replica.yaml | 1 + release/llm_tests/serve/probes/test_exact_correctness.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml index 63a13cbd80e..b8ec1015533 100644 --- a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml +++ b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml @@ -11,6 +11,7 @@ runtime_env: VLLM_USE_V1: "1" engine_kwargs: + max_model_len: 8192 # Need eager mode to suppress https://github.com/vllm-project/vllm/issues/18244 enforce_eager: True data_parallel_size: 1 diff --git a/release/llm_tests/serve/probes/test_exact_correctness.py b/release/llm_tests/serve/probes/test_exact_correctness.py index 82552ab6c9d..453aa5ab49c 100644 --- a/release/llm_tests/serve/probes/test_exact_correctness.py +++ b/release/llm_tests/serve/probes/test_exact_correctness.py @@ -18,10 +18,12 @@ def deterministic_querier(openai_async_client): HELLO_WORLD_RESPONSES_BY_MODEL = { "default": ("Hello world.", "'Hello world.'"), + "Qwen3-0.6B": ("\nOkay, the user wants me",), } COUNTING_PATTERN_RESPONSES_BY_MODEL = { - "default": ["Five", "five", "Five.", "five."], + "default": ("Five", "five", "Five.", "five."), + "Qwen3-0.6B": ("\nOkay, the user provided the pattern",), } From 5f055e29583998307762cd4d97bf0eb6b24f8e29 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Wed, 28 May 2025 10:35:48 -0700 Subject: [PATCH 11/15] use a10g Signed-off-by: Linkun Chen --- .../serve/configs/model_config/qwen_3_0dot6B_1replica.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml index b8ec1015533..86ef4cc8e18 100644 --- a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml +++ b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml @@ -3,7 +3,7 @@ model_loading_config: model_source: bucket_uri: s3://air-example-data/rayllm-ossci/qwen3-0.6b -accelerator_type: L40S +accelerator_type: A10G # Test V1 at the same time runtime_env: From f68cfdc02730410cc7dff1d5a9c999484fee4aed Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Wed, 28 May 2025 10:48:24 -0700 Subject: [PATCH 12/15] fix test Signed-off-by: Linkun Chen --- release/llm_tests/serve/probes/test_basic.py | 12 +++++++++--- .../llm_tests/serve/probes/test_exact_correctness.py | 5 ++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/release/llm_tests/serve/probes/test_basic.py b/release/llm_tests/serve/probes/test_basic.py index 0c35adaeba0..6c7d7faf989 100755 --- a/release/llm_tests/serve/probes/test_basic.py +++ b/release/llm_tests/serve/probes/test_basic.py @@ -2,6 +2,7 @@ import asyncio import base64 import os +import re import time import openai @@ -308,11 +309,16 @@ async def test_logprobs( for resp in response: running_str = "" for logprob in resp["logprobs"]["content"]: + logprob_token = logprob["token"] assert len(logprob["top_logprobs"]) == num_logprobs - assert list(logprob["token"].encode()) == logprob["bytes"] + assert list(logprob_token.encode()) == logprob["bytes"] # Special tokens that will not be a part of the response content - if logprob["token"] not in ("", "<|eot_id|>"): - running_str += logprob["token"] + if logprob_token in ("", "<|eot_id|>"): + continue + # Replace non-ASCII tokens, like Ċ, Ġ, etc. with a space + # TODO(lk-chen): Figure out why there are non-ASCII tokens in the response + logprob_token = re.sub(r"[^\x00-\x7F]", " ", logprob_token) + running_str += logprob_token assert running_str == resp["message"]["content"] # top logprobs have to be between 0 and 5 diff --git a/release/llm_tests/serve/probes/test_exact_correctness.py b/release/llm_tests/serve/probes/test_exact_correctness.py index 453aa5ab49c..1eda02374e4 100644 --- a/release/llm_tests/serve/probes/test_exact_correctness.py +++ b/release/llm_tests/serve/probes/test_exact_correctness.py @@ -23,7 +23,10 @@ def deterministic_querier(openai_async_client): COUNTING_PATTERN_RESPONSES_BY_MODEL = { "default": ("Five", "five", "Five.", "five."), - "Qwen3-0.6B": ("\nOkay, the user provided the pattern",), + "Qwen3-0.6B": ( + "\nOkay, the user provided the pattern", + "\nOkay, the user provided the", + ), } From af51afda0c24b42287b2aa76ccc03fd5a7e58b67 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Wed, 28 May 2025 11:31:02 -0700 Subject: [PATCH 13/15] fix Signed-off-by: Linkun Chen --- release/llm_tests/serve/probes/test_basic.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/release/llm_tests/serve/probes/test_basic.py b/release/llm_tests/serve/probes/test_basic.py index 6c7d7faf989..7c3aeb70c86 100755 --- a/release/llm_tests/serve/probes/test_basic.py +++ b/release/llm_tests/serve/probes/test_basic.py @@ -313,10 +313,12 @@ async def test_logprobs( assert len(logprob["top_logprobs"]) == num_logprobs assert list(logprob_token.encode()) == logprob["bytes"] # Special tokens that will not be a part of the response content - if logprob_token in ("", "<|eot_id|>"): + if logprob_token in ("", "<|eot_id|>", "<|im_end|>"): continue - # Replace non-ASCII tokens, like Ċ, Ġ, etc. with a space - # TODO(lk-chen): Figure out why there are non-ASCII tokens in the response + # Replace non-ASCII tokens, like Ċ, Ġ, etc. with desired replacement + # TODO(lk-chen): This is hacking tokenizer, figure out how to properly + # handle this + logprob_token = logprob_token.replace("Ċ", "\n") logprob_token = re.sub(r"[^\x00-\x7F]", " ", logprob_token) running_str += logprob_token assert running_str == resp["message"]["content"] From 2dc4db447395625d2a388265b462b37ce8bee580 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Wed, 28 May 2025 13:02:36 -0700 Subject: [PATCH 14/15] no qwen, revert common Signed-off-by: Linkun Chen --- release/llm_tests/serve/benchmark/common.py | 17 ++++----------- .../model_config/qwen_3_0dot6B_1replica.yaml | 21 ------------------- ...ve_llama_3dot1_8b_quantized_tp1_1p1d.yaml} | 4 ++-- .../serve/run_llm_serve_test_and_bms.py | 2 +- release/release_tests.yaml | 2 +- 5 files changed, 8 insertions(+), 38 deletions(-) delete mode 100644 release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml rename release/llm_tests/serve/configs/{serve_qwen3_0dot6B_1p1d.yaml => serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml} (52%) diff --git a/release/llm_tests/serve/benchmark/common.py b/release/llm_tests/serve/benchmark/common.py index fb42bc08201..3c1bac146d0 100644 --- a/release/llm_tests/serve/benchmark/common.py +++ b/release/llm_tests/serve/benchmark/common.py @@ -3,7 +3,7 @@ import json import logging import os -from typing import Any, Dict, List +from typing import Dict, List, Any from urllib.parse import urlparse import boto3 @@ -17,22 +17,13 @@ ) -def get_llm_config(serve_config_file: str) -> Dict[str, Any]: +def get_llm_config(serve_config_file: List[Dict]) -> List[Any]: """Get the first llm_config from serve config file.""" with open(serve_config_file, "r") as f: loaded_llm_config = yaml.safe_load(f) - application = loaded_llm_config["applications"][0] - assert ( - "args" in application - ), f"Application must contain an 'args' key, got {application}" - if "llm_configs" in application["args"]: - config = application["args"]["llm_configs"][0] - elif "prefill_config" in application["args"]: - config = application["args"]["prefill_config"] - else: - raise ValueError(f"Unrecognized serve config schema: {application['args']}") - + applications = loaded_llm_config["applications"] + config = applications[0]["args"]["llm_configs"][0] if isinstance(config, dict): return config diff --git a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml deleted file mode 100644 index 86ef4cc8e18..00000000000 --- a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml +++ /dev/null @@ -1,21 +0,0 @@ -model_loading_config: - model_id: Qwen3-0.6B - model_source: - bucket_uri: s3://air-example-data/rayllm-ossci/qwen3-0.6b - -accelerator_type: A10G - -# Test V1 at the same time -runtime_env: - env_vars: - VLLM_USE_V1: "1" - -engine_kwargs: - max_model_len: 8192 - # Need eager mode to suppress https://github.com/vllm-project/vllm/issues/18244 - enforce_eager: True - data_parallel_size: 1 - tensor_parallel_size: 1 - -deployment_config: - num_replicas: 1 diff --git a/release/llm_tests/serve/configs/serve_qwen3_0dot6B_1p1d.yaml b/release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml similarity index 52% rename from release/llm_tests/serve/configs/serve_qwen3_0dot6B_1p1d.yaml rename to release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml index 456fa9b5ac0..20d309a3d2a 100644 --- a/release/llm_tests/serve/configs/serve_qwen3_0dot6B_1p1d.yaml +++ b/release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml @@ -1,7 +1,7 @@ applications: - args: - prefill_config: ./configs/model_config/qwen_3_0dot6B_1replica.yaml - decode_config: ./configs/model_config/qwen_3_0dot6B_1replica.yaml + prefill_config: ./configs/model_config/llama_3dot1_8b_quantized_tp1.yaml + decode_config: ./configs/model_config/llama_3dot1_8b_quantized_tp1.yaml import_path: ray.llm._internal.serve.deployments.prefill_decode_disagg.prefill_decode_disagg:build_app name: llm-endpoint route_prefix: / diff --git a/release/llm_tests/serve/run_llm_serve_test_and_bms.py b/release/llm_tests/serve/run_llm_serve_test_and_bms.py index 425a444b705..22ff789b143 100644 --- a/release/llm_tests/serve/run_llm_serve_test_and_bms.py +++ b/release/llm_tests/serve/run_llm_serve_test_and_bms.py @@ -100,7 +100,6 @@ def main( env_vars = get_hf_token_env_var() if not skip_hf_token else {} vllm_use_v1_env = "1" if vllm_use_v1 else "0" env_vars["VLLM_USE_V1"] = vllm_use_v1_env - llm_config = get_llm_config(serve_config_file) if run_vllm_profiler: @@ -149,6 +148,7 @@ def main( raise RuntimeError(f"Tests failed! {exit_code=}") if run_serve_llm_profiler: + llm_config = get_llm_config(serve_config_file) # For now, the values are hardcoded. results = run_bm( api_url=api_url, diff --git a/release/release_tests.yaml b/release/release_tests.yaml index ebea7087598..fe033324a44 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -4324,7 +4324,7 @@ run: timeout: 3600 long_running: false - script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_qwen3_0dot6B_1p1d.yaml --skip-hf-token true + script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml --skip-hf-token true ############## From 540d793942ba6ff4825e8163b0d75f9777e54bf8 Mon Sep 17 00:00:00 2001 From: Linkun Chen Date: Wed, 28 May 2025 14:51:28 -0700 Subject: [PATCH 15/15] rebase Signed-off-by: Linkun Chen --- docker/ray-llm/Dockerfile | 2 -- release/llm_tests/serve/probes/test_basic.py | 14 +++----------- .../serve/probes/test_exact_correctness.py | 5 ----- release/release_tests.yaml | 2 +- 4 files changed, 4 insertions(+), 19 deletions(-) diff --git a/docker/ray-llm/Dockerfile b/docker/ray-llm/Dockerfile index a3014d69765..eaf155da7ac 100644 --- a/docker/ray-llm/Dockerfile +++ b/docker/ray-llm/Dockerfile @@ -75,8 +75,6 @@ sudo apt-get install -y kmod pkg-config librdmacm-dev ( echo "Installing UCX" - # Needed by UCX - sudo apt-get install -y librdmacm-dev cd "${TEMP_DIR}" wget "https://github.com/openucx/ucx/releases/download/v1.18.0/ucx-1.18.0.tar.gz" -q tar xzf ucx-1.18.0.tar.gz; rm ucx-1.18.0.tar.gz diff --git a/release/llm_tests/serve/probes/test_basic.py b/release/llm_tests/serve/probes/test_basic.py index 7c3aeb70c86..0c35adaeba0 100755 --- a/release/llm_tests/serve/probes/test_basic.py +++ b/release/llm_tests/serve/probes/test_basic.py @@ -2,7 +2,6 @@ import asyncio import base64 import os -import re import time import openai @@ -309,18 +308,11 @@ async def test_logprobs( for resp in response: running_str = "" for logprob in resp["logprobs"]["content"]: - logprob_token = logprob["token"] assert len(logprob["top_logprobs"]) == num_logprobs - assert list(logprob_token.encode()) == logprob["bytes"] + assert list(logprob["token"].encode()) == logprob["bytes"] # Special tokens that will not be a part of the response content - if logprob_token in ("", "<|eot_id|>", "<|im_end|>"): - continue - # Replace non-ASCII tokens, like Ċ, Ġ, etc. with desired replacement - # TODO(lk-chen): This is hacking tokenizer, figure out how to properly - # handle this - logprob_token = logprob_token.replace("Ċ", "\n") - logprob_token = re.sub(r"[^\x00-\x7F]", " ", logprob_token) - running_str += logprob_token + if logprob["token"] not in ("", "<|eot_id|>"): + running_str += logprob["token"] assert running_str == resp["message"]["content"] # top logprobs have to be between 0 and 5 diff --git a/release/llm_tests/serve/probes/test_exact_correctness.py b/release/llm_tests/serve/probes/test_exact_correctness.py index 1eda02374e4..6a9495f932c 100644 --- a/release/llm_tests/serve/probes/test_exact_correctness.py +++ b/release/llm_tests/serve/probes/test_exact_correctness.py @@ -18,15 +18,10 @@ def deterministic_querier(openai_async_client): HELLO_WORLD_RESPONSES_BY_MODEL = { "default": ("Hello world.", "'Hello world.'"), - "Qwen3-0.6B": ("\nOkay, the user wants me",), } COUNTING_PATTERN_RESPONSES_BY_MODEL = { "default": ("Five", "five", "Five.", "five."), - "Qwen3-0.6B": ( - "\nOkay, the user provided the pattern", - "\nOkay, the user provided the", - ), } diff --git a/release/release_tests.yaml b/release/release_tests.yaml index fe033324a44..1bc5e62fc83 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -4304,7 +4304,7 @@ long_running: false script: pytest -vs test_llm_serve_integration.py -- name: llm_serve_qwen_3_8B_fp8_1p1d +- name: llm_serve_llama_3dot1_8B_quantized_tp1_1p1d frequency: nightly python: "3.11" group: llm-serve