diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
index 57a6915deecb..fa654e823abb 100644
--- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
+++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
@@ -213,12 +213,12 @@ def __init__(
             # We need to overwrite the engine_id to make it unique across replicas.
             # "engine_id" is added in vllm 0.9.0, so do existance check.
-            if "engine_id" in kv_transfer_config.model_fields:
-                engine_id = getattr(kv_transfer_config, "engine_id", uuid.uuid4())
-                host = vllm.envs.NIXL_SIDE_CHANNEL_HOST
-                port = vllm.envs.NIXL_SIDE_CHANNEL_PORT
-                kv_transfer_config.engine_id = "-".join([engine_id, host, port])
-            else:
+            try:
+                engine_id = getattr(kv_transfer_config, "engine_id", str(uuid.uuid4()))
+                host = getattr(vllm.envs, "VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost")
+                port = getattr(vllm.envs, "VLLM_NIXL_SIDE_CHANNEL_PORT", 5557)
+                kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)])
+            except ValueError:
                 # TODO(lk-chen): Raise error once vllm 0.9.0 is pinned to rayllm
                 logger.warning(
                     "engine_id is not supported in vllm < 0.9.0, NIXL-backed kv transfer "
diff --git a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml b/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml
deleted file mode 100644
index 2474aa83963f..000000000000
--- a/release/llm_tests/serve/configs/model_config/qwen_3_0dot6B_1replica.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-model_loading_config:
-  model_id: Qwen/Qwen3-0.6B
-  model_source: Qwen/Qwen3-0.6B
-
-accelerator_type: L40S
-
-# Test V1 at the same time
-runtime_env:
-  env_vars:
-    VLLM_USE_V1: "1"
-
-engine_kwargs:
-  # Need eager mode to suppress https://github.com/vllm-project/vllm/issues/18244
-  enforce_eager: True
-  data_parallel_size: 1
-  tensor_parallel_size: 1
-
-deployment_config:
-  num_replicas: 1
diff --git a/release/llm_tests/serve/configs/serve_qwen3_0dot6B_1p1d.yaml b/release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml
similarity index 52%
rename from release/llm_tests/serve/configs/serve_qwen3_0dot6B_1p1d.yaml
rename to release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml
index 456fa9b5ac09..20d309a3d2a0 100644
--- a/release/llm_tests/serve/configs/serve_qwen3_0dot6B_1p1d.yaml
+++ b/release/llm_tests/serve/configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml
@@ -1,7 +1,7 @@
 applications:
 - args:
-    prefill_config: ./configs/model_config/qwen_3_0dot6B_1replica.yaml
-    decode_config: ./configs/model_config/qwen_3_0dot6B_1replica.yaml
+    prefill_config: ./configs/model_config/llama_3dot1_8b_quantized_tp1.yaml
+    decode_config: ./configs/model_config/llama_3dot1_8b_quantized_tp1.yaml
   import_path: ray.llm._internal.serve.deployments.prefill_decode_disagg.prefill_decode_disagg:build_app
   name: llm-endpoint
   route_prefix: /
diff --git a/release/llm_tests/serve/probes/test_exact_correctness.py b/release/llm_tests/serve/probes/test_exact_correctness.py
index 82552ab6c9d3..6a9495f932cd 100644
--- a/release/llm_tests/serve/probes/test_exact_correctness.py
+++ b/release/llm_tests/serve/probes/test_exact_correctness.py
@@ -21,7 +21,7 @@ def deterministic_querier(openai_async_client):
 }
 
 COUNTING_PATTERN_RESPONSES_BY_MODEL = {
-    "default": ["Five", "five", "Five.", "five."],
+    "default": ("Five", "five", "Five.", "five."),
 }
 
 
diff --git a/release/llm_tests/serve/run_llm_serve_test_and_bms.py b/release/llm_tests/serve/run_llm_serve_test_and_bms.py
index 425a444b7054..22ff789b1439 100644
--- a/release/llm_tests/serve/run_llm_serve_test_and_bms.py
+++ b/release/llm_tests/serve/run_llm_serve_test_and_bms.py
@@ -100,7 +100,6 @@ def main(
     env_vars = get_hf_token_env_var() if not skip_hf_token else {}
     vllm_use_v1_env = "1" if vllm_use_v1 else "0"
     env_vars["VLLM_USE_V1"] = vllm_use_v1_env
-    llm_config = get_llm_config(serve_config_file)
 
     if run_vllm_profiler:
@@ -149,6 +148,7 @@ def main(
         raise RuntimeError(f"Tests failed! {exit_code=}")
 
     if run_serve_llm_profiler:
+        llm_config = get_llm_config(serve_config_file)
         # For now, the values are hardcoded.
         results = run_bm(
             api_url=api_url,
diff --git a/release/ray_release/byod/byod_llm_pd_disagg_test.sh b/release/ray_release/byod/byod_llm_pd_disagg_test.sh
new file mode 100755
index 000000000000..708fafdf7dd5
--- /dev/null
+++ b/release/ray_release/byod/byod_llm_pd_disagg_test.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# This script is used to build an extra layer on top of the base llm image
+# to install vllm at specific version that includes necessary changes for
+# PD-disaggregated serving.
+
+set -exo pipefail
+
+# https://github.com/vllm-project/vllm/pull/17751 (Nixl Integration. May 12)
+pip3 install --no-cache-dir \
+    "vllm@https://wheels.vllm.ai/d19110204c03e9b77ed957fc70c1262ff370f5e2/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 8296c3c3117d..1bc5e62fc834 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -4304,6 +4304,28 @@
     long_running: false
     script: pytest -vs test_llm_serve_integration.py
 
+- name: llm_serve_llama_3dot1_8B_quantized_tp1_1p1d
+  frequency: nightly
+  python: "3.11"
+  group: llm-serve
+  team: llm
+  working_dir: llm_tests/serve
+
+  cluster:
+    byod:
+      type: llm-cu124
+      # TODO(lk-chen): remove once we bump vllm to 0.9.0
+      post_build_script: byod_llm_pd_disagg_test.sh
+    cluster_compute: llm_auto_select_worker.yaml
+    # NOTE: Important for getting the correct secrets
+    cloud_id: cld_wy5a6nhazplvu32526ams61d98
+    project_id: prj_lhlrf1u5yv8qz9qg3xzw8fkiiq
+
+  run:
+    timeout: 3600
+    long_running: false
+    script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml --skip-hf-token true
+
 
 ##############
 # LLM Batch
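
For reference, the vllm_engine.py hunk above makes each replica's NIXL engine_id unique by joining the configured engine id with the side-channel host and port, falling back to "localhost"/5557 when those settings are absent. A minimal standalone sketch of that composition follows; the helper name below is hypothetical and not part of the patch:

    import uuid

    def make_replica_engine_id(base_id=None, host="localhost", port=5557):
        # Hypothetical helper (not in the patch): join the engine id with the
        # NIXL side-channel host/port so every replica ends up with a unique id.
        base_id = base_id or str(uuid.uuid4())
        return "-".join([base_id, host, str(port)])

    # Example output: '4f1c2a9e-...-localhost-5557'
    print(make_replica_engine_id())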