@@ -213,12 +213,12 @@ def __init__(

         # We need to overwrite the engine_id to make it unique across replicas.
         # "engine_id" is added in vllm 0.9.0, so do existance check.
-        if "engine_id" in kv_transfer_config.model_fields:
-            engine_id = getattr(kv_transfer_config, "engine_id", uuid.uuid4())
-            host = vllm.envs.NIXL_SIDE_CHANNEL_HOST
-            port = vllm.envs.NIXL_SIDE_CHANNEL_PORT
-            kv_transfer_config.engine_id = "-".join([engine_id, host, port])
-        else:
+        try:
+            engine_id = getattr(kv_transfer_config, "engine_id", str(uuid.uuid4()))
+            host = getattr(vllm.envs, "VLLM_NIXL_SIDE_CHANNEL_HOST", "localhost")
+            port = getattr(vllm.envs, "VLLM_NIXL_SIDE_CHANNEL_PORT", 5557)
+            kv_transfer_config.engine_id = "-".join([engine_id, host, str(port)])
+        except ValueError:
             # TODO(lk-chen): Raise error once vllm 0.9.0 is pinned to rayllm
             logger.warning(
                 "engine_id is not supported in vllm < 0.9.0, NIXL-backed kv transfer "

This file was deleted.

@@ -1,7 +1,7 @@
 applications:
 - args:
-    prefill_config: ./configs/model_config/qwen_3_0dot6B_1replica.yaml
-    decode_config: ./configs/model_config/qwen_3_0dot6B_1replica.yaml
+    prefill_config: ./configs/model_config/llama_3dot1_8b_quantized_tp1.yaml
+    decode_config: ./configs/model_config/llama_3dot1_8b_quantized_tp1.yaml
   import_path: ray.llm._internal.serve.deployments.prefill_decode_disagg.prefill_decode_disagg:build_app
   name: llm-endpoint
   route_prefix: /
2 changes: 1 addition & 1 deletion release/llm_tests/serve/probes/test_exact_correctness.py
@@ -21,7 +21,7 @@ def deterministic_querier(openai_async_client):
 }
 
 COUNTING_PATTERN_RESPONSES_BY_MODEL = {
-    "default": ["Five", "five", "Five.", "five."],
+    "default": ("Five", "five", "Five.", "five."),
 }


2 changes: 1 addition & 1 deletion release/llm_tests/serve/run_llm_serve_test_and_bms.py
@@ -100,7 +100,6 @@ def main(
     env_vars = get_hf_token_env_var() if not skip_hf_token else {}
     vllm_use_v1_env = "1" if vllm_use_v1 else "0"
     env_vars["VLLM_USE_V1"] = vllm_use_v1_env
-    llm_config = get_llm_config(serve_config_file)
 
     if run_vllm_profiler:

@@ -149,6 +148,7 @@ def main(
         raise RuntimeError(f"Tests failed! {exit_code=}")
 
     if run_serve_llm_profiler:
+        llm_config = get_llm_config(serve_config_file)
         # For now, the values are hardcoded.
         results = run_bm(
             api_url=api_url,
10 changes: 10 additions & 0 deletions release/ray_release/byod/byod_llm_pd_disagg_test.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# This script is used to build an extra layer on top of the base llm image
+# to install vllm at specific version that includes necessary changes for
+# PD-disaggregated serving.
+
+set -exo pipefail
+
+# https://github.com/vllm-project/vllm/pull/17751 (Nixl Integration. May 12)
+pip3 install --no-cache-dir \
+    "vllm@https://wheels.vllm.ai/d19110204c03e9b77ed957fc70c1262ff370f5e2/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
22 changes: 22 additions & 0 deletions release/release_tests.yaml
@@ -4304,6 +4304,28 @@
     long_running: false
     script: pytest -vs test_llm_serve_integration.py
 
+- name: llm_serve_llama_3dot1_8B_quantized_tp1_1p1d
+  frequency: nightly
+  python: "3.11"
+  group: llm-serve
+  team: llm
+  working_dir: llm_tests/serve
+
+  cluster:
+    byod:
+      type: llm-cu124
+      # TODO(lk-chen): remove once we bump vllm to 0.9.0
+      post_build_script: byod_llm_pd_disagg_test.sh
+    cluster_compute: llm_auto_select_worker.yaml
+    # NOTE: Important for getting the correct secrets
+    cloud_id: cld_wy5a6nhazplvu32526ams61d98
+    project_id: prj_lhlrf1u5yv8qz9qg3xzw8fkiiq
+
+  run:
+    timeout: 3600
+    long_running: false
+    script: python run_llm_serve_test_and_bms.py --serve-config-file configs/serve_llama_3dot1_8b_quantized_tp1_1p1d.yaml --skip-hf-token true


##############
# LLM Batch