199 changes: 149 additions & 50 deletions .github/workflows/pr-vllm.yml
@@ -38,6 +38,82 @@ jobs:
vllm-rayserve-ec2:
- "docker/vllm/Dockerfile.rayserve"

# test upstream image
vllm-upstream-sagemaker_standards-test:
needs: [check-changes]
runs-on:
- codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }}
fleet:x86-g6xl-runner
steps:
- name: Checkout DLC source
uses: actions/checkout@v5

- name: Pull image
run: |
docker pull docker.io/vllm/vllm-openai:v0.11.2

- name: Checkout vLLM Tests
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v0.11.2
path: vllm_source

- name: Start container
run: |
CONTAINER_ID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-v ${HOME}/.cache/vllm:/root/.cache/vllm \
-v ./vllm_source:/workdir --workdir /workdir \
-e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} \
docker.io/vllm/vllm-openai:v0.11.2)
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV

- name: Setup for vLLM Test
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --no-upgrade --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --no-upgrade --system pytest pytest-asyncio
uv pip install --no-upgrade --system -e tests/vllm_test_utils
uv pip install --no-upgrade --system hf_transfer
uv pip install --system model-hosting-container-standards==0.1.9
mkdir src
mv vllm src/vllm
'

- name: Run vLLM Tests
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
nvidia-smi

pip list | grep model-hosting-container-standards

# Test LoRA adapter loading/unloading via SageMaker endpoints
pytest tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py -v

# Test stateful session management
pytest tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py -v

# Test sagemaker custom middleware
pytest tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py -v

# Test sagemaker endpoint overrides
pytest tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py -v

# Test LoRA adapter loading/unloading via original OpenAI API server endpoints
pytest tests/entrypoints/openai/test_lora_adapters.py -v
'

- name: Cleanup container and images
if: always()
run: |
docker rm -f ${CONTAINER_ID} || true
docker image prune -a --force --filter "until=24h"
docker system df


# vLLM jobs
build-vllm-image:
needs: [check-changes]
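Note on the hunk above: the new vllm-upstream-sagemaker_standards-test job smoke-tests the unmodified upstream vllm/vllm-openai:v0.11.2 image, so failures in the SageMaker-standards suites can be attributed to upstream before any DLC layers enter the picture. The `--no-upgrade` flags added to the `uv pip install` lines make uv's default behavior explicit: packages the image already ships pinned (torch, vllm) are left alone while the test-only requirements are installed. A minimal sketch of that intent, assuming uv is present in the container as it is in the upstream image:

    # Install test-only deps without letting resolution bump the image's
    # pinned packages (torch, vllm, etc.).
    uv pip install --no-upgrade --system pytest pytest-asyncio

    # Sanity-check that the image's vllm survived dependency resolution.
    uv pip freeze | grep '^vllm=='
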
@@ -61,7 +137,7 @@ jobs:
- name: Resolve image URI for build
id: image-uri-build
run: |
IMAGE_URI=${{ vars.AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-0.11.0-gpu-py312-cu128-ubuntu22.04-ec2-pr-${{ github.event.pull_request.number }}
IMAGE_URI=${{ vars.AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_REGION }}.amazonaws.com/ci:vllm-0.11.2-gpu-py312-cu128-ubuntu22.04-ec2-pr-${{ github.event.pull_request.number }}
echo "Image URI to build: ${IMAGE_URI}"
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_ENV}
echo "IMAGE_URI=${IMAGE_URI}" >> ${GITHUB_OUTPUT}
@@ -102,7 +178,7 @@ jobs:
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v0.11.0
ref: v0.11.2
path: vllm_source

- name: Start container
@@ -119,10 +195,11 @@
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
uv pip install --no-upgrade --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --no-upgrade --system pytest pytest-asyncio
uv pip install --no-upgrade --system -e tests/vllm_test_utils
uv pip install --no-upgrade --system hf_transfer
uv pip install --system model-hosting-container-standards==0.1.9
mkdir src
mv vllm src/vllm
'
@@ -145,7 +222,7 @@
with:
container_id: ${{ env.CONTAINER_ID }}

vllm-cuda-test:
vllm-cuda-and-example-test:
needs: [build-vllm-image]
if: needs.build-vllm-image.result == 'success'
runs-on:
@@ -166,7 +243,7 @@
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v0.11.0
ref: v0.11.2
path: vllm_source

- name: Start container
@@ -183,10 +260,11 @@
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
uv pip install --no-upgrade --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --no-upgrade --system pytest pytest-asyncio
uv pip install --no-upgrade --system -e tests/vllm_test_utils
uv pip install --no-upgrade --system hf_transfer
uv pip install --system model-hosting-container-standards==0.1.9
mkdir src
mv vllm src/vllm
'
@@ -200,6 +278,25 @@
# Platform Tests (CUDA) # 4min
cd /workdir/tests
pytest -v -s cuda/test_cuda_context.py

# Examples Test # 30min
cd /workdir/examples
pip install tensorizer # for tensorizer test
SAGEMAKER_CONTAINER_LOG_LEVEL=20 python3 offline_inference/basic/generate.py --model facebook/opt-125m
# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
SAGEMAKER_CONTAINER_LOG_LEVEL=INFO python3 offline_inference/basic/chat.py
python3 offline_inference/prefix_caching.py
python3 offline_inference/llm_engine_example.py
python3 offline_inference/audio_language.py --seed 0
python3 offline_inference/vision_language.py --seed 0
python3 offline_inference/vision_language_pooling.py --seed 0
python3 offline_inference/vision_language_multi_image.py --seed 0
VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
python3 offline_inference/basic/classify.py
python3 offline_inference/basic/embed.py
python3 offline_inference/basic/score.py
python3 offline_inference/simple_profiling.py
'

- name: Cleanup container and images
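Note on the examples step above: it exercises SAGEMAKER_CONTAINER_LOG_LEVEL in both of its spellings, numeric (20) on generate.py and named (INFO) on chat.py. In Python's stdlib logging these denote the same level, which is quick to confirm anywhere python3 is available:

    # 20 and "INFO" name the same level in Python's logging module.
    python3 - <<'PY'
    import logging
    assert logging.INFO == 20
    assert logging.getLevelName(20) == "INFO"
    print("INFO ==", logging.INFO)
    PY
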
@@ -208,7 +305,7 @@
with:
container_id: ${{ env.CONTAINER_ID }}

vllm-example-test:
vllm-sagemaker_standards-test:
needs: [build-vllm-image]
if: needs.build-vllm-image.result == 'success'
runs-on:
@@ -229,7 +326,7 @@
uses: actions/checkout@v5
with:
repository: vllm-project/vllm
ref: v0.11.0
ref: v0.11.2
path: vllm_source

- name: Start container
@@ -246,10 +343,11 @@
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
uv pip install --no-upgrade --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --no-upgrade --system pytest pytest-asyncio
uv pip install --no-upgrade --system -e tests/vllm_test_utils
uv pip install --no-upgrade --system hf_transfer
uv pip install --system model-hosting-container-standards==0.1.9
mkdir src
mv vllm src/vllm
'
@@ -260,24 +358,22 @@
set -eux
nvidia-smi

# Examples Test # 30min
cd /workdir/examples
pip install tensorizer # for tensorizer test
python3 offline_inference/basic/generate.py --model facebook/opt-125m
# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
python3 offline_inference/basic/chat.py
python3 offline_inference/prefix_caching.py
python3 offline_inference/llm_engine_example.py
python3 offline_inference/audio_language.py --seed 0
python3 offline_inference/vision_language.py --seed 0
python3 offline_inference/vision_language_pooling.py --seed 0
python3 offline_inference/vision_language_multi_image.py --seed 0
VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
python3 offline_inference/basic/classify.py
python3 offline_inference/basic/embed.py
python3 offline_inference/basic/score.py
python3 offline_inference/simple_profiling.py
pip list | grep model-hosting-container-standards

# Test LoRA adapter loading/unloading via SageMaker endpoints
pytest tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py -v

# Test stateful session management
pytest tests/entrypoints/sagemaker/test_sagemaker_stateful_sessions.py -v

# Test sagemaker custom middleware
pytest tests/entrypoints/sagemaker/test_sagemaker_middleware_integration.py -v

# Test sagemaker endpoint overrides
pytest tests/entrypoints/sagemaker/test_sagemaker_handler_overrides.py -v

# Test LoRA adapter loading/unloading via original OpenAI API server endpoints
pytest tests/entrypoints/openai/test_lora_adapters.py -v
'

- name: Cleanup container and images
@@ -364,10 +460,11 @@
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
uv pip install --no-upgrade --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --no-upgrade --system pytest pytest-asyncio
uv pip install --no-upgrade --system -e tests/vllm_test_utils
uv pip install --no-upgrade --system hf_transfer
uv pip install --system model-hosting-container-standards==0.1.9
mkdir src
mv vllm src/vllm
'
@@ -428,10 +525,11 @@
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
uv pip install --no-upgrade --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --no-upgrade --system pytest pytest-asyncio
uv pip install --no-upgrade --system -e tests/vllm_test_utils
uv pip install --no-upgrade --system hf_transfer
uv pip install --system model-hosting-container-standards==0.1.9
mkdir src
mv vllm src/vllm
'
@@ -491,10 +589,11 @@
run: |
docker exec ${CONTAINER_ID} sh -c '
set -eux
uv pip install --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --system pytest pytest-asyncio
uv pip install --system -e tests/vllm_test_utils
uv pip install --system hf_transfer
uv pip install --no-upgrade --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
uv pip install --no-upgrade --system pytest pytest-asyncio
uv pip install --no-upgrade --system -e tests/vllm_test_utils
uv pip install --no-upgrade --system hf_transfer
uv pip install --system model-hosting-container-standards==0.1.9
mkdir src
mv vllm src/vllm
'
@@ -508,9 +607,9 @@
# Examples Test # 30min
cd /workdir/examples
pip install tensorizer # for tensorizer test
python3 offline_inference/basic/generate.py --model facebook/opt-125m
SAGEMAKER_CONTAINER_LOG_LEVEL=20 python3 offline_inference/basic/generate.py --model facebook/opt-125m
# python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
python3 offline_inference/basic/chat.py
SAGEMAKER_CONTAINER_LOG_LEVEL=INFO python3 offline_inference/basic/chat.py
python3 offline_inference/prefix_caching.py
python3 offline_inference/llm_engine_example.py
python3 offline_inference/audio_language.py --seed 0
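Note: the five SageMaker-standards pytest suites now run both against the upstream image (first job above) and against the built DLC image (vllm-sagemaker_standards-test). To iterate on a single suite outside CI, a sketch along these lines should work; IMAGE, HF_TOKEN, and the ./vllm_source checkout (vllm-project/vllm at v0.11.2) are assumptions mirroring the workflow:

    # Hypothetical local repro of one suite; swap IMAGE for a locally built tag.
    IMAGE=docker.io/vllm/vllm-openai:v0.11.2
    CID=$(docker run -d -it --rm --gpus=all --entrypoint /bin/bash \
      -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
      -v ./vllm_source:/workdir --workdir /workdir \
      -e HUGGING_FACE_HUB_TOKEN="$HF_TOKEN" "$IMAGE")
    docker exec "$CID" sh -c '
      set -eux
      uv pip install --no-upgrade --system -r requirements/common.txt -r requirements/dev.txt --torch-backend=auto
      uv pip install --no-upgrade --system pytest pytest-asyncio -e tests/vllm_test_utils
      uv pip install --system model-hosting-container-standards==0.1.9
      mkdir src && mv vllm src/vllm   # import the installed vllm, not the source tree
      pytest tests/entrypoints/sagemaker/test_sagemaker_lora_adapters.py -v
    '
    docker rm -f "$CID"
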
2 changes: 1 addition & 1 deletion README.md
@@ -1 +1 @@
TO be updated later
TBD
10 changes: 7 additions & 3 deletions docker/vllm/Dockerfile
@@ -1,8 +1,8 @@
FROM docker.io/vllm/vllm-openai:v0.11.0 as base
ARG PYTHON="python3"
FROM docker.io/vllm/vllm-openai:v0.11.2 as base
LABEL maintainer="Amazon AI"
ARG EFA_VERSION="1.43.3"
LABEL dlc_major_version="1"
ARG PYTHON="python3"
ARG EFA_VERSION="1.43.3"
ENV DEBIAN_FRONTEND=noninteractive \
LANG=C.UTF-8 \
LC_ALL=C.UTF-8 \
@@ -59,6 +59,10 @@ RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark ho
&& apt-get upgrade -y \
&& apt-get clean

RUN uv pip install --system model-hosting-container-standards==0.1.9 \
&& uv pip freeze | grep model-hosting-container-standards | grep "0.1.9" \
&& uv cache clean

COPY ./scripts/vllm/dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh
RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh

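Note on the Dockerfile hunk above: the added RUN instruction doubles as a build-time guard; the grep chain fails the docker build if resolution ever drifts from the 0.1.9 pin, and `uv cache clean` keeps the layer lean. The pattern generalizes to any pinned dependency; a sketch with a hypothetical package name (this would be the body of a RUN instruction):

    # Pin, verify, and clean in one layer; the build aborts if the installed
    # version does not match the pin. "some-package" is a placeholder.
    uv pip install --system some-package==1.2.3 \
        && uv pip freeze | grep '^some-package==1.2.3$' \
        && uv cache clean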