Skip to content

Commit fd8fd7f

Browse files
authored
[SGLang][SageMaker][GPU] SGLang 0.5.5 Release (#5450)
* initial commit * update sglang container and entrypoint * add buildspec.yaml * tmp test qwen * revert vllm * fix sm path * fix sglang entrypoint * finalize dockerfile * add toml file * add get job type func * use dict job type * add sglang * fix target name * add tests to buildspec * fix test runner and get framework func * add job type * fix sanity and security tests * revert run new tests * formatting * fix jobtype func and add sglang general integration sagemaker dir * add sglang and vllm to frameworks * add skip general types * fix cuda compat and entrypoint * fix dlc container type * install boto3 * add sglang to types * sgl fix bug * add pytest * add print debug * add conftest * fix conftest * fix fixtures * printing responses * fix endpoint name * remove sm local * revert sglang
1 parent a5b4fb2 commit fd8fd7f

25 files changed

+889
-94
lines changed

dlc_developer_config.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ deep_canary_mode = false
3636

3737
[build]
3838
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
39-
# available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
39+
# available frameworks - ["base", "vllm", "sglang", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
4040
build_frameworks = []
4141

4242

@@ -58,7 +58,7 @@ notify_test_failures = false
5858
[test]
5959
# Set to true to use the new test structure path for frameworks
6060
# Off by default (set to false)
61-
use_new_test_structure = false
61+
use_new_test_structure = false
6262

6363
### On by default
6464
sanity_tests = true
@@ -90,7 +90,7 @@ enable_ipv6 = false
9090
### b. Configure the default security group to allow SSH traffic using IPv4
9191
###
9292
### 3. Create an EFA-enabled security group:
93-
### a. Follow 'Step 1: Prepare an EFA-enabled security group' in:
93+
### a. Follow 'Step 1: Prepare an EFA-enabled security group' in:
9494
### https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security
9595
### b. Configure this security group to also allow SSH traffic via IPv4
9696
ipv6_vpc_name = ""
@@ -185,3 +185,6 @@ dlc-pr-tensorflow-2-eia-inference = ""
185185

186186
# vllm
187187
dlc-pr-vllm = ""
188+
189+
# sglang
190+
dlc-pr-sglang = ""
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/bin/bash
2+
# Check if telemetry file exists before executing
3+
# Execute telemetry script if it exists, suppress errors
4+
bash /usr/local/bin/bash_telemetry.sh >/dev/null 2>&1 || true
5+
6+
if command -v nvidia-smi >/dev/null 2>&1 && command -v nvcc >/dev/null 2>&1; then
7+
bash /usr/local/bin/start_cuda_compat.sh
8+
fi
9+
10+
echo "Starting server"
11+
12+
PREFIX="SM_SGLANG_"
13+
ARG_PREFIX="--"
14+
15+
ARGS=()
16+
17+
while IFS='=' read -r key value; do
18+
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
19+
20+
ARGS+=("${ARG_PREFIX}${arg_name}")
21+
if [ -n "$value" ]; then
22+
ARGS+=("$value")
23+
fi
24+
done < <(env | grep "^${PREFIX}")
25+
26+
# Add default port only if not already set
27+
if ! [[ " ${ARGS[@]} " =~ " --port " ]]; then
28+
ARGS+=(--port "${SM_SGLANG_PORT:-8080}")
29+
fi
30+
31+
# Add default host only if not already set
32+
if ! [[ " ${ARGS[@]} " =~ " --host " ]]; then
33+
ARGS+=(--host "${SM_SGLANG_HOST:-0.0.0.0}")
34+
fi
35+
36+
# Add default model-path only if not already set
37+
if ! [[ " ${ARGS[@]} " =~ " --model-path " ]]; then
38+
ARGS+=(--model-path "${SM_SGLANG_MODEL_PATH:-/opt/ml/model}")
39+
fi
40+
41+
echo "Running command: exec python3 -m sglang.launch_server ${ARGS[@]}"
42+
exec python3 -m sglang.launch_server "${ARGS[@]}"
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
2+
3+
verlte() {
4+
[ "$1" = "$2" ] && return 1 || [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
5+
}
6+
7+
COMPAT_FILE=/usr/local/cuda/compat/libcuda.so.1
8+
if [ -f $COMPAT_FILE ]; then
9+
CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink $COMPAT_FILE | cut -d'.' -f 3-)
10+
echo "CUDA compat package should be installed for NVIDIA driver smaller than ${CUDA_COMPAT_MAX_DRIVER_VERSION}"
11+
NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
12+
if [ -z "$NVIDIA_DRIVER_VERSION" ]; then
13+
NVIDIA_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0 2>/dev/null || true)
14+
fi
15+
echo "Current installed NVIDIA driver version is ${NVIDIA_DRIVER_VERSION}"
16+
if verlte $NVIDIA_DRIVER_VERSION $CUDA_COMPAT_MAX_DRIVER_VERSION; then
17+
echo "Adding CUDA compat to LD_LIBRARY_PATH"
18+
export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
19+
echo $LD_LIBRARY_PATH
20+
else
21+
echo "Skipping CUDA compat setup as newer NVIDIA driver is installed"
22+
fi
23+
else
24+
echo "Skipping CUDA compat setup as package not found"
25+
fi

sglang/buildspec-sm.yml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
2+
prod_account_id: &PROD_ACCOUNT_ID 763104351884
3+
region: &REGION <set-$REGION-in-environment>
4+
framework: &FRAMEWORK sglang
5+
version: &VERSION "0.5.5"
6+
short_version: &SHORT_VERSION "0.5"
7+
arch_type: &ARCH_TYPE x86_64
8+
autopatch_build: "False"
9+
10+
repository_info:
11+
build_repository: &BUILD_REPOSITORY
12+
image_type: &IMAGE_TYPE gpu
13+
root: .
14+
repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK ]
15+
repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
16+
release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK ]
17+
release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ]
18+
19+
context:
20+
build_context: &BUILD_CONTEXT
21+
deep_learning_container:
22+
source: src/deep_learning_container.py
23+
target: deep_learning_container.py
24+
install_efa:
25+
source: scripts/install_efa.sh
26+
target: install_efa.sh
27+
start_cuda_compat:
28+
source: sglang/build_artifacts/start_cuda_compat.sh
29+
target: start_cuda_compat.sh
30+
sagemaker_entrypoint:
31+
source: sglang/build_artifacts/sagemaker_entrypoint.sh
32+
target: sagemaker_entrypoint.sh
33+
34+
images:
35+
sglang_sm:
36+
<<: *BUILD_REPOSITORY
37+
context:
38+
<<: *BUILD_CONTEXT
39+
image_size_baseline: 26000
40+
device_type: &DEVICE_TYPE gpu
41+
cuda_version: &CUDA_VERSION cu129
42+
python_version: &DOCKER_PYTHON_VERSION py3
43+
tag_python_version: &TAG_PYTHON_VERSION py312
44+
os_version: &OS_VERSION ubuntu22.04
45+
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
46+
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
47+
docker_file: !join [ *FRAMEWORK, /, *ARCH_TYPE, /, *DEVICE_TYPE, /Dockerfile ]
48+
target: sglang-sagemaker
49+
build: true
50+
enable_common_stage_build: false
51+
test_configs:
52+
test_platforms:
53+
- sanity
54+
- security
55+
- sagemaker

sglang/buildspec.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
buildspec_pointer: buildspec-sm.yml

sglang/x86_64/gpu/Dockerfile

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
FROM lmsysorg/sglang:v0.5.5-cu129-amd64 AS base
2+
3+
# ====================================================
4+
# ====================== common ======================
5+
# ====================================================
6+
7+
ARG PYTHON="python3"
8+
ARG EFA_VERSION="1.43.3"
9+
10+
LABEL maintainer="Amazon AI"
11+
LABEL dlc_major_version="1"
12+
13+
ENV DEBIAN_FRONTEND=noninteractive \
14+
LANG=C.UTF-8 \
15+
LC_ALL=C.UTF-8 \
16+
DLC_CONTAINER_TYPE=general \
17+
# Python won’t try to write .pyc or .pyo files on the import of source modules
18+
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
19+
PYTHONDONTWRITEBYTECODE=1 \
20+
PYTHONUNBUFFERED=1 \
21+
PYTHONIOENCODING=UTF-8 \
22+
LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
23+
PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"
24+
25+
WORKDIR /
26+
27+
# Copy artifacts
28+
# ===============
29+
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
30+
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
31+
COPY install_efa.sh install_efa.sh
32+
COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
33+
34+
RUN chmod +x /usr/local/bin/deep_learning_container.py \
35+
&& chmod +x /usr/local/bin/bash_telemetry.sh \
36+
&& chmod +x /usr/local/bin/start_cuda_compat.sh
37+
38+
# Install cuda compat
39+
# ====================
40+
# RUN apt-get update \
41+
# && apt-get -y upgrade --only-upgrade systemd \
42+
# && apt-get install -y --allow-change-held-packages --no-install-recommends \
43+
# cuda-compat-12-9 \
44+
# && rm -rf /var/lib/apt/lists/* \
45+
# && apt-get clean
46+
47+
# Install EFA and remove vulnerable nvjpeg
48+
# =========================================
49+
RUN bash install_efa.sh ${EFA_VERSION} \
50+
&& rm install_efa.sh \
51+
&& mkdir -p /tmp/nvjpeg \
52+
&& cd /tmp/nvjpeg \
53+
# latest cu12 libnvjpeg available is cu124
54+
&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
55+
&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \
56+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \
57+
&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \
58+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \
59+
&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \
60+
&& rm -rf /tmp/nvjpeg \
61+
# create symlink for python
62+
&& rm -rf /usr/bin/python \
63+
&& ln -s /usr/bin/python3 /usr/bin/python \
64+
# remove cuobjdump and nvdisasm
65+
&& rm -rf /usr/local/cuda/bin/cuobjdump* \
66+
&& rm -rf /usr/local/cuda/bin/nvdisasm*
67+
68+
# Run OSS compliance script
69+
# ==========================
70+
RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc \
71+
# OSS compliance - use Python zipfile instead of unzip
72+
&& HOME_DIR=/root \
73+
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
74+
&& python3 -c "import zipfile, os; zipfile.ZipFile('/root/oss_compliance.zip').extractall('/root/'); os.remove('/root/oss_compliance.zip')" \
75+
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
76+
&& chmod +x /usr/local/bin/testOSSCompliance \
77+
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
78+
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
79+
# clean up
80+
&& rm -rf ${HOME_DIR}/oss_compliance* \
81+
&& rm -rf /tmp/tmp* \
82+
&& rm -rf /tmp/uv* \
83+
&& rm -rf /var/lib/apt/lists/* \
84+
&& rm -rf /root/.cache | true
85+
86+
# =======================================================
87+
# ====================== sagemaker ======================
88+
# =======================================================
89+
90+
FROM base AS sglang-sagemaker
91+
92+
RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs apt-mark hold \
93+
&& apt-get update \
94+
&& apt-get upgrade -y \
95+
&& apt-get clean
96+
97+
RUN pip install --no-cache-dir -U \
98+
boto3
99+
100+
RUN rm -rf /tmp/*
101+
102+
COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
103+
RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh
104+
105+
ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]

src/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
"autogluon",
2626
"stabilityai_pytorch",
2727
"base",
28+
"vllm",
29+
"sglang",
2830
}
2931
DEVICE_TYPES = {"cpu", "gpu", "hpu", "eia", "inf", "neuron", "neuronx"}
3032
IMAGE_TYPES = {"training", "inference"}

src/deep_learning_container.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def parse_args():
239239
parser = argparse.ArgumentParser()
240240
parser.add_argument(
241241
"--framework",
242-
choices=["tensorflow", "mxnet", "pytorch", "base", "vllm"],
242+
choices=["tensorflow", "mxnet", "pytorch", "base", "vllm", "sglang"],
243243
help="framework of container image.",
244244
required=True,
245245
)

src/image_builder.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,28 +15,25 @@
1515

1616
import concurrent.futures
1717
import datetime
18+
import itertools
1819
import os
1920
import re
2021
import tempfile
21-
2222
from copy import deepcopy
2323

2424
import constants
25-
import utils
26-
import itertools
2725
import patch_helper
28-
29-
from codebuild_environment import get_codebuild_project_name, get_cloned_folder_path
30-
from config import is_build_enabled, is_autopatch_build_enabled
26+
import utils
27+
from buildspec import Buildspec
28+
from codebuild_environment import get_cloned_folder_path, get_codebuild_project_name
29+
from common_stage_image import CommonStageImage
30+
from config import is_autopatch_build_enabled, is_build_enabled
3131
from context import Context
32-
from metrics import Metrics
3332
from image import DockerImage
34-
from common_stage_image import CommonStageImage
35-
from buildspec import Buildspec
33+
from metrics import Metrics
3634
from output import OutputFormatter
3735
from utils import get_dummy_boto_client
3836

39-
4037
FORMATTER = OutputFormatter(constants.PADDING)
4138
build_context = os.getenv("BUILD_CONTEXT")
4239

@@ -241,17 +238,7 @@ def image_builder(buildspec, image_types=[], device_types=[]):
241238
)
242239
# Determine job_type (inference, training, or base) based on the image repository URI.
243240
# This is used to set the job_type label on the container image.
244-
if "training" in image_repo_uri:
245-
label_job_type = "training"
246-
elif "inference" in image_repo_uri:
247-
label_job_type = "inference"
248-
elif "base" in image_repo_uri or "vllm" in image_repo_uri:
249-
label_job_type = "general"
250-
else:
251-
raise RuntimeError(
252-
f"Cannot find inference, training or base job type in {image_repo_uri}. "
253-
f"This is required to set job_type label."
254-
)
241+
label_job_type = get_job_type(image_repo_uri)
255242

256243
bash_template_file = os.path.join(
257244
os.sep, get_cloned_folder_path(), "miscellaneous_scripts", "bash_telemetry.sh"
@@ -690,3 +677,22 @@ def modify_repository_name_for_context(image_repo_uri, build_context):
690677
constants.PR_REPO_PREFIX, constants.NIGHTLY_REPO_PREFIX
691678
)
692679
return "/".join(repo_uri_values)
680+
681+
682+
def get_job_type(image_repo_uri):
683+
job_type_mapping = {
684+
"training": "training",
685+
"inference": "inference",
686+
"base": "general",
687+
"vllm": "general",
688+
"sglang": "general",
689+
}
690+
691+
for key, job_type in job_type_mapping.items():
692+
if key in image_repo_uri:
693+
return job_type
694+
695+
raise RuntimeError(
696+
f"Cannot determine job type from {image_repo_uri}. "
697+
f"Expected one of: {', '.join(job_type_mapping.keys())}"
698+
)

test/dlc_tests/sanity/test_anaconda.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ def test_repo_anaconda_not_present(image):
1515

1616
# First check to see if image has conda installed, if not, skip test since no packages installed from conda present
1717
conda_present = test_utils.run_cmd_on_container(
18-
container_name, ctx, 'find . -name conda -not -path "**/.github/*" -ignore_readdir_race'
18+
container_name,
19+
ctx,
20+
'find . -name conda -not -path "**/.github/*" -not -path "**/.oh-my-zsh/*" -ignore_readdir_race',
1921
).stdout.strip()
2022
if not conda_present:
2123
pytest.skip(f"Image {image} does not have conda installed, skipping test.")

0 commit comments

Comments
 (0)