|
# Base stage: starts from the pinned lmsysorg/sglang v0.5.5-cu129-amd64 image
# and sets the build arguments, labels and environment used by later steps.
FROM lmsysorg/sglang:v0.5.5-cu129-amd64 AS base

# ====================================================
# ====================== common ======================
# ====================================================

# PYTHON      - interpreter name handed to the OSS compliance script.
# EFA_VERSION - version argument handed to install_efa.sh.
ARG PYTHON="python3"
ARG EFA_VERSION="1.43.3"

LABEL maintainer="Amazon AI" \
      dlc_major_version="1"

# Non-interactive apt frontend, UTF-8 locale, and the DLC container type.
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8 \
    DLC_CONTAINER_TYPE=general

# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files on the
#                          import of source modules.
# PYTHONUNBUFFERED:        force stdin, stdout and stderr to be totally
#                          unbuffered. Good for logging.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8

# Prepend the EFA / OpenMPI / OFI-NCCL / CUDA locations to the search paths
# inherited from the base image.
ENV LD_LIBRARY_PATH="/usr/local/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" \
    PATH="/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/cuda/bin:${PATH}"

WORKDIR /
| 26 | + |
# Copy artifacts
# ===============
# The helper scripts are placed in /usr/local/bin; install_efa.sh is copied
# relative to the current WORKDIR.
COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
COPY bash_telemetry.sh /usr/local/bin/bash_telemetry.sh
COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh
COPY install_efa.sh install_efa.sh

# Mark the helpers under /usr/local/bin as executable in a single call.
RUN chmod +x \
        /usr/local/bin/deep_learning_container.py \
        /usr/local/bin/bash_telemetry.sh \
        /usr/local/bin/start_cuda_compat.sh
| 37 | + |
# Install cuda compat (currently disabled; kept commented out for reference)
# ====================
# RUN apt-get update \
# && apt-get -y upgrade --only-upgrade systemd \
# && apt-get install -y --allow-change-held-packages --no-install-recommends \
# cuda-compat-12-9 \
# && rm -rf /var/lib/apt/lists/* \
# && apt-get clean
| 46 | + |
# Install EFA and replace the vulnerable nvjpeg
# ==============================================
# Steps performed in this single RUN:
#   1. run install_efa.sh with ${EFA_VERSION}, then delete the script
#   2. download the libnvjpeg 12.4.0.76 redistributable into /tmp/nvjpeg and
#      use it to replace the libnvjpeg libraries and header under the CUDA
#      target directory; the download directory is removed afterwards
#   3. recreate /usr/bin/python as a symlink to /usr/bin/python3
#   4. delete cuobjdump and nvdisasm from the CUDA bin directory
RUN bash install_efa.sh ${EFA_VERSION} \
 && rm install_efa.sh \
 # latest cu12 libnvjpeg available is cu124
 && NVJPEG_PKG=libnvjpeg-linux-x86_64-12.4.0.76-archive \
 && CUDA_TARGET=/usr/local/cuda/targets/x86_64-linux \
 && mkdir -p /tmp/nvjpeg \
 && cd /tmp/nvjpeg \
 && wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/${NVJPEG_PKG}.tar.xz \
 && tar -xvf ${NVJPEG_PKG}.tar.xz \
 # drop the bundled nvjpeg before copying the replacement in
 && rm -rf ${CUDA_TARGET}/lib/libnvjpeg* ${CUDA_TARGET}/include/nvjpeg.h \
 && cp ${NVJPEG_PKG}/lib/libnvjpeg* ${CUDA_TARGET}/lib/ \
 && cp ${NVJPEG_PKG}/include/* ${CUDA_TARGET}/include/ \
 && rm -rf /tmp/nvjpeg \
 # create symlink for python
 && rm -rf /usr/bin/python \
 && ln -s /usr/bin/python3 /usr/bin/python \
 # remove cuobjdump and nvdisasm
 && rm -rf /usr/local/cuda/bin/cuobjdump* /usr/local/cuda/bin/nvdisasm*
| 67 | + |
# Run OSS compliance script
# ==========================
# 1. Append a line to /etc/bash.bashrc that sources bash_telemetry.sh.
# 2. Download oss_compliance.zip into ${HOME_DIR}, unpack it with Python's
#    zipfile module, install testOSSCompliance into /usr/local/bin and run
#    generate_oss_compliance.sh with ${HOME_DIR} and ${PYTHON}.
# 3. Remove the unpacked bundle and assorted temp/cache directories in the
#    same layer.
RUN echo 'source /usr/local/bin/bash_telemetry.sh' >> /etc/bash.bashrc \
 # OSS compliance - use Python zipfile instead of unzip
 && HOME_DIR=/root \
 # --fail makes an HTTP error abort the build here instead of saving the
 # error body as oss_compliance.zip
 && curl -fsSL -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 # paths are derived from HOME_DIR so they cannot drift from the lines below
 && python3 -c "import zipfile, os; zipfile.ZipFile('${HOME_DIR}/oss_compliance.zip').extractall('${HOME_DIR}/'); os.remove('${HOME_DIR}/oss_compliance.zip')" \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 # clean up
 && rm -rf ${HOME_DIR}/oss_compliance* \
 && rm -rf /tmp/tmp* \
 && rm -rf /tmp/uv* \
 && rm -rf /var/lib/apt/lists/* \
 # `rm -rf` already succeeds when the path is absent, so no `| true` /
 # `|| true` guard is needed (a trailing `|| true` would also hide failures
 # of every earlier command in this chain)
 && rm -rf /root/.cache
| 85 | + |
# =======================================================
# ====================== sagemaker ======================
# =======================================================

FROM base AS sglang-sagemaker

# Upgrade OS packages while holding the GPU stack at its installed versions.
# Every installed package whose `dpkg -l` line matches cuda|nvidia|libnv is put
# on hold before `apt-get upgrade` runs.
#   - `xargs -r` skips `apt-mark hold` entirely when nothing matches, instead
#     of invoking it with no package arguments.
#   - The apt package lists are deleted in the same layer so they are not
#     committed to the image.
RUN dpkg -l | grep -E "cuda|nvidia|libnv" | awk '{print $2}' | xargs -r apt-mark hold \
 && apt-get update \
 && apt-get upgrade -y \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
| 96 | + |
# Install (or upgrade to) the latest boto3 and empty /tmp in the same layer,
# so that anything pip leaves under /tmp is never committed to the image.
# A separate `RUN rm -rf /tmp/*` would only hide those files in a later layer.
# NOTE(review): boto3 is intentionally left unpinned here as in the original;
# pin a version if reproducible builds are required.
RUN pip install --no-cache-dir -U \
        boto3 \
 && rm -rf /tmp/*
| 101 | + |
# SageMaker entrypoint
# =====================
# Copy the entrypoint script into /usr/local/bin, make it executable, and
# register it (exec form) as the container's ENTRYPOINT.
COPY sagemaker_entrypoint.sh /usr/local/bin/sagemaker_entrypoint.sh
RUN chmod +x /usr/local/bin/sagemaker_entrypoint.sh

ENTRYPOINT ["/usr/local/bin/sagemaker_entrypoint.sh"]
0 commit comments