332 commits
61ebe99
[invoke_subgraph] Do not cache fake tensors for AOTDispatcher first p…
anijain2305 Apr 1, 2025
f09513e
[CUDA]][SymmetricMemory] Interpret empty string as `std::nullopt` in …
eqy Apr 2, 2025
5734909
[Profiler] Fix Empty C Call Queue (#150370)
sraikund16 Apr 2, 2025
063ea5d
[AOTInductor] Modify test for Memory tracking for memory-related (#15…
muchulee8 Apr 1, 2025
25eff6e
[dynamo] add reason field to torch.compiler.disable (#150341)
williamwen42 Apr 1, 2025
3ac5a49
[dynamo] add dynamo disable reasons to codebase (#150440)
williamwen42 Apr 1, 2025
dee016c
[MPSInductor] Add `store_reduce` method (#150457)
malfet Apr 1, 2025
c65de03
Add `Any` return annotation to `__getattr__` methods that return a un…
rchen152 Apr 2, 2025
0da8127
Compare device name of profiler dynamically (#150396)
elpis-furiosa Apr 2, 2025
3f54b14
[CUDAGraph] support meta tensor (#150478)
BoyuanFeng Apr 2, 2025
75f38df
cpp_wrapper: precompile a few more commonly used headers, and improve…
benjaminglass1 Apr 2, 2025
0313873
[AOTI] Emit Triton kernels as comment (#150188)
desertfire Mar 30, 2025
c41fbb4
Change arg_kwarg_vals propagation strategy (#148046)
zou3519 Apr 2, 2025
c69c3c8
Add needs_exact_strides operator tag for Inductor to force exact stri…
zou3519 Apr 2, 2025
4d121d2
Implement needs_exact_strides for mutable custom operators (#148091)
zou3519 Apr 2, 2025
aae3692
Rename node.meta["arg_kwarg_vals"] to node.meta["eager_input_vals"] (…
zou3519 Apr 2, 2025
5f62d07
Fix log2, PowByNatural printing (#147592)
isuruf Mar 3, 2025
82ceebc
[inductor] Lowerings for max_pool3d (#148210)
isuruf Apr 1, 2025
42c7c7f
[invoke_subgraph] Filter out grad_out where fw_out requires_grad is F…
anijain2305 Apr 2, 2025
8102272
[BE] Fix triton windows build (#150512)
chuanqi129 Apr 2, 2025
f38566d
[MPSInductor] Disable mm/bmm decompositions (#150541)
manuelcandales Apr 2, 2025
532530b
Revert "[Profiler] Fix Empty C Call Queue (#150370)"
pytorchmergebot Apr 2, 2025
98453c1
[dynamo] Support Tensor subclass that has dynamic attributes or calls…
StrongerXi Apr 2, 2025
203e1d6
[dynamo] Support `torch.Tensor._make_subclass` and tracing through te…
StrongerXi Apr 2, 2025
7e53c58
[dynamo] Support tensor subclass with overriden tensor methods and pr…
StrongerXi Apr 2, 2025
238109a
[dynamo] Always trace into tensor subclass `__torch_function__` (#149…
StrongerXi Apr 2, 2025
e62d958
[Inductor] Reland Merge Triton ScaledMM as epilogue to MM template #1…
PaulZhang12 Apr 2, 2025
cb4cd61
Address Cmake update issue in windows magma builds (#150549)
atalman Apr 2, 2025
d4298f2
[CI] Use system nccl in build (#150226)
clee2000 Apr 2, 2025
22030ef
expect fail scan test in sigmoid (#150475)
ydwu4 Apr 2, 2025
b03c421
Proactively remove CompiledTritonKernels before loading from cache/st…
jamesjwu Apr 1, 2025
af5c1b9
ci: Set minimum cmake version for halide build (#150560)
seemethere Apr 2, 2025
e545567
Revert "[dynamo] Always trace into tensor subclass `__torch_function_…
pytorchmergebot Apr 2, 2025
01411c7
Revert "[dynamo] Support tensor subclass with overriden tensor method…
pytorchmergebot Apr 2, 2025
18908c8
Revert "[dynamo] Support `torch.Tensor._make_subclass` and tracing th…
pytorchmergebot Apr 2, 2025
03c879d
Revert "[dynamo] Support Tensor subclass that has dynamic attributes …
pytorchmergebot Apr 2, 2025
a8f6b40
[inductor] skip non-trivial tiling if unbacked symints are present (#…
ColinPeppler Apr 1, 2025
85df0dc
[dynamo] emit only 1 graph break message on unrecoverable data-depend…
williamwen42 Apr 1, 2025
33535b3
[dynamo] Support Tensor subclass that has dynamic attributes or calls…
StrongerXi Apr 2, 2025
0d4dbfd
[dynamo] Support `torch.Tensor._make_subclass` and tracing through te…
StrongerXi Apr 2, 2025
3463ea1
[dynamo] Support tensor subclass with overriden tensor methods and pr…
StrongerXi Apr 2, 2025
bb98749
[dynamo] Always trace into tensor subclass `__torch_function__` (#149…
StrongerXi Apr 2, 2025
1017927
multidimensional slicing (#150104)
avikchaudhuri Apr 2, 2025
74aa9f5
ci: Use cache / progress when local docker build (#150551)
seemethere Apr 2, 2025
a677b49
[Profiler] Fix Empty C Call Queue (#150370)
sraikund16 Apr 2, 2025
0bacb90
[invoke_subgraph][min-cut partitioner] Fix bug to use the correct roo…
anijain2305 Apr 2, 2025
8667a00
Add stride + dtype to autotune results (#150419)
PaulZhang12 Apr 1, 2025
0198e44
Update torch-xpu-ops commit pin to 98c808d (#150554)
chuanqi129 Apr 2, 2025
de15ef0
[invoke_subgraph] Force grad_outs to be contiguous at tracing time (#…
anijain2305 Apr 2, 2025
61a1f09
Revert "[cuda] Add new faster gammabeta backward kernel (#148605)"
pytorchmergebot Apr 2, 2025
24f5065
fix bug in logging code (#150518)
exclamaforte Apr 2, 2025
f363fe6
[AOTInductor] Fix autotuning code's codegen (#150522)
muchulee8 Apr 3, 2025
13f4819
Add Chillee as core reviewer (#150579)
zou3519 Apr 2, 2025
77dca39
[aoti] make a check function for each input (#150553)
yushangdi Apr 3, 2025
2e5d95a
[FlexAttention] Remove dead code (#150575)
drisspg Apr 2, 2025
90ddb33
[export] specialize for aten.to (#149235)
pianpwk Apr 3, 2025
fc674b4
[c10d] Add logging for desync debug report (#150513)
fduwjj Apr 3, 2025
c067127
Ensure cuda_dlink_post_cflags are quoted as well (#150151)
saagarjha Apr 3, 2025
9e10601
[XPU] Add an implict conversion from XPUStream to sycl::queue* (#148646)
zhiweij1 Apr 3, 2025
e6e07ec
[ROCm] code cleanup of architecture checks (#150473)
apakbin Apr 3, 2025
6fa1b17
ROCm: Add trailing comma for consistency in gfx architecture list (#1…
jagadish-amd Apr 3, 2025
d4c30b4
[AOTI][dashboard] Update how peak memory is measured (#150534)
desertfire Apr 2, 2025
5d9c7f7
[fbcode]Removing `@NoIntBaseDeprecated` annotation in `evaluation.thr…
Sunnie912 Apr 3, 2025
e0d19cf
Enable weekly test for operator benchmark (#150502)
LifengWang Apr 3, 2025
cbc901f
Implement `raise ... from ...` (#148766)
guilhermeleobas Apr 2, 2025
781d28e
add unit test for preferred_blas_library settings (#150581)
jeffdaily Apr 3, 2025
70b34a4
Add new dependences for gen_pyi.py (#150391)
fffrog Apr 3, 2025
ff783f0
Fix shape guard failure to be valid python (#149149)
isuruf Apr 2, 2025
f9a7eac
use python fallback if there are overflows (#149197)
isuruf Apr 2, 2025
a72b4eb
Support windows in C++ shape guards (#149211)
isuruf Apr 2, 2025
5314a6f
[export] Fix deserialization issue (#150515)
angelayi Apr 3, 2025
440c07e
Fix detection of GPU multicast (#150563)
lw Apr 3, 2025
5be5cfe
[inductor][autotune cache] add torch_key() to configs hash (#150494)
davidberard98 Apr 3, 2025
fa0fdc0
if blaslt fails, fall back to blas (#150147)
jeffdaily Apr 3, 2025
5d36253
Refactoring: fix the python constant check (#150608)
fffrog Apr 3, 2025
78d1165
[DTensor][tp] fix errors in FSDP+TP checkpointing test (#150354)
XilunWu Mar 31, 2025
96f35f5
update get start xpu document for v2.7 (#150397)
ZhaoqiongZ Apr 3, 2025
3b02f79
Add torch._scaled_mm for CPU (#150410)
yanbing-j Apr 3, 2025
1843ad4
[Inductor] Cache CUDA compilation errors (#149716)
kadeng Apr 3, 2025
c1d5035
Enable C++ dynamic shape guards by default (#140756)
isuruf Apr 2, 2025
51da241
[aoti] Fix cannot determine truth value of Relation error when propa…
yushangdi Apr 3, 2025
a3f9e04
[export] Make aoti_call_delegate hop traceable (#148804)
yiming0416 Apr 3, 2025
277369a
Move formulas on separate line in loss.py (#150565)
svekars Apr 3, 2025
d41c22b
Revert "[fx] Move Node._prepend/Node._remove_from_list to C++ (#14826…
jansel Apr 3, 2025
5a654de
Revert "Enable C++ dynamic shape guards by default (#140756)"
pytorchmergebot Apr 3, 2025
941090a
Make sure torch.compiler._is_compiling_flag=True in aoti (#150588)
yushangdi Apr 3, 2025
2abd814
[validations] Run nccl version check on Linux only (#150635)
atalman Apr 3, 2025
c6defa9
[cuda] Add new faster gammabeta backward kernel (#148605) (Reapply wi…
ahmadsharif1 Apr 3, 2025
9e55dae
CUDA CachingHostAllocator tracks registrations to call correct free (…
jeffdaily Apr 3, 2025
76994d4
[pytorch] add experimental TORCH_LIBRARY_THREAD_UNSAFE_LAZY_INIT (#15…
rmaz Apr 3, 2025
c0618a3
Update commitlist.py instructions for the GitHub repo regime (#149535)
janeyx99 Apr 3, 2025
a2dce42
Split up cub-RadixSortPairs.cu to parallelize compilation (#148936)
TovlyFB Apr 3, 2025
118e386
[dynamo] disable new test_assert_failure_in_generic_ctx_mgr internall…
williamwen42 Apr 3, 2025
5cf3029
Remove unused rand call if not fallback to eager for rand (#147790)
henryhu6 Apr 3, 2025
8878289
[aten] 8 bytes aligned vector loads for bf16 and fp16 dtypes in torch…
zhaozhul Apr 3, 2025
1ab6c4f
[Codemod][AddExplicitStrictExportForTrainingInferenceArg] caffe2/ (#1…
gmagogsfm Apr 3, 2025
b0e28f6
Revert "add unit test for preferred_blas_library settings (#150581)"
pytorchmergebot Apr 3, 2025
1bc2b2b
bound sympy accuracy (#150383)
avikchaudhuri Apr 4, 2025
d0026fa
[ROCm][TunableOp] Fix UT race condition and reduce UT duration. (#150…
naromero77amd Apr 4, 2025
f9f6c08
support guard or false/true in user code and add tests (#150178)
laithsakka Mar 28, 2025
1979a40
Make CompileEventLogger more defensive w.r.t to AOTAutogradCache and …
jamesjwu Apr 3, 2025
a9e2f22
[Bugfix] Fix compile error with `torch.Tensor.unsqueeze_` and inplace…
Lucaskabela Apr 4, 2025
bd9c42e
[c10d] Surface error type when we unlink and create named pipe for Du…
fduwjj Apr 3, 2025
ed0fd2f
clang-format aten/src/ATen/cpu/vec/*.h (#150426)
swolchok Apr 2, 2025
7df6f93
Adapt test_misc.py for HPUs (#149499)
amathewc Apr 4, 2025
c6d79c1
[dynamic shapes] allow duck typing for 0/1 (#150222)
pianpwk Apr 4, 2025
e6e1f8c
[audio hash update] update the pinned audio hash (#150589)
pytorchupdatebot Apr 4, 2025
98d06b4
[Dynamo] Fix `dict.items()` return type (#150112)
shink Apr 4, 2025
f3cb355
[executorch hash update] update the pinned executorch hash (#149817)
pytorchupdatebot Apr 4, 2025
4854926
Revert "Add torch._scaled_mm for CPU (#150410)"
pytorchmergebot Apr 4, 2025
73358d3
Fix codegen, change str comparison opeator to == for proper equality …
jgrzybek-habana Apr 4, 2025
09c4da9
[CUDA][avgpool2d] Fix backward launch bounds again for `sm100`, `sm12…
eqy Apr 4, 2025
295b7e2
[MPS/inductor] Add support for hermite_polynomial_h. (#150664)
dcci Apr 4, 2025
1b0a023
[Dynamo][Misc] Apply typing hints for `codegen` (#150289)
shink Apr 4, 2025
07d439e
[aoti] Split ConstantType definition out of model.h (#150545)
zhxchen17 Apr 4, 2025
f443035
Revert "[cuda] Add new faster gammabeta backward kernel (#148605) (Re…
pytorchmergebot Apr 4, 2025
c93e34d
Revert "bound sympy accuracy (#150383)"
pytorchmergebot Apr 4, 2025
c53bc61
caffe2: Fix lint errors in native/xnnpack/Linear.cpp (#150508)
EricGriffith Apr 4, 2025
861d2cc
Add a param for save format in Storage Writer (#150025)
ankitageorge Apr 4, 2025
2a2ddff
[Inductor] Fix consolidating _scaled_mm into mm template TMA error (#…
PaulZhang12 Apr 4, 2025
2e23768
Expose symbols on macos in the xplat pytorch stack (#150487)
stepanhruda Apr 4, 2025
d6887f4
[Inductor] Fallback embedding when sparse is True (#150659)
leslie-fang-intel Apr 4, 2025
2e4ae2a
Fix conv2d strided prologue (#150697)
eellison Apr 4, 2025
3320efe
Refresh expected results. (#150264)
laithsakka Apr 4, 2025
c14977e
Use 'rocm' naming for rocm-related workflows/jobs (#150555)
jithunnair-amd Apr 5, 2025
7ac8186
[MPSInductor] Speedup `sum`/`prod` reductions (#150566)
malfet Apr 5, 2025
60a45eb
[AOTInductor] Introduce MaybeOwningAtenTensorHandle for ConstantMap (…
muchulee8 Apr 4, 2025
cfea55d
[MPS] fix inverse bug for N>1024 (#146754)
Isalia20 Apr 5, 2025
c830c12
[MPSInductor] Fix tiled reduction logic (#150737)
malfet Apr 5, 2025
83b870a
Fix missing braces for clang CUDA (#150736)
r-barnes Apr 6, 2025
15768cc
add unit test for preferred_blas_library settings (#150581)
jeffdaily Apr 6, 2025
2d98a1c
[MTIA] Map names to operand indices when folding submodules (#150692)
klintqinami Apr 6, 2025
caf8d9b
Revert "Fix conv2d strided prologue (#150697)"
pytorchmergebot Apr 6, 2025
55e62ff
bf16 grouped gemm (#150374)
ngimel Apr 6, 2025
49f6cce
[MPS] grad scaler (#150255)
Isalia20 Apr 6, 2025
6c38b9b
[typing] Add type hints to `__init__` methods in `torch.distributions…
randolf-scholz Apr 6, 2025
6a8ab90
[AOTI][dashboard] Fix mis-calculated memory compression ratio (#150695)
desertfire Apr 6, 2025
8adfcd3
[cuDNN][SDPA] Loosen constraints for GQA for cuDNN Attention (#150337)
eqy Apr 6, 2025
912102b
Make at::vec::Vectorized ops work with scalars (#150380)
swolchok Apr 6, 2025
0aaf353
Overload unary - operator on at::vec::Vectorized to call neg() (#150568)
swolchok Apr 6, 2025
47b494e
Add type hints to `_tensor_docs.add_docstr_all` (#150715)
pganssle-google Apr 6, 2025
370ba6b
[codemod] Fix `-Wambiguous-reversed-operator` in aten/src/ATen/cuda/t…
r-barnes Apr 7, 2025
d8d306c
Suppress `-Wunused-function` for DSA (#150735)
r-barnes Apr 7, 2025
d985758
Generalize compile collective to avoid cuda-bias (#150405)
Chao1Han Apr 7, 2025
d86c141
Generalize poison fork logic for each device backend (#144664)
guangyey Apr 6, 2025
b6929ae
Fix conv2d strided prologue (#150697)
eellison Apr 6, 2025
24aadb4
[precompile] Serialization for GlobalStateGuard (#150636)
zhxchen17 Apr 7, 2025
164d2c8
Add check in `test_cow_input` to ensure COW data is never changed (#1…
kurtamohler Apr 5, 2025
25662d3
[xla hash update] update the pinned xla hash (#132021)
pytorchupdatebot Apr 7, 2025
cdf3b63
Update slow tests (#150283)
pytorchupdatebot Apr 7, 2025
e209625
[torchrec] update local_shards_wrapper to latest version (#150469)
iamzainhuda Apr 7, 2025
99c9a31
[submodule] [Snapshot/Profiler] Memory Snapshot On Demand (#150559)
sraikund16 Apr 7, 2025
5e3c821
cpp_wrapper: Re-enable code disabled for forward compatibility (#150671)
benjaminglass1 Apr 7, 2025
f0abbab
AOTI fallback ops: sort alphabetically (#150672)
benjaminglass1 Apr 7, 2025
f813d64
cpp_wrapper: Fix even more tests (#147225)
benjaminglass1 Apr 7, 2025
0ad2c5d
Add RECORD_FUNCTION for AOTI (#150150)
shiyang-weng Apr 7, 2025
56ab71d
[ROCm] Expand workspace size for gfx95 (#150632)
jpvillam-amd Apr 7, 2025
06e9dea
[c10d][fr] Improve FR dump robustness with all watchdog broadcast wai…
fduwjj Apr 5, 2025
957faaa
Avoid overflow in vector_norm for scalar input (#144073)
isuruf Apr 5, 2025
7d2411d
[DCP][OSS] Introduce barrier util in the DistWrapper for rank local c…
saumishr Apr 7, 2025
6fcffd8
Optimize SVE embedding performance (#150176)
annop-w Apr 7, 2025
2a1e2b8
[logging] Add pgo remote get/put timings to dynamo_compile (#150322)
masnesral Apr 4, 2025
f8b53f4
[export] raise when Dim.DYNAMIC 0/1 specializes (#150716)
pianpwk Apr 7, 2025
bf1132c
Revert "Generalize poison fork logic for each device backend (#144664)"
pytorchmergebot Apr 7, 2025
ed0dea3
[AO] update port_metadata_pass to support quant_affine ops (#150642)
mcr229 Apr 7, 2025
5653fb3
[AO] Add Moving Average Affine Observer (#150643)
mcr229 Apr 7, 2025
eba05e2
[AO] Refactor convert and add QuantAffinePlaceholderObserver (#150644)
mcr229 Apr 7, 2025
fbccbfe
[BE] Fix Amp.metal compilation warning (#150783)
malfet Apr 7, 2025
78fe079
Support having no metadata file for HuggingFaceStorageReader (#150701)
ankitageorge Apr 7, 2025
6ea5514
[invoke_subgraph] Lazy backward (#150666)
anijain2305 Apr 7, 2025
91173ff
Fixing NCCL abort hang issue when a ProcessGroupNCCL manages multiple…
hexinw-nvidia Apr 7, 2025
e9e5682
[ROCm] Build Pytorch extensions with amdclang++ (#150451)
akashveramd Apr 7, 2025
5228986
[CUDA] Only use vec128 if CUDA version is newer than 12.8 (#150705)
malfet Apr 8, 2025
d7f3cd0
Add Half support for weight_norm on CPU (#148878)
CaoE Apr 8, 2025
c0991b0
README: anaconda license violation / no longer recommend anaconda sin…
morotti Apr 8, 2025
73b4938
[cuda] Add new faster gammabeta backward kernel (#148605) (Reapply wi…
ahmadsharif1 Apr 8, 2025
836955b
[Manylinux 2.28] Correct Linux aarch64 cuda binaries wheel name (#150…
atalman Apr 8, 2025
7e11089
Optimize dataloader Self typing (#146816)
zeshengzong Apr 8, 2025
c9c0f8e
Add plot for `torch.nn.Threshold` and `torch.nn.GLU` (#150171)
zeshengzong Apr 8, 2025
f8aa640
Refactor: add initialization of math.lcm into torch_c_binding_in_grap…
fffrog Apr 7, 2025
58ede0c
[Inductor XPU] Refine `test_mkldnn_pattern_matcher.py` to be reusable…
etaf Apr 7, 2025
a106842
[XPU] Fix XPU unit test on Windows (#150520)
LuFinch Apr 8, 2025
881d994
Add more check for torch.ormqr (#150759)
fffrog Apr 8, 2025
3da14d3
Fix the Problems About Defining Static Variable in Inline Function (#…
fffrog Apr 8, 2025
3649e2e
Safer bookkeeping of NCCL communicators (#150681)
lw Apr 7, 2025
1791b41
Clarify behavior of TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK (#1…
lw Apr 7, 2025
f3b2fb6
Allow trace through unittest (#146500)
guilhermeleobas Apr 7, 2025
ad51618
Update CPython tests for ctx manager to use unittest (#146501)
guilhermeleobas Apr 7, 2025
a402c2f
Remove redundant code in cuda/__init__.py (#150529)
fffrog Apr 8, 2025
05365e3
Remove torch functions that do not support device arguments from _dev…
fffrog Apr 8, 2025
da73225
[Intel GPU] int4 WOQ gemm XPU Support (#137566)
ZhiweiYan-96 Apr 8, 2025
52d172e
Facilitate at::_weight_int4pack_mm_with_scale_and_zeros related regis…
ZhiweiYan-96 Apr 8, 2025
ec5f2e3
[Build] Fix fbgemm build with gcc-12+ (#150847)
malfet Apr 8, 2025
1239260
[Accelerator][Chore] Use existing `acc` when raising an error (#150829)
shink Apr 8, 2025
97f34f0
[ROCm][Windows] Include AOTriton dependent sources in Windows build (…
ikalinic Apr 8, 2025
4447352
Revert "[CUDA] Only use vec128 if CUDA version is newer than 12.8 (#1…
pytorchmergebot Apr 8, 2025
173f126
[invoke_subgraph] Preserve node meta (#150782)
anijain2305 Apr 7, 2025
3e0038a
Fix torch.matmul related out dtype check (#148174)
fffrog Apr 8, 2025
4926bd6
Revert "Fix the Problems About Defining Static Variable in Inline Fun…
pytorchmergebot Apr 8, 2025
9775961
[dynamo] reconstruct functions decorated in the compiled region prope…
williamwen42 Apr 8, 2025
e6bd133
add batching rule for `torch.Tensor.scatter_add_` (#150543)
guilhermeleobas Apr 8, 2025
aafc4b6
Do not depend on numpy during the import (#150816)
basilwong Apr 8, 2025
c36d9b0
[Codemod][AddExplicitStrictExportForTrainingInferenceArg] caffe2/torc…
gmagogsfm Apr 8, 2025
901b02c
[Inductor] fix alignement assumption for fallback (#150777)
shunting314 Apr 7, 2025
17f9276
Code Clean: Remove python3.8 specific code because PyTorch now need P…
fffrog Apr 8, 2025
89505f4
[AOTI] Always use oss schema for ExternKernelNodes serialization (#15…
yiming0416 Apr 8, 2025
27ded35
Fix inplacing with multiple, fused uses (#150845)
eellison Apr 8, 2025
d9f47c7
Revert "Fixing NCCL abort hang issue when a ProcessGroupNCCL manages …
pytorchmergebot Apr 9, 2025
5f18b7d
[docs] remove --recursive flag from readme (#150785)
danielvegamyhre Apr 9, 2025
44deb67
Fix _del_library (#150495)
zou3519 Apr 8, 2025
2e7c9d3
Refactor layout constraint selection logic (#148104)
zou3519 Apr 8, 2025
bc47d53
[MPS] Support ArgumentBuffer bindings from C++/Python (#150780)
malfet Apr 9, 2025
4d6ff6c
Fill config2launcher with correct launchers during cache hit coordina…
jamesjwu Apr 8, 2025
b01877a
Fix addbmm & addmv & baddbmm out dtype check (#148176)
fffrog Mar 17, 2025
604467d
Code Clean: Remove specific bytecode support in dynamo for python3.8 …
fffrog Apr 8, 2025
81f60f3
Expand allowed_getattr_types_for_subgm to torch.Tensor (#150867)
SherlockNoMad Apr 9, 2025
142f0f8
Enable modernize-use-default-member-init (#149046)
cyyever Apr 9, 2025
64ac41f
[pytorch] add header docs for TORCH_LIBRARY_THREAD_UNSAFE_LAZY_INIT (…
rmaz Apr 9, 2025
886d9ac
[docs] Add 32-bit complex to the list of dtypes (#144590)
antoinebrl Apr 9, 2025
2299087
[ROCm] Introduce AMD specific inductor gemm tuning (#147315)
jataylo Apr 9, 2025
246f3b6
[Quant][PT2E][X86] enable qconv1d-relu fusion (#150751)
Xia-Weiwen Apr 9, 2025
5a42215
Add `torch.triu_indices`, `torch.tril_indices` dtype description (#15…
zeshengzong Apr 9, 2025
d0e3482
Update triton wheel build, setuptools pin (#150931)
atalman Apr 9, 2025
97a5e5c
Added _fused_sdp_choice_stub dispatcher support for HPU device (#149512)
pralay-das Apr 9, 2025
1a56609
[ONNX] Supporting different opset versions for torchlib registry (#14…
shubhambhokare1 Apr 9, 2025
c8d37b9
[ez][c10d] Disable start event recording for coalesced col and improv…
fduwjj Apr 8, 2025
8aaf296
[c10d][fr] Refactor analysis script for modularization and reusing fo…
fduwjj Apr 9, 2025
72755a4
Avoid circular imports in tracing_state_functions (#150325)
justinchuby Apr 9, 2025
c714d2f
[hop] support base_hop._gen_schema (#149688)
ydwu4 Apr 7, 2025
a4bb2f1
Inductor respects exact strides on custom ops by default (#150511)
zou3519 Apr 8, 2025
a0e796d
Revert "Inductor respects exact strides on custom ops by default (#15…
pytorchmergebot Apr 9, 2025
01568cb
Revert "Refactor layout constraint selection logic (#148104)"
pytorchmergebot Apr 9, 2025
c59aaa0
[DTensor] add _explicit_order_placements util (#150493)
wconstab Apr 4, 2025
6fb089f
[AO] fix per token block size calculation (#150890)
mcr229 Apr 9, 2025
cc185c3
[aoti] Use generate_fake_kernels_from_real_mismatches config for draf…
yushangdi Apr 9, 2025
d04a6ec
add reduce_scatter to symm mem ops (#150813)
ngimel Apr 9, 2025
d3a2872
Hipify global scrach defintion in AOTI codegen (#150893)
zoranzhao Apr 9, 2025
cfab04d
Fix aten.div type promotion for FakeTensor (#150874)
yushangdi Apr 9, 2025
a4545f0
[Codemod][AddExplicitStrictExportForTrainingInferenceArg] caffe2/test…
gmagogsfm Apr 9, 2025
f237ee5
ProcessGroupGloo: support lazy_init (#150801)
d4l3k Apr 9, 2025
ea0cbba
[export] Refine draft-export CVE with Dim.AUTO (#150876)
angelayi Apr 9, 2025
2b9d8a5
Fix `-Wmissing-braces` in a few files (#150802)
r-barnes Apr 9, 2025
860765d
update benchamark result due to <1% regression (#150937)
laithsakka Apr 9, 2025
d751698
Support negative values for fill with uint tensors (#144458)
isuruf Apr 8, 2025
357814c
[AOTI] Remove typedef for half and bfloat16 (#150657)
desertfire Apr 9, 2025
31fe258
[inductor] Add features to docstring_linter (see #142496) (#145834)
rec Apr 9, 2025
087e858
support backed_size_oblivious in guard_or_false/guard_or_true (#150231)
laithsakka Apr 9, 2025
786422a
Remove a workaround added in #149381 (#150693)
tengyifei Apr 9, 2025
cc2decd
[CI][CUDA][Distributed]Update test_composability.py (#148578)
nWEIdia Apr 9, 2025
b347f0c
Add xpu backend for depthwise_conv2d/3d Ops
yucai-intel Mar 13, 2025
12 changes: 11 additions & 1 deletion .ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -136,6 +136,9 @@ def complete_wheel(folder: str) -> str:
"""
wheel_name = list_dir(f"/{folder}/dist")[0]

# Please note for cuda we don't run auditwheel since we use custom script to package
# the cuda dependencies to the wheel file using update_wheel() method.
# However we need to make sure filename reflects the correct Manylinux platform.
if "pytorch" in folder and not enable_cuda:
print("Repairing Wheel with AuditWheel")
check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
@@ -147,7 +150,14 @@ def complete_wheel(folder: str) -> str:
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = wheel_name
repaired_wheel_name = wheel_name.replace(
"linux_aarch64", "manylinux_2_28_aarch64"
)
print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
os.rename(
f"/{folder}/dist/{wheel_name}",
f"/{folder}/dist/{repaired_wheel_name}",
)

print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(
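For context, the hunk above stops returning the wheel name unchanged for non-CUDA aarch64 builds and instead rewrites the platform tag from `linux_aarch64` to `manylinux_2_28_aarch64` before renaming the file on disk. A minimal standalone sketch of that rename step (function name and example filename are illustrative, not from the PR):

```python
def manylinux_wheel_name(wheel_name: str) -> str:
    """Rewrite the platform tag the way the hunk above does."""
    return wheel_name.replace("linux_aarch64", "manylinux_2_28_aarch64")

# Hypothetical filename, for illustration only:
print(manylinux_wheel_name("torch-2.7.0-cp311-cp311-linux_aarch64.whl"))
# -> torch-2.7.0-cp311-cp311-manylinux_2_28_aarch64.whl
```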
2 changes: 2 additions & 0 deletions .ci/docker/almalinux/Dockerfile
@@ -44,6 +44,8 @@ FROM base as cuda
ARG CUDA_VERSION=12.4
RUN rm -rf /usr/local/cuda-*
ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
# Preserve CUDA_VERSION for the builds
ENV CUDA_VERSION=${CUDA_VERSION}
12 changes: 10 additions & 2 deletions .ci/docker/build.sh
@@ -460,10 +460,18 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi
fi

no_cache_flag=""
progress_flag=""
# Do not use cache and progress=plain when in CI
if [[ -n "${CI:-}" ]]; then
no_cache_flag="--no-cache"
progress_flag="--progress=plain"
fi

# Build image
docker build \
--no-cache \
--progress=plain \
${no_cache_flag} \
${progress_flag} \
--build-arg "BUILD_ENVIRONMENT=${image}" \
--build-arg "PROTOBUF=${PROTOBUF:-}" \
--build-arg "LLVMDEV=${LLVMDEV:-}" \
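The build.sh hunk above makes `--no-cache` and `--progress=plain` conditional on the `CI` environment variable, so local rebuilds keep Docker's layer cache while CI builds stay fully reproducible. A sketch of the same flag selection in Python (function name illustrative):

```python
def docker_build_flags(env: dict) -> list:
    """Return extra `docker build` flags: cache disabled only in CI."""
    if env.get("CI"):  # mirrors the [[ -n "${CI:-}" ]] check above
        return ["--no-cache", "--progress=plain"]
    return []

print(docker_build_flags({"CI": "true"}))  # -> ['--no-cache', '--progress=plain']
print(docker_build_flags({}))              # -> []
```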
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/executorch.txt
@@ -1 +1 @@
cedf52aa8e4df879886270a5920da6fe84cbaa67
7e487c24e1c20c3f4606c2d8aca2778873b00b4c
46 changes: 8 additions & 38 deletions .ci/docker/common/install_cuda.sh
@@ -2,7 +2,6 @@

set -ex

NCCL_VERSION=v2.26.2-1
CUDNN_VERSION=9.5.1.17

function install_cusparselt_040 {
@@ -40,8 +39,7 @@ function install_cusparselt_063 {

function install_118 {
CUDNN_VERSION=9.1.0.70
NCCL_VERSION=v2.21.5-1
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.4.0"
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
# install CUDA 11.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -59,14 +57,7 @@ function install_118 {
cd ..
rm -rf tmp_cudnn

# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=11.8 bash install_nccl.sh

install_cusparselt_040

@@ -75,7 +66,7 @@

function install_124 {
CUDNN_VERSION=9.1.0.70
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.1 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
@@ -93,22 +84,15 @@ function install_124 {
cd ..
rm -rf tmp_cudnn

# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.4 bash install_nccl.sh

install_cusparselt_062

ldconfig
}

function install_126 {
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.6 /usr/local/cuda
# install CUDA 12.6.3 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
@@ -126,14 +110,7 @@ function install_126 {
cd ..
rm -rf tmp_cudnn

# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.6 bash install_nccl.sh

install_cusparselt_063

@@ -241,7 +218,7 @@ function prune_126 {

function install_128 {
CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
@@ -259,14 +236,7 @@ function install_128 {
cd ..
rm -rf tmp_cudnn

# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.8 bash install_nccl.sh

install_cusparselt_063

12 changes: 2 additions & 10 deletions .ci/docker/common/install_cuda_aarch64.sh
@@ -3,7 +3,6 @@

set -ex

NCCL_VERSION=v2.26.2-1
CUDNN_VERSION=9.8.0.87

function install_cusparselt_063 {
@@ -18,7 +17,7 @@ function install_cusparselt_063 {
}

function install_128 {
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux_sbsa.run
@@ -36,14 +35,7 @@ function install_128 {
cd ..
rm -rf tmp_cudnn

# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.8 bash install_nccl.sh

install_cusparselt_063

3 changes: 1 addition & 2 deletions .ci/docker/common/install_executorch.sh
@@ -50,8 +50,7 @@ setup_executorch() {
pushd executorch

export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
4 changes: 3 additions & 1 deletion .ci/docker/common/install_halide.sh
@@ -35,7 +35,9 @@ git clone https://github.com/halide/Halide.git
pushd Halide
git checkout ${COMMIT} && git submodule update --init --recursive
pip_install -r requirements.txt
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
# NOTE: pybind has a requirement for cmake > 3.5 so set the minimum cmake version here with a flag
# Context: https://github.com/pytorch/pytorch/issues/150420
cmake -G Ninja -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
cmake --install build --prefix ${CONDA_PREFIX}
26 changes: 26 additions & 0 deletions .ci/docker/common/install_nccl.sh
@@ -0,0 +1,26 @@
#!/bin/bash

set -ex

NCCL_VERSION=""
if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
else
echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
exit 1
fi

if [[ -n "${NCCL_VERSION}" ]]; then
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
pushd nccl
make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
popd
rm -rf nccl
ldconfig
fi
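
The new `install_nccl.sh` branches on the major CUDA version to pick a pinned NCCL tag. The version-selection logic can be exercised standalone; the sketch below mirrors that branch with inline placeholder pins (the real pins live in `ci_commit_pins/nccl-cu11.txt` and `ci_commit_pins/nccl-cu12.txt`; the tag values here are illustrative only, not the actual pinned versions):

```shell
#!/bin/bash
# Minimal sketch of the CUDA-major-version branch in install_nccl.sh.
# Pin values below are placeholders; the real script reads them from
# ci_commit_pins/nccl-cu11.txt and ci_commit_pins/nccl-cu12.txt.
select_nccl_pin() {
  local cuda_version="$1"
  case "${cuda_version:0:2}" in          # first two chars: CUDA major version
    11) echo "v2.21.5-1" ;;              # placeholder for nccl-cu11.txt
    12) echo "v2.25.1-1" ;;              # placeholder for nccl-cu12.txt
    *)  echo "Unexpected CUDA_VERSION ${cuda_version}" >&2
        return 1 ;;
  esac
}

select_nccl_pin "12.8"   # emits the cu12 pin
select_nccl_pin "11.8"   # emits the cu11 pin
```

Keeping the pins in per-CUDA-major text files lets every Dockerfile `COPY ./ci_commit_pins/nccl-cu*` and share one install path, instead of each image hard-coding its own clone-and-build block as `install_cuda.sh` previously did.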
2 changes: 2 additions & 0 deletions .ci/docker/libtorch/Dockerfile
@@ -49,6 +49,8 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM cpu as cuda
ADD ./common/install_cuda.sh install_cuda.sh
ADD ./common/install_magma.sh install_magma.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
ENV CUDA_HOME /usr/local/cuda

FROM cuda as cuda11.8
4 changes: 3 additions & 1 deletion .ci/docker/linter-cuda/Dockerfile
@@ -30,7 +30,9 @@ RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
# Install cuda and cudnn
ARG CUDA_VERSION
COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

4 changes: 3 additions & 1 deletion .ci/docker/manywheel/Dockerfile
@@ -64,7 +64,9 @@ FROM base as cuda
ARG BASE_CUDA_VERSION=10.2
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*

FROM base as intel
# MKL
4 changes: 3 additions & 1 deletion .ci/docker/manywheel/Dockerfile_2_28
@@ -36,7 +36,9 @@ FROM base as cuda
ARG BASE_CUDA_VERSION=11.8
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*

FROM base as intel
# MKL
4 changes: 3 additions & 1 deletion .ci/docker/manywheel/Dockerfile_cuda_aarch64
@@ -67,7 +67,9 @@ FROM base as cuda
ARG BASE_CUDA_VERSION
# Install CUDA
ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh install_nccl.sh /ci_commit_pins/nccl-cu*

FROM base as magma
ARG BASE_CUDA_VERSION
10 changes: 10 additions & 0 deletions .ci/docker/ubuntu-cuda/Dockerfile
@@ -158,6 +158,16 @@ COPY ./common/install_cusparselt.sh install_cusparselt.sh
RUN bash install_cusparselt.sh
RUN rm install_cusparselt.sh

# Install NCCL
ARG CUDA_VERSION
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash install_nccl.sh
RUN rm install_nccl.sh /ci_commit_pins/nccl-cu*
ENV USE_SYSTEM_NCCL=1
ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"

# Install CUDSS
ARG CUDA_VERSION
COPY ./common/install_cudss.sh install_cudss.sh
9 changes: 8 additions & 1 deletion .ci/docker/ubuntu/Dockerfile
@@ -52,9 +52,16 @@ RUN bash ./install_lcov.sh && rm install_lcov.sh
# Install cuda and cudnn
ARG CUDA_VERSION
COPY ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# No effect if cuda not installed
ENV USE_SYSTEM_NCCL=1
ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"


# (optional) Install UCC
ARG UCX_COMMIT
50 changes: 3 additions & 47 deletions .ci/pytorch/macos-build.sh
@@ -33,55 +33,11 @@ if which sccache > /dev/null; then
export PATH="${tmp_dir}:$PATH"
fi

cross_compile_arm64() {
# Cross compilation for arm64
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}

compile_arm64() {
# Compilation for arm64
# TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}

compile_x86_64() {
USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel --plat-name=macosx_10_9_x86_64
}

build_lite_interpreter() {
echo "Testing libtorch (lite interpreter)."

CPP_BUILD="$(pwd)/../cpp_build"
# Ensure the removal of the tmp directory
trap 'rm -rfv ${CPP_BUILD}' EXIT
rm -rf "${CPP_BUILD}"
mkdir -p "${CPP_BUILD}/caffe2"

# It looks libtorch need to be built in "${CPP_BUILD}/caffe2 folder.
BUILD_LIBTORCH_PY=$PWD/tools/build_libtorch.py
pushd "${CPP_BUILD}/caffe2" || exit
VERBOSE=1 DEBUG=1 python "${BUILD_LIBTORCH_PY}"
popd || exit

"${CPP_BUILD}/caffe2/build/bin/test_lite_interpreter_runtime"
}

print_cmake_info

if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
if [[ $(uname -m) == "arm64" ]]; then
compile_arm64
else
cross_compile_arm64
fi
elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then
export BUILD_LITE_INTERPRETER=1
build_lite_interpreter
else
compile_x86_64
fi
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel

if which sccache > /dev/null; then
print_sccache_stats