diff --git a/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu b/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu index c44b4213d745..a23c9768a881 100644 --- a/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu +++ b/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu @@ -34,6 +34,7 @@ ENV CUDNN_VERSION=8.0.5.39 ENV NCCL_VERSION=2.7.8 ENV HOROVOD_VERSION=0.21.3 ENV EFA_VERSION=1.11.2 +ENV OMPI_VERSION=4.1.1 ENV BRANCH_OFI=1.1.1 ENV DGLBACKEND=pytorch ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" @@ -93,17 +94,30 @@ RUN cd /tmp \ && make -j64 src.build BUILDDIR=/usr/local \ && rm -rf /tmp/nccl -# Install EFA along with AWS OPEN_MPI +# Install EFA without AWS OPEN_MPI RUN mkdir /tmp/efa \ && cd /tmp/efa \ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf $OPEN_MPI_PATH \ && rm -rf /tmp/efa \ && rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz -RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf +# Install OpenMPI without libfabric support +RUN mkdir /tmp/openmpi && \ + cd /tmp/openmpi && \ + wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz && \ + tar zxf openmpi-${OMPI_VERSION}.tar.gz && \ + cd openmpi-${OMPI_VERSION} && \ + ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH && \ + make -j $(nproc) all && \ + make install && \ + ldconfig && \ + cd / && \ + rm -rf /tmp/openmpi + ENV PATH="$OPEN_MPI_PATH/bin:$PATH" ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH diff --git a/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu b/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu index e47b9d32d983..93789fc117be 100644 --- a/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu +++ b/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu @@ 
-31,6 +31,7 @@ ARG OPEN_MPI_PATH=/opt/amazon/openmpi ARG EFA_PATH=/opt/amazon/efa ARG NCCL_VERSION=2.7.8 ARG EFA_VERSION=1.11.2 +ARG OMPI_VERSION=4.1.1 ARG BRANCH_OFI=1.1.1 ARG TF_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/tensorflow/r2.4_aws/20210127-150238/gpu/py37/cu110/tensorflow_gpu-2.4.1-cp37-cp37m-manylinux2010_x86_64.whl @@ -104,16 +105,30 @@ RUN cd /tmp \ && make -j64 src.build BUILDDIR=/usr/local \ && rm -rf /tmp/nccl -# Install EFA along with AWS OPEN_MPI +# Install EFA without AWS OPEN_MPI RUN mkdir /tmp/efa \ && cd /tmp/efa \ - && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-$EFA_VERSION.tar.gz \ - && tar -xf aws-efa-installer-$EFA_VERSION.tar.gz \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ && cd aws-efa-installer \ && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf $OPEN_MPI_PATH \ && rm -rf /tmp/efa \ && rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz +# Install OpenMPI without libfabric support +RUN mkdir /tmp/openmpi && \ + cd /tmp/openmpi && \ + wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz && \ + tar zxf openmpi-${OMPI_VERSION}.tar.gz && \ + cd openmpi-${OMPI_VERSION} && \ + ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH && \ + make -j $(nproc) all && \ + make install && \ + ldconfig && \ + cd / && \ + rm -rf /tmp/openmpi + RUN wget https://sourceforge.net/projects/boost/files/boost/1.73.0/boost_1_73_0.tar.gz/download -O boost_1_73_0.tar.gz \ && tar -xzf boost_1_73_0.tar.gz \ && cd boost_1_73_0 \ @@ -141,7 +156,6 @@ RUN echo "hwloc_base_binding_policy = none" >> $OPEN_MPI_PATH/etc/openmpi-mca-pa # Set default NCCL parameters RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf -RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf ENV 
LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH # /usr/local/lib/libpython* needs to be accessible for dynamic linking ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH