From 736487d0589414e7e6a365f8eb5a2b1e01869145 Mon Sep 17 00:00:00 2001 From: Aaron Wang Shi Date: Thu, 5 Oct 2023 11:37:11 +0800 Subject: [PATCH] replace centos7 with rockylinux8 --- build-tools/make/build-with-docker.mk | 7 +- build-tools/make/hpcx_url.mk | 3 +- docker/development/Dockerfile.build-mpi | 174 +++++++++--------- .../development/Dockerfile.build-mpi-ppc64le | 163 ++++++++-------- .../Dockerfile.cuda-cudnn-lib-in-wheel-test | 172 ++++++++--------- docker/runtime/Dockerfile.runtime-mpi | 14 +- 6 files changed, 270 insertions(+), 263 deletions(-) diff --git a/build-tools/make/build-with-docker.mk b/build-tools/make/build-with-docker.mk index a3f0a7750..ee75ba703 100644 --- a/build-tools/make/build-with-docker.mk +++ b/build-tools/make/build-with-docker.mk @@ -61,7 +61,7 @@ docker_image_build_cuda: --build-arg CUDNN_VERSION=$(CUDNN_VERSION) \ --build-arg PYTHON_VERSION_MAJOR=$(PYTHON_VERSION_MAJOR) \ --build-arg PYTHON_VERSION_MINOR=$(PYTHON_VERSION_MINOR) \ - --build-arg HPCX_URL_centos=$(HPCX_URL_centos_$(OMPI_VERSION)) \ + --build-arg HPCX_URL_rhel=$(HPCX_URL_rhel_$(OMPI_VERSION)) \ -t $(DOCKER_IMAGE_BUILD_NNABLA_EXT_CUDA) \ -f docker/development/Dockerfile.build-mpi$(ARCH_SUFFIX) \ . @@ -77,7 +77,7 @@ docker_image_build_cuda_test: --build-arg PYTHON_VERSION_MAJOR=$(PYTHON_VERSION_MAJOR) \ --build-arg PYTHON_VERSION_MINOR=$(PYTHON_VERSION_MINOR) \ --build-arg BUILD_WITH_CUTENSOR=False \ - --build-arg HPCX_URL_centos=$(HPCX_URL_centos_$(OMPI_VERSION)) \ + --build-arg HPCX_URL_rhel=$(HPCX_URL_rhel_$(OMPI_VERSION)) \ -t $(DOCKER_IMAGE_TEST_NNABLA_EXT_CUDA) \ -f docker/development/Dockerfile.build-mpi$(ARCH_SUFFIX) \ . 
@@ -174,9 +174,10 @@ docker_image_cuda_cudnn_lib_in_wheel: cd $(NNABLA_EXT_CUDA_DIRECTORY) \ && docker build $(DOCKER_BUILD_ARGS) \ --build-arg CUDA_VERSION_MAJOR=$(CUDA_VERSION_MAJOR) \ + --build-arg CUDA_VERSION_MINOR=$(CUDA_VERSION_MINOR) \ --build-arg PYTHON_VER=3.$(PYTHON_VERSION_MINOR) \ --build-arg MPIVER=$(OMPI_VERSION) \ - --build-arg HPCX_URL_centos=$(HPCX_URL_centos_$(OMPI_VERSION)) \ + --build-arg HPCX_URL_rhel=$(HPCX_URL_rhel_$(OMPI_VERSION)) \ -f $(DOCKERFILE_PATH_LIB_IN_WHEEL) . -t nnabla-ext-cuda-lib-in-whl-py3$(PYTHON_VERSION_MINOR)-cuda$(CUDA_SUFFIX)-mpi$(OMPI_VERSION):$(DOCKER_IMAGE_ID_NNABLA_EXT_CUDA_LIB_IN_WHEEL) ######################################################################################################################## diff --git a/build-tools/make/hpcx_url.mk b/build-tools/make/hpcx_url.mk index 8f4c376a1..c0e333b0a 100644 --- a/build-tools/make/hpcx_url.mk +++ b/build-tools/make/hpcx_url.mk @@ -17,6 +17,5 @@ .SILENT: # Map specific openmpi version to HPCX download url - export HPCX_URL_ubuntu_4.1.5='https://content.mellanox.com/hpc/hpc-x/v2.12/hpcx-v2.12-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.12-x86_64.tbz' -export HPCX_URL_centos_4.1.5='https://content.mellanox.com/hpc/hpc-x/v2.12/hpcx-v2.12-gcc-MLNX_OFED_LINUX-5-redhat7-cuda11-gdrcopy2-nccl2.12-x86_64.tbz' +export HPCX_URL_rhel_4.1.5='https://content.mellanox.com/hpc/hpc-x/v2.12/hpcx-v2.12-gcc-MLNX_OFED_LINUX-5-redhat8-cuda11-gdrcopy2-nccl2.12-x86_64.tbz' diff --git a/docker/development/Dockerfile.build-mpi b/docker/development/Dockerfile.build-mpi index 9fee51220..ca0b03794 100644 --- a/docker/development/Dockerfile.build-mpi +++ b/docker/development/Dockerfile.build-mpi @@ -19,44 +19,46 @@ ARG CUDA_VERSION_MINOR=0.3 ARG CUDNN_VERSION=8 ############################################################ OpenMPI -# CentOS7: 1.10.7-1 (12) -# Ubuntu16: 1.10.2-1 (12) -# Ubuntu18: 2.1.1-1 (20) -# Ubuntu20: 4.0.3-1 (40) -# CentOS7: 3.1.3-1 (40) -# ABCI: 2.1.6-1 (20) -FROM 
centos:7 as openmpi +# CentOS7: 1.10.7-1 (12) +# Ubuntu16: 1.10.2-1 (12) +# Ubuntu18: 2.1.1-1 (20) +# CentOS7: 3.1.6-1 (30) +# RockyLinux8: 3.1.6-1 (30) +# Ubuntu20: 4.1.5-1 (40) +# ABCI: 4.1.5-1 (40) + +FROM rockylinux:8 as openmpi ARG PIP_INS_OPTS ARG PYTHONWARNINGS ARG CURL_OPTS ARG WGET_OPTS -ARG YUM_OPTS +ARG DNF_OPTS ARG MPIVER=3.1.6 -ARG HPCX_URL_centos +ARG HPCX_URL_rhel ENV LC_ALL C ENV LANG C ENV LANGUAGE C -RUN eval ${YUM_OPTS} \ - && yum install -y \ +RUN eval ${DNF_OPTS} \ + && dnf install -y \ epel-release \ - yum-utils \ - && yum install -y \ - curl \ - rpm-build \ + dnf-plugins-core \ + && dnf install -y \ + curl \ + rpm-build \ && if [ $(echo "${MPIVER}" | awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }') -gt 30106 ]; then \ - yum install -y libibverbs \ + dnf install -y libibverbs \ librdmacm \ rdma-core-devel \ numactl-libs \ numactl-devel \ binutils-devel \ patch; \ - fi \ - && yum group install -y "Development Tools" \ - && yum clean all + fi \ + && dnf group install -y "Development Tools" \ + && dnf clean all COPY docker/release/hpcx-init.patch /tmp/hpcx-init.patch COPY docker/release/hpcx-ompi-etc.patch /tmp/hpcx-ompi-etc.patch @@ -64,7 +66,7 @@ COPY docker/release/hpcx-ompi-etc.patch /tmp/hpcx-ompi-etc.patch RUN if [ $(echo "${MPIVER}" | awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }') -gt 30106 ]; then \ mkdir /root/openmpi-hpcx \ && cd /root/openmpi-hpcx \ - && curl ${CURL_OPTS} -LO ${HPCX_URL_centos} \ + && curl ${CURL_OPTS} -LO ${HPCX_URL_rhel} \ && tar -xvf hpcx*.tbz \ && rm -f hpcx*.tbz \ && mv hpcx* hpcx \ @@ -87,64 +89,63 @@ RUN if [ $(echo "${MPIVER}" | awk -F. 
'{ printf("%d%02d%02d\n", $1,$2,$3); }') - && mv /root/rpmbuild/RPMS/x86_64/openmpi-${MPIVER}-1.*.rpm /root; \ fi -FROM nvidia/cuda:${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}-cudnn${CUDNN_VERSION}-devel-centos7 +FROM nvidia/cuda:${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}-cudnn${CUDNN_VERSION}-devel-rockylinux8 ARG PIP_INS_OPTS ARG PYTHONWARNINGS ARG CURL_OPTS ARG WGET_OPTS -ARG YUM_OPTS +ARG DNF_OPTS ARG BUILD_WITH_CUTENSOR=True ENV LC_ALL C ENV LANG C ENV LANGUAGE C -RUN eval ${YUM_OPTS} \ - && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub -o D42D0685.pub \ +RUN eval ${DNF_OPTS} \ + && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub -o D42D0685.pub \ && rpm --import D42D0685.pub \ - && yum install -y \ + && dnf install -y \ epel-release \ - yum-utils \ - && yum install -y \ - ca-certificates \ - curl \ - freetype-devel \ - git \ - hdf5 \ - hdf5-devel \ - lapack-devel \ - libjpeg-devel \ - xz-devel \ - libpng-devel \ - redhat-lsb-core \ - rpm-build \ - unzip \ - wget \ - which \ - zip \ - zlib-static \ - libmpc-devel \ - mpfr-devel \ - gmp-devel \ - openssl-devel \ - bzip2-devel \ - libffi-devel \ - nsight-systems-2021.1.3 \ - libsndfile \ - python-backports-lzma \ - openssl11 \ - openssl11-devel \ - zlib-devel \ - bzip2 bzip2-devel \ - readline-devel \ - sqlite \ - sqlite-devel \ - tk-devel \ - numactl-libs \ - && if [ "${BUILD_WITH_CUTENSOR}" == "True" ]; then yum install -y libcutensor-devel; fi \ - && yum group install -y "Development Tools" \ - && yum clean all + dnf-plugins-core \ + && dnf config-manager --set-enabled powertools \ + && dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \ + && dnf install -y \ + ca-certificates \ + curl \ + freetype-devel \ + git \ + hdf5 \ + hdf5-devel \ + lapack-devel \ + libjpeg-devel \ + xz-devel \ + libpng-devel \ + redhat-lsb-core \ + 
rpm-build \ + unzip \ + wget \ + which \ + zip \ + zlib-static \ + libmpc-devel \ + mpfr-devel \ + gmp-devel \ + openssl-devel \ + bzip2-devel \ + libffi-devel \ + nsight-systems-2023.2.3 \ + libsndfile \ + zlib-devel \ + bzip2 bzip2-devel \ + readline-devel \ + sqlite \ + sqlite-devel \ + tk-devel \ + numactl-libs \ + && if [ "${BUILD_WITH_CUTENSOR}" == "True" ]; then dnf install -y libcutensor-devel; fi \ + && dnf group install -y "Development Tools" \ + && dnf clean all ################################################### nvidia ARG CUDA_VERSION_MAJOR=10 @@ -152,13 +153,13 @@ ARG CUDA_VERSION_MINOR=0 RUN mkdir /tmp/deps \ && cd /tmp/deps \ - && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm -o nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm \ - && yum install -y nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm \ - && yum clean all \ + && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/machine-learning/repos/rhel8/x86_64/nvidia-machine-learning-repo-rhel8-1.0.0-1.x86_64.rpm -o nvidia-machine-learning-repo-rhel8-1.0.0-1.x86_64.rpm \ + && dnf install -y nvidia-machine-learning-repo-rhel8-1.0.0-1.x86_64.rpm \ + && dnf clean all \ && cd / \ && rm -rf /tmp/* -RUN yum install -y libnccl-*+cuda${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR%.?} libnccl-devel-*+cuda${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR%.?} +RUN dnf install -y libnccl-*-*+cuda${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR%.?} libnccl-devel-*-*+cuda${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR%.?} ################################################### gcc RUN mkdir /tmp/deps \ @@ -176,7 +177,7 @@ RUN mkdir /tmp/deps \ ARG CMAKEVER=3.18.4 RUN mkdir /tmp/deps \ && cd /tmp/deps \ - && yum install -y cmake3 openssl-devel \ + && dnf install -y cmake3 openssl-devel \ && curl ${CURL_OPTS} -L https://github.com/Kitware/CMake/releases/download/v${CMAKEVER}/cmake-${CMAKEVER}.tar.gz -o 
cmake-${CMAKEVER}.tar.gz \ && tar xf cmake-${CMAKEVER}.tar.gz \ && cd cmake-${CMAKEVER} \ @@ -185,9 +186,9 @@ RUN mkdir /tmp/deps \ && cmake3 -DBUILD_TESTING=FALSE .. \ && make -j8 \ && make install \ - && yum remove -y cmake3 \ - && yum clean all \ - && rm -rf /var/cache/yum/* \ + && dnf remove -y cmake3 \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* \ && cd / \ && rm -rf /tmp/* @@ -201,11 +202,11 @@ RUN mkdir /tmp/deps \ && mkdir build \ && cd build \ && cmake \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -Dprotobuf_BUILD_TESTS=OFF \ - -DCMAKE_CXX_STANDARD=14 \ - -D CMAKE_C_COMPILER=gcc CMAKE_CXX_COMPILER=g++ /usr/bin/gcc \ - ../cmake \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -Dprotobuf_BUILD_TESTS=OFF \ + -DCMAKE_CXX_STANDARD=14 \ + -D CMAKE_C_COMPILER=gcc CMAKE_CXX_COMPILER=g++ /usr/bin/gcc \ + ../cmake \ && make -j8 \ && make install \ && cd / \ @@ -220,13 +221,13 @@ RUN mkdir /tmp/deps \ && mkdir libarchive-build \ && cd libarchive-build \ && cmake \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_NETTLE=FALSE -DENABLE_OPENSSL=FALSE \ - -DENABLE_LZO=FALSE -DENABLE_LZMA=FALSE -DENABLE_BZip2=FALSE \ - -DENABLE_LIBXML2=FALSE -DENABLE_EXPAT=FALSE -DENABLE_PCREPOSIX=FALSE -DENABLE_LibGCC=FALSE \ - -DENABLE_CNG=FALSE -DENABLE_TAR=FALSE -DENABLE_TAR_SHARED=FALSE -DENABLE_CPIO=FALSE \ - -DENABLE_CPIO_SHARED=FALSE -DENABLE_CAT=FALSE -DENABLE_CAT_SHARED=FALSE -DENABLE_XATTR=FALSE \ - -DENABLE_ACL=FALSE -DENABLE_ICONV=FALSE -DENABLE_TEST=FALSE \ - ../libarchive-${LIBARCHIVEVER} \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_NETTLE=FALSE -DENABLE_OPENSSL=FALSE \ + -DENABLE_LZO=FALSE -DENABLE_LZMA=FALSE -DENABLE_BZip2=FALSE \ + -DENABLE_LIBXML2=FALSE -DENABLE_EXPAT=FALSE -DENABLE_PCREPOSIX=FALSE -DENABLE_LibGCC=FALSE \ + -DENABLE_CNG=FALSE -DENABLE_TAR=FALSE -DENABLE_TAR_SHARED=FALSE -DENABLE_CPIO=FALSE \ + -DENABLE_CPIO_SHARED=FALSE -DENABLE_CAT=FALSE -DENABLE_CAT_SHARED=FALSE -DENABLE_XATTR=FALSE \ + -DENABLE_ACL=FALSE -DENABLE_ICONV=FALSE -DENABLE_TEST=FALSE 
\ + ../libarchive-${LIBARCHIVEVER} \ && make -j8 \ && make install \ && cd / \ @@ -262,7 +263,6 @@ RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv \ && export PYTHON_BUILD_CURL_OPTS="${CURL_OPTS}" \ && export PYTHON_BUILD_WGET_OPTS="${WGET_OPTS}" \ && export PYTHON_CONFIGURE_OPTS=--disable-shared \ - && if [ ${PYTHON_VERSION_MINOR} -ge 10 ]; then export CPPFLAGS=-I/usr/include/openssl11 && export LDFLAGS=-L/usr/lib64/openssl11; fi \ && eval "$(pyenv init -)" \ && python-build `pyenv latest -k ${PYVERNAME}` /usr/local \ && pyenv global system \ @@ -275,7 +275,7 @@ RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv \ && rm -rf ~/.pyenv/.git /tmp/* RUN rm -f /usr/lib64/libstdc++.so.6 -ENV PATH /tmp/.local/bin:/opt/nvidia/nsight-systems/2021.1.3/host-linux-x64/:$PATH +ENV PATH /tmp/.local/bin:/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/:$PATH ENV LD_LIBRARY_PATH /usr/local/lib64:$LD_LIBRARY_PATH ENV CC /usr/local/bin/gcc ENV CXX /usr/local/bin/g++ diff --git a/docker/development/Dockerfile.build-mpi-ppc64le b/docker/development/Dockerfile.build-mpi-ppc64le index 2257806af..1ef6b667a 100644 --- a/docker/development/Dockerfile.build-mpi-ppc64le +++ b/docker/development/Dockerfile.build-mpi-ppc64le @@ -19,33 +19,35 @@ ARG CUDA_VERSION_MINOR=2 ARG CUDNN_VERSION=8 ############################################################ OpenMPI -# CentOS7: 1.10.7 (12) -# Ubuntu16: 1.10.2 (12) -# Ubuntu18: 2.1.1 (20) -# Ubuntu20: 4.0.3 (40) -# CentOS7: 3.1.3 (40) -# ABCI: 2.1.6 (20) -FROM centos:7 as openmpi +# CentOS7: 1.10.7-1 (12) +# Ubuntu16: 1.10.2-1 (12) +# Ubuntu18: 2.1.1-1 (20) +# CentOS7: 3.1.6-1 (30) +# RockyLinux8: 3.1.6-1 (30) +# Ubuntu20: 4.1.5-1 (40) +# ABCI: 4.1.5-1 (40) + +FROM ppc64le/rockylinux:9 as openmpi ARG PIP_INS_OPTS ARG PYTHONWARNINGS ARG CURL_OPTS ARG WGET_OPTS -ARG YUM_OPTS +ARG DNF_OPTS ENV LC_ALL C ENV LANG C ENV LANGUAGE C -RUN eval ${YUM_OPTS} \ - && yum install -y \ +RUN eval ${DNF_OPTS} \ + && dnf install -y \ epel-release \ - 
yum-utils \ - && yum install -y \ - curl \ - rpm-build \ - && yum group install -y "Development Tools" \ - && yum clean all + dnf-plugins-core \ + && dnf install -y \ + curl \ + rpm-build \ + && dnf group install -y "Development Tools" \ + && dnf clean all ARG MPIVER=2.1.6 RUN mkdir /root/rpmbuild @@ -53,63 +55,61 @@ RUN cd /root/rpmbuild; curl ${CURL_OPTS} https://download.open-mpi.org/release/o RUN cd /root/rpmbuild; rpmbuild --rebuild openmpi-${MPIVER}-1.src.rpm RUN mv /root/rpmbuild/RPMS/ppc64le/openmpi-${MPIVER}-1.*.rpm /root -FROM nvidia/cuda-ppc64le:${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}-cudnn${CUDNN_VERSION}-devel-centos7 +FROM nvidia/cuda-ppc64le:${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}-cudnn${CUDNN_VERSION}-devel-rockylinux8 ARG PIP_INS_OPTS ARG PYTHONWARNINGS ARG CURL_OPTS ARG WGET_OPTS -ARG YUM_OPTS +ARG DNF_OPTS ARG BUILD_WITH_CUTENSOR=True ENV LC_ALL C ENV LANG C ENV LANGUAGE C -RUN eval ${YUM_OPTS} \ - && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub -o D42D0685.pub \ +RUN eval ${DNF_OPTS} \ + && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub -o D42D0685.pub \ && rpm --import D42D0685.pub \ - && yum install -y \ + && dnf install -y \ epel-release \ - yum-utils \ - && yum install -y \ - ca-certificates \ - curl \ - freetype-devel \ - git \ - hdf5 \ - hdf5-devel \ - lapack-devel \ - libjpeg-devel \ - xz-devel \ - libpng-devel \ - redhat-lsb-core \ - rpm-build \ - unzip \ - wget \ - which \ - zip \ - zlib-static \ - libmpc-devel \ - mpfr-devel \ - gmp-devel \ - openssl-devel \ - bzip2-devel \ - libffi-devel \ - libsndfile \ - python-backports-lzma \ - openssl11 \ - openssl11-devel \ - zlib-devel \ - bzip2 \ - readline-devel \ - sqlite \ - sqlite-devel \ - tk-devel \ - xz-devel \ - && if [ "${BUILD_WITH_CUTENSOR}" == "True" ]; then yum install -y libcutensor-devel; fi \ - && yum group install -y "Development Tools" \ - && yum clean 
all + dnf-plugins-core \ + && dnf config-manager --set-enabled powertools \ + && dnf install -y \ + ca-certificates \ + curl \ + freetype-devel \ + git \ + hdf5 \ + hdf5-devel \ + lapack-devel \ + libjpeg-devel \ + xz-devel \ + libpng-devel \ + redhat-lsb-core \ + rpm-build \ + unzip \ + wget \ + which \ + zip \ + zlib-static \ + libmpc-devel \ + mpfr-devel \ + gmp-devel \ + openssl-devel \ + bzip2-devel \ + libffi-devel \ + libsndfile \ + zlib-devel \ + bzip2 \ + readline-devel \ + sqlite \ + sqlite-devel \ + tk-devel \ + xz-devel \ + && if [ "${BUILD_WITH_CUTENSOR}" == "True" ]; then dnf install -y libcutensor-devel; fi \ + && dnf group install -y "Development Tools" \ + && dnf clean all ################################################### nvidia ARG CUDA_VERSION_MAJOR=10 @@ -117,14 +117,14 @@ ARG CUDA_VERSION_MINOR=0 RUN mkdir /tmp/deps \ && cd /tmp/deps \ - && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/ppc64le/nvidia-machine-learning-repo-rhel7-1.0.0-1.ppc64le.rpm -o nvidia-machine-learning-repo-rhel7-1.0.0-1.ppc64le.rpm \ - && yum install -y ./nvidia-machine-learning-repo-rhel7-1.0.0-1.ppc64le.rpm \ - && yum clean all \ + && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/machine-learning/repos/rhel8/ppc64le/nvidia-machine-learning-repo-rhel8-1.0.0-1.ppc64le.rpm -o nvidia-machine-learning-repo-rhel8-1.0.0-1.ppc64le.rpm \ + && dnf install -y ./nvidia-machine-learning-repo-rhel8-1.0.0-1.ppc64le.rpm \ + && dnf clean all \ && cd / \ && rm -rf /tmp/* -RUN yum install -y libnccl-*+cuda${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR%.?} libnccl-devel-*+cuda${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR%.?} +RUN dnf install -y libnccl-*-*+cuda${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR%.?} libnccl-devel-*-*+cuda${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR%.?} ################################################### gcc RUN mkdir /tmp/deps \ @@ -142,7 +142,7 @@ RUN mkdir /tmp/deps \ ARG CMAKEVER=3.14.3 RUN mkdir 
/tmp/deps \ && cd /tmp/deps \ - && yum install -y cmake3 \ + && dnf install -y cmake3 \ && curl ${CURL_OPTS} -L https://github.com/Kitware/CMake/releases/download/v${CMAKEVER}/cmake-${CMAKEVER}.tar.gz -o cmake-${CMAKEVER}.tar.gz \ && tar xf cmake-${CMAKEVER}.tar.gz \ && cd cmake-${CMAKEVER} \ @@ -151,9 +151,9 @@ RUN mkdir /tmp/deps \ && cmake3 -DBUILD_TESTING=FALSE .. \ && make \ && make install \ - && yum remove -y cmake3 \ - && yum clean all \ - && rm -rf /var/cache/yum/* \ + && dnf remove -y cmake3 \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* \ && cd / \ && rm -rf /tmp/* @@ -167,11 +167,11 @@ RUN mkdir /tmp/deps \ && mkdir build \ && cd build \ && cmake \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -Dprotobuf_BUILD_TESTS=OFF \ - -DCMAKE_CXX_STANDARD=14 \ - -D CMAKE_C_COMPILER=gcc CMAKE_CXX_COMPILER=g++ /usr/bin/gcc \ - ../cmake \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -Dprotobuf_BUILD_TESTS=OFF \ + -DCMAKE_CXX_STANDARD=14 \ + -D CMAKE_C_COMPILER=gcc CMAKE_CXX_COMPILER=g++ /usr/bin/gcc \ + ../cmake \ && make \ && make install \ && cd / \ @@ -185,13 +185,13 @@ RUN mkdir /tmp/deps \ && mkdir libarchive-build \ && cd libarchive-build \ && cmake \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_NETTLE=FALSE -DENABLE_OPENSSL=FALSE \ - -DENABLE_LZO=FALSE -DENABLE_LZMA=FALSE -DENABLE_BZip2=FALSE \ - -DENABLE_LIBXML2=FALSE -DENABLE_EXPAT=FALSE -DENABLE_PCREPOSIX=FALSE -DENABLE_LibGCC=FALSE \ - -DENABLE_CNG=FALSE -DENABLE_TAR=FALSE -DENABLE_TAR_SHARED=FALSE -DENABLE_CPIO=FALSE \ - -DENABLE_CPIO_SHARED=FALSE -DENABLE_CAT=FALSE -DENABLE_CAT_SHARED=FALSE -DENABLE_XATTR=FALSE \ - -DENABLE_ACL=FALSE -DENABLE_ICONV=FALSE -DENABLE_TEST=FALSE \ - ../libarchive-3.3.2 \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_NETTLE=FALSE -DENABLE_OPENSSL=FALSE \ + -DENABLE_LZO=FALSE -DENABLE_LZMA=FALSE -DENABLE_BZip2=FALSE \ + -DENABLE_LIBXML2=FALSE -DENABLE_EXPAT=FALSE -DENABLE_PCREPOSIX=FALSE -DENABLE_LibGCC=FALSE \ + -DENABLE_CNG=FALSE -DENABLE_TAR=FALSE 
-DENABLE_TAR_SHARED=FALSE -DENABLE_CPIO=FALSE \ + -DENABLE_CPIO_SHARED=FALSE -DENABLE_CAT=FALSE -DENABLE_CAT_SHARED=FALSE -DENABLE_XATTR=FALSE \ + -DENABLE_ACL=FALSE -DENABLE_ICONV=FALSE -DENABLE_TEST=FALSE \ + ../libarchive-3.3.2 \ && make install \ && cd / \ && rm -rf /tmp/* @@ -224,7 +224,6 @@ RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv \ && export PYTHON_BUILD_CURL_OPTS="${CURL_OPTS}" \ && export PYTHON_BUILD_WGET_OPTS="${WGET_OPTS}" \ && export PYTHON_CONFIGURE_OPTS=--disable-shared \ - && if [ ${PYTHON_VERSION_MINOR} -ge 10 ]; then export CPPFLAGS=-I/usr/include/openssl11 && export LDFLAGS=-L/usr/lib64/openssl11; fi \ && eval "$(pyenv init -)" \ && python-build `pyenv latest -k ${PYVERNAME}` /usr/local \ && pyenv global system \ @@ -233,7 +232,7 @@ RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv \ && rm -rf ~/.pyenv/.git /tmp/* RUN rm -f /usr/lib64/libstdc++.so.6 -ENV PATH /tmp/.local/bin:/opt/nvidia/nsight-systems/2021.1.3/host-linux-x64/:$PATH +ENV PATH /tmp/.local/bin:/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/:$PATH ENV LD_LIBRARY_PATH /usr/local/lib64:$LD_LIBRARY_PATH ENV CC /usr/local/bin/gcc ENV CXX /usr/local/bin/g++ diff --git a/docker/development/Dockerfile.cuda-cudnn-lib-in-wheel-test b/docker/development/Dockerfile.cuda-cudnn-lib-in-wheel-test index 62195e34d..e1570e68c 100644 --- a/docker/development/Dockerfile.cuda-cudnn-lib-in-wheel-test +++ b/docker/development/Dockerfile.cuda-cudnn-lib-in-wheel-test @@ -13,44 +13,46 @@ # limitations under the License. 
############################################################ OpenMPI -# CentOS7: 1.10.7-1 (12) -# Ubuntu16: 1.10.2-1 (12) -# Ubuntu18: 2.1.1-1 (20) -# Ubuntu20: 4.0.3-1 (40) -# CentOS7: 3.1.3-1 (40) -# ABCI: 2.1.6-1 (20) -FROM centos:7 as openmpi +# CentOS7: 1.10.7-1 (12) +# Ubuntu16: 1.10.2-1 (12) +# Ubuntu18: 2.1.1-1 (20) +# CentOS7: 3.1.6-1 (30) +# RockyLinux8: 3.1.6-1 (30) +# Ubuntu20: 4.1.5-1 (40) +# ABCI: 4.1.5-1 (40) + +FROM rockylinux:8 as openmpi ARG PIP_INS_OPTS ARG PYTHONWARNINGS ARG CURL_OPTS ARG WGET_OPTS -ARG YUM_OPTS +ARG DNF_OPTS ARG MPIVER=3.1.6 -ARG HPCX_URL_centos +ARG HPCX_URL_rhel ENV LC_ALL C ENV LANG C ENV LANGUAGE C -RUN eval ${YUM_OPTS} \ - && yum install -y \ +RUN eval ${DNF_OPTS} \ + && dnf install -y \ epel-release \ - yum-utils \ - && yum install -y \ - curl \ - rpm-build \ + dnf-plugins-core \ + && dnf install -y \ + curl \ + rpm-build \ && if [ $(echo "${MPIVER}" | awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }') -gt 30106 ]; then \ - yum install -y libibverbs \ + dnf install -y libibverbs \ librdmacm \ rdma-core-devel \ numactl-libs \ numactl-devel \ binutils-devel \ patch; \ - fi \ - && yum group install -y "Development Tools" \ - && yum clean all + fi \ + && dnf group install -y "Development Tools" \ + && dnf clean all COPY docker/release/hpcx-init.patch /tmp/hpcx-init.patch COPY docker/release/hpcx-ompi-etc.patch /tmp/hpcx-ompi-etc.patch @@ -58,7 +60,7 @@ COPY docker/release/hpcx-ompi-etc.patch /tmp/hpcx-ompi-etc.patch RUN if [ $(echo "${MPIVER}" | awk -F. '{ printf("%d%02d%02d\n", $1,$2,$3); }') -gt 30106 ]; then \ mkdir /root/openmpi-hpcx \ && cd /root/openmpi-hpcx \ - && curl ${CURL_OPTS} -LO ${HPCX_URL_centos} \ + && curl ${CURL_OPTS} -LO ${HPCX_URL_rhel} \ && tar -xvf hpcx*.tbz \ && rm -f hpcx*.tbz \ && mv hpcx* hpcx \ @@ -81,62 +83,70 @@ RUN if [ $(echo "${MPIVER}" | awk -F. 
'{ printf("%d%02d%02d\n", $1,$2,$3); }') - && mv /root/rpmbuild/RPMS/x86_64/openmpi-${MPIVER}-1.*.rpm /root; \ fi -FROM centos:7 +FROM rockylinux:8 ARG PIP_INS_OPTS ARG PYTHONWARNINGS ARG CURL_OPTS ARG WGET_OPTS -ARG YUM_OPTS +ARG DNF_OPTS + +ARG CUDA_VERSION_MAJOR +ARG CUDA_VERSION_MINOR ENV LC_ALL C ENV LANG C ENV LANGUAGE C -RUN eval ${YUM_OPTS} \ - && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub -o D42D0685.pub \ +RUN eval ${DNF_OPTS} \ + && curl ${CURL_OPTS} -L https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub -o D42D0685.pub \ && rpm --import D42D0685.pub \ - && yum install -y \ + && dnf install -y \ epel-release \ - yum-utils \ - && yum install -y \ - ca-certificates \ - curl \ - freetype-devel \ - git \ - hdf5 \ - hdf5-devel \ - lapack-devel \ - libjpeg-devel \ - xz-devel \ - libpng-devel \ - redhat-lsb-core \ - rpm-build \ - unzip \ - wget \ - which \ - zip \ - zlib-static \ - libmpc-devel \ - mpfr-devel \ - gmp-devel \ - openssl-devel \ - bzip2-devel \ - libffi-devel \ - nsight-systems-2021.1.3 \ - libsndfile \ - python-backports-lzma \ - openssl11 \ - openssl11-devel \ - zlib-devel \ - bzip2 \ - readline-devel \ - sqlite \ - sqlite-devel \ - tk-devel \ - numactl-libs \ - && yum group install -y "Development Tools" \ - && yum clean all + dnf-plugins-core \ + && dnf config-manager --set-enabled powertools \ + && dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \ + && dnf install -y \ + ca-certificates \ + curl \ + freetype-devel \ + git \ + hdf5 \ + hdf5-devel \ + lapack-devel \ + libjpeg-devel \ + xz-devel \ + libpng-devel \ + redhat-lsb-core \ + rpm-build \ + unzip \ + wget \ + which \ + zip \ + zlib-static \ + libmpc-devel \ + mpfr-devel \ + gmp-devel \ + openssl-devel \ + bzip2-devel \ + libffi-devel \ + libsndfile \ + zlib-devel \ + bzip2 \ + readline-devel \ + sqlite \ + sqlite-devel \ + 
tk-devel \ + numactl-libs \ + && dnf install -y \ + nsight-systems-2023.2.3 \ + cuda-toolkit-config-common \ + cuda-toolkit-${CUDA_VERSION_MAJOR}-config-common \ + cuda-toolkit-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR%.*}-config-common \ + cuda-nvtx-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR%.*} \ + cuda-nvprof-${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR%.*} \ + && dnf group install -y "Development Tools" \ + && dnf clean all ################################################### gcc RUN mkdir /tmp/deps \ @@ -154,7 +164,7 @@ RUN mkdir /tmp/deps \ ARG CMAKEVER=3.18.4 RUN mkdir /tmp/deps \ && cd /tmp/deps \ - && yum install -y cmake3 openssl-devel \ + && dnf install -y cmake3 openssl-devel \ && curl ${CURL_OPTS} -L https://github.com/Kitware/CMake/releases/download/v${CMAKEVER}/cmake-${CMAKEVER}.tar.gz -o cmake-${CMAKEVER}.tar.gz \ && tar xf cmake-${CMAKEVER}.tar.gz \ && cd cmake-${CMAKEVER} \ @@ -163,9 +173,9 @@ RUN mkdir /tmp/deps \ && cmake3 -DBUILD_TESTING=FALSE .. \ && make -j8 \ && make install \ - && yum remove -y cmake3 \ - && yum clean all \ - && rm -rf /var/cache/yum/* \ + && dnf remove -y cmake3 \ + && dnf clean all \ + && rm -rf /var/cache/dnf/* \ && cd / \ && rm -rf /tmp/* @@ -179,27 +189,25 @@ ARG PYTHON_VER ADD python/requirements.txt /tmp/deps/ ARG ARCH_SUFFIX -ARG CUDA_VERSION_MAJOR ################################################## build python from pyenv RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv \ - && export PYENV_ROOT="$HOME/.pyenv" \ - && export PATH="$PYENV_ROOT/bin:$PYENV_ROOT/plugins/python-build/bin:$PATH" \ - && export PYTHON_BUILD_CURL_OPTS="${CURL_OPTS}" \ - && export PYTHON_BUILD_WGET_OPTS="${WGET_OPTS}" \ + && export PYENV_ROOT="$HOME/.pyenv" \ + && export PATH="$PYENV_ROOT/bin:$PYENV_ROOT/plugins/python-build/bin:$PATH" \ + && export PYTHON_BUILD_CURL_OPTS="${CURL_OPTS}" \ + && export PYTHON_BUILD_WGET_OPTS="${WGET_OPTS}" \ && export PYTHON_CONFIGURE_OPTS=--disable-shared \ && PYTHON_VERSION_MINOR=${PYTHON_VER#*.} \ - && 
if [ ${PYTHON_VERSION_MINOR} -ge 10 ]; then export CPPFLAGS=-I/usr/include/openssl11 && export LDFLAGS=-L/usr/lib64/openssl11; fi \ - && eval "$(pyenv init -)" \ - && python-build `pyenv latest -k ${PYTHON_VER}` /usr/local \ - && pyenv global system \ - && pip install ${PIP_INS_OPTS} --no-cache-dir -U pip \ - && pip install ${PIP_INS_OPTS} --no-cache-dir -U -r /tmp/deps/requirements.txt \ - && pip install ${PIP_INS_OPTS} --no-cache-dir --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION_MAJOR}0 \ - && rm -rf ~/.pyenv/.git /tmp/* + && eval "$(pyenv init -)" \ + && python-build `pyenv latest -k ${PYTHON_VER}` /usr/local \ + && pyenv global system \ + && pip install ${PIP_INS_OPTS} --no-cache-dir -U pip \ + && pip install ${PIP_INS_OPTS} --no-cache-dir -U -r /tmp/deps/requirements.txt \ + && pip install ${PIP_INS_OPTS} --no-cache-dir --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION_MAJOR}0 \ + && rm -rf ~/.pyenv/.git /tmp/* RUN rm -f /usr/lib64/libstdc++.so.6 -ENV PATH /tmp/.local/bin:/opt/nvidia/nsight-systems/2021.1.3/host-linux-x64/:$PATH +ENV PATH /tmp/.local/bin:/opt/nvidia/nsight-systems/2023.2.3/host-linux-x64/:$PATH ENV LD_LIBRARY_PATH /usr/local/lib64:$LD_LIBRARY_PATH ENV CC /usr/local/bin/gcc ENV CXX /usr/local/bin/g++ diff --git a/docker/runtime/Dockerfile.runtime-mpi b/docker/runtime/Dockerfile.runtime-mpi index 14e0782c8..362073ad2 100644 --- a/docker/runtime/Dockerfile.runtime-mpi +++ b/docker/runtime/Dockerfile.runtime-mpi @@ -17,12 +17,13 @@ ARG BASE ############################################################ # Build OpenMPI ############################################################ -# CentOS7: 1.10.7 (12) -# Ubuntu16: 1.10.2 (12) -# Ubuntu18: 2.1.1 (20) -# Ubuntu20: 4.0.3 (40) -# CentOS7: 3.1.3 (40) -# ABCI: 2.1.6 (20) +# CentOS7: 1.10.7-1 (12) +# Ubuntu16: 1.10.2-1 (12) +# Ubuntu18: 2.1.1-1 (20) +# CentOS7: 3.1.6-1 (30) +# RockyLinux8: 3.1.6-1 
(30) +# Ubuntu20: 4.1.5-1 (40) +# ABCI: 4.1.5-1 (40) FROM ubuntu:20.04 as openmpi @@ -115,7 +116,6 @@ RUN eval ${APT_OPTS} \ && export PYTHON_BUILD_CURL_OPTS="${CURL_OPTS}" \ && export PYTHON_BUILD_WGET_OPTS="${WGET_OPTS}" \ && export PYTHON_CONFIGURE_OPTS=--disable-shared \ - && if [ ${PYTHON_VERSION_MINOR} -ge 10 ]; then export CPPFLAGS=-I/usr/include/openssl11 && export LDFLAGS=-L/usr/lib64/openssl11; fi \ && eval "$(pyenv init -)" \ && python-build `pyenv latest -k ${PYVERNAME}` /usr/local \ && pyenv global system \