diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index 95d96a1a0..9127a7f3a 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -44,7 +44,7 @@ jobs: run: | libtorch/build_docker.sh build-docker-rocm: - runs-on: ubuntu-22.04 + runs-on: linux.12xlarge strategy: matrix: rocm_version: ["5.4.2", "5.5"] diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 4fa947118..6404796a0 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -46,7 +46,7 @@ jobs: run: | manywheel/build_docker.sh build-docker-rocm: - runs-on: ubuntu-22.04 + runs-on: linux.12xlarge strategy: matrix: rocm_version: ["5.4.2", "5.5"] diff --git a/common/install_miopen.sh b/common/install_miopen.sh index 69de4fe5b..8d40545e5 100644 --- a/common/install_miopen.sh +++ b/common/install_miopen.sh @@ -33,9 +33,7 @@ if [[ $ROCM_INT -lt 40001 ]]; then exit 0 fi -# CHANGED: Do not uninstall. To avoid out of disk space issues, we will copy lib over existing. -# Uninstall existing package, to avoid errors during later yum install indicating packages did not change. -#yum remove -y miopen-hip +yum remove -y miopen-hip # Function to retry functions that sometimes timeout or have flaky failures retry () { @@ -77,16 +75,6 @@ elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36" MIOPEN_BRANCH="release/rocm-rel-5.0-staging" -elif [[ $ROCM_INT -ge 40500 ]] && [[ $ROCM_INT -lt 50000 ]]; then - MIOPEN_CMAKE_COMMON_FLAGS="${MIOPEN_CMAKE_COMMON_FLAGS} -DMIOPEN_USE_HIP_KERNELS=Off -DMIOPEN_DEFAULT_FIND_MODE=Normal" - MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36" - MIOPEN_BRANCH="release/rocm-rel-4.5-staging" -elif [[ $ROCM_INT -ge 40300 ]] && [[ $ROCM_INT -lt 40500 ]]; then - MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx900_64;gfx906_60;gfx906_64;gfx90878;gfx1030_36" - MIOPEN_BRANCH="release/rocm-rel-4.3-staging" -elif [[ $ROCM_INT -ge 40200 ]] && [[ $ROCM_INT -lt 40300 ]]; then - MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx803_36;gfx803_64;gfx900_56;gfx900_64;gfx906_60;gfx906_64;gfx90878" - MIOPEN_BRANCH="rocm-4.2.x-staging" else echo "Unhandled ROCM_VERSION ${ROCM_VERSION}" exit 1 @@ -94,7 +82,7 @@ fi git clone https://github.com/ROCmSoftwarePlatform/MIOpen -b ${MIOPEN_BRANCH} pushd MIOpen -# remove .git to save disk space ince CI runner was running out +# remove .git to save disk space since CI runner was running out rm -rf .git # Don't build MLIR to save docker build time # since we are disabling MLIR backend for MIOpen anyway @@ -122,18 +110,13 @@ PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}" make MIOpen -j $(nproc) -# CHANGED: Do not build package. # Build MIOpen package -#make -j $(nproc) package +make -j $(nproc) package # clean up since CI runner was running out of disk space rm -rf /usr/local/cget -# CHANGED: Do not install package, just copy lib over existing. -#yum install -y miopen-*.rpm -dest=$(ls ${ROCM_INSTALL_PATH}/lib/libMIOpen.so.1.0.*) -rm -f ${dest} -cp lib/libMIOpen.so.1.0 ${dest} +yum install -y miopen-*.rpm popd rm -rf MIOpen diff --git a/common/install_rocm_magma.sh b/common/install_rocm_magma.sh index 73c0dbe55..c37c1e30a 100644 --- a/common/install_rocm_magma.sh +++ b/common/install_rocm_magma.sh @@ -32,7 +32,7 @@ else amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs` fi for arch in $amdgpu_targets; do - echo "DEVCCFLAGS += --amdgpu-target=$arch" >> make.inc + echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc done # hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition sed -i 's/^FOPENMP/#FOPENMP/g' make.inc diff --git a/libtorch/Dockerfile b/libtorch/Dockerfile index 84dffe0d8..cfc0a1e05 100644 --- a/libtorch/Dockerfile +++ b/libtorch/Dockerfile @@ -60,24 +60,18 @@ FROM cpu as rocm ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ENV MKLROOT /opt/intel -ADD ./common/install_rocm.sh install_rocm.sh +# No need to install ROCm as base docker image should have full ROCm install +#ADD ./common/install_rocm.sh install_rocm.sh ADD ./common/install_rocm_drm.sh install_rocm_drm.sh -#ADD ./common/install_rocm_magma.sh install_rocm_magma.sh +ADD ./common/install_rocm_magma.sh install_rocm_magma.sh # gfortran and python needed for building magma from source for ROCm RUN apt-get update -y && \ apt-get install gfortran -y && \ apt-get install python -y && \ apt-get clean -FROM rocm as rocm5.4.2 -RUN ROCM_VERSION=5.4.2 bash ./install_rocm.sh && rm install_rocm.sh RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh -#RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh - -FROM rocm as rocm5.5 -RUN ROCM_VERSION=5.5 bash ./install_rocm.sh && rm install_rocm.sh -RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh -#RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh FROM ${BASE_TARGET} as final # Install LLVM diff --git a/libtorch/build_docker.sh b/libtorch/build_docker.sh index 50d75c709..299c8ae43 100755 --- a/libtorch/build_docker.sh +++ b/libtorch/build_docker.sh @@ -25,9 +25,9 @@ case ${GPU_ARCH_TYPE} in DOCKER_GPU_BUILD_ARG="" ;; rocm) - BASE_TARGET=rocm${GPU_ARCH_VERSION} + BASE_TARGET=rocm DOCKER_TAG=rocm${GPU_ARCH_VERSION} - GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-magma + GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100" ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)" if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then diff --git a/manywheel/Dockerfile b/manywheel/Dockerfile index 5ccb5d200..1d4ecbcfd 100644 --- a/manywheel/Dockerfile +++ b/manywheel/Dockerfile @@ -157,15 +157,14 @@ FROM cpu_final as rocm_final ARG ROCM_VERSION=3.7 ARG PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} -# Install ROCm -ADD ./common/install_rocm.sh install_rocm.sh -RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh +# No need to install ROCm as base docker image should have full ROCm install +#ADD ./common/install_rocm.sh install_rocm.sh +#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh ADD ./common/install_rocm_drm.sh install_rocm_drm.sh RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh # cmake3 is needed for the MIOpen build RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3 -### The following is now performed beforehand in a new GPU_IMAGE with magma and miopen preinstalled -#ADD ./common/install_rocm_magma.sh install_rocm_magma.sh -#RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh -#ADD ./common/install_miopen.sh install_miopen.sh -#RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh +ADD ./common/install_rocm_magma.sh install_rocm_magma.sh +RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh +ADD ./common/install_miopen.sh install_miopen.sh +RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh diff --git a/manywheel/build_docker.sh b/manywheel/build_docker.sh index 55abdd1d9..6b1e31c04 100755 --- a/manywheel/build_docker.sh +++ b/manywheel/build_docker.sh @@ -44,7 +44,7 @@ case ${GPU_ARCH_TYPE} in TARGET=rocm_final DOCKER_TAG=rocm${GPU_ARCH_VERSION} LEGACY_DOCKER_IMAGE=${DOCKER_REGISTRY}/pytorch/manylinux-rocm:${GPU_ARCH_VERSION} - GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-magma-miopen-staging + GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100" ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)" if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then