diff --git a/check_binary.sh b/check_binary.sh index 7298f21b5..15de28194 100755 --- a/check_binary.sh +++ b/check_binary.sh @@ -24,7 +24,8 @@ set -eux -o pipefail # The install root depends on both the package type and the os # All MacOS packages use conda, even for the wheel packages. if [[ "$PACKAGE_TYPE" == libtorch ]]; then - install_root="$pwd" + # NOTE: Only $PWD works on both CentOS and Ubuntu + install_root="$PWD" else py_dot="${DESIRED_PYTHON:0:3}" install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/" @@ -34,24 +35,27 @@ fi ############################################################################### # Check GCC ABI ############################################################################### + +# NOTE [ Building libtorch with old vs. new gcc ABI ] +# +# Packages built with one version of ABI could not be linked against by client +# C++ libraries that were compiled using the other version of ABI. Since both +# gcc ABIs are still common in the wild, we need to support both ABIs. Currently: +# +# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI. +# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI. + echo "Checking that the gcc ABI is what we expect" if [[ "$(uname)" != 'Darwin' ]]; then function is_expected() { - # This commented out logic is what you'd expect if 'devtoolset7' actually - # built with the new GCC ABI, but it doesn't; it always builds with ABI=0. - # When a compiler is added that does build with new ABI, then replace - # devtoolset7 (and the DESIRED_DEVTOOLSET variable) with your new compiler - #if [[ "$DESIRED_DEVTOOLSET" == 'devtoolset7' ]]; then - # if [[ "$1" -gt 0 || "$1" == "ON" ]]; then - # echo 1 - # fi - #else - # if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then - # echo 1 - # fi - #fi - if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then - echo 1 + if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then + if [[ "$1" -gt 0 || "$1" == "ON " ]]; then + echo 1 + fi + else + if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then + echo 1 + fi fi } @@ -90,26 +94,75 @@ if [[ "$(uname)" != 'Darwin' ]]; then fi # We also check that there are [not] cxx11 symbols in libtorch - # TODO this doesn't catch everything. Even when building with the old ABI - # there are 44 symbols in the new ABI in the libtorch library, making this - # check return true. This should actually check that the number of new ABI - # symbols is sufficiently large. - # Also, this is wrong on the old ABI, since there are some cxx11 symbols with - # devtoolset7. - #echo "Checking that symbols in libtorch.so have the right gcc abi" - #libtorch="${install_root}/lib/libtorch.so" - #cxx11_symbols="$(nm "$libtorch" | c++filt | grep __cxx11 | wc -l)" || true - #if [[ "$(is_expected $cxx11_symbols)" != 1 ]]; then - # if [[ "$cxx11_symbols" == 0 ]]; then - # echo "No cxx11 symbols found, but there should be." - # else - # echo "Found cxx11 symbols but there shouldn't be. Dumping symbols" - # nm "$libtorch" | c++filt | grep __cxx11 - # fi - # exit 1 - #else - # echo "cxx11 symbols seem to be in order" - #fi + # + # To check whether it is using cxx11 ABI, check non-existence of symbol: + PRE_CXX11_SYMBOLS=( + "std::basic_string" + "std::list" + ) + # To check whether it is using pre-cxx11 ABI, check non-existence of symbol: + CXX11_SYMBOLS=( + "std::__cxx11::basic_string" + "std::__cxx11::list" + ) + # NOTE: Checking the above symbols in all namespaces doesn't work, because + # devtoolset7 always produces some cxx11 symbols even if we build with old ABI, + # and CuDNN always has pre-cxx11 symbols even if we build with new ABI using gcc 5.4. + # Instead, we *only* check the above symbols in the following namespaces: + LIBTORCH_NAMESPACE_LIST=( + "c10::" + "at::" + "caffe2::" + "torch::" + ) + echo "Checking that symbols in libtorch.so have the right gcc abi" + grep_symbols () { + symbols=("$@") + for namespace in "${LIBTORCH_NAMESPACE_LIST[@]}" + do + for symbol in "${symbols[@]}" + do + nm "$lib" | c++filt | grep " $namespace".*$symbol + done + done + } + check_lib_symbols_for_abi_correctness () { + lib=$1 + echo "lib: " $lib + if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then + num_pre_cxx11_symbols=$(grep_symbols "${PRE_CXX11_SYMBOLS[@]}" | wc -l) || true + echo "num_pre_cxx11_symbols: " $num_pre_cxx11_symbols + if [[ "$num_pre_cxx11_symbols" -gt 0 ]]; then + echo "Found pre-cxx11 symbols but there shouldn't be. Dumping symbols" + grep_symbols "${PRE_CXX11_SYMBOLS[@]}" + exit 1 + fi + num_cxx11_symbols=$(grep_symbols "${CXX11_SYMBOLS[@]}" | wc -l) || true + echo "num_cxx11_symbols: " $num_cxx11_symbols + if [[ "$num_cxx11_symbols" -lt 1000 ]]; then + echo "Didn't find enough cxx11 symbols. Aborting." + exit 1 + fi + else + num_cxx11_symbols=$(grep_symbols "${CXX11_SYMBOLS[@]}" | wc -l) || true + echo "num_cxx11_symbols: " $num_cxx11_symbols + if [[ "$num_cxx11_symbols" -gt 0 ]]; then + echo "Found cxx11 symbols but there shouldn't be. Dumping symbols" + grep_symbols "${CXX11_SYMBOLS[@]}" + exit 1 + fi + num_pre_cxx11_symbols=$(grep_symbols "${PRE_CXX11_SYMBOLS[@]}" | wc -l) || true + echo "num_pre_cxx11_symbols: " $num_pre_cxx11_symbols + if [[ "$num_pre_cxx11_symbols" -lt 1000 ]]; then + echo "Didn't find enough pre-cxx11 symbols. Aborting." + exit 1 + fi + fi + } + libtorch="${install_root}/lib/libtorch.so" + check_lib_symbols_for_abi_correctness $libtorch + + echo "cxx11 symbols seem to be in order" fi # if on Darwin ############################################################################### @@ -160,6 +213,7 @@ fi ############################################################################### if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then # For libtorch testing is done. All further tests require Python + # TODO: We should run those further tests for libtorch as well exit 0 fi python -c 'import torch' diff --git a/conda/.DS_Store b/conda/.DS_Store new file mode 100644 index 000000000..5008ddfcf Binary files /dev/null and b/conda/.DS_Store differ diff --git a/manywheel/build.sh b/manywheel/build.sh index a7c2e474e..f00542d86 100755 --- a/manywheel/build.sh +++ b/manywheel/build.sh @@ -22,8 +22,22 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then fi # Determine CUDA version and architectures to build for -CUDA_VERSION=$(nvcc --version|tail -n1|cut -f5 -d" "|cut -f1 -d",") -echo "CUDA $CUDA_VERSION Detected" +# +# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`, +# because in some cases a single Docker image can have multiple CUDA versions +# on it, and `nvcc --version` might not show the CUDA version we want. +if [[ -n "$DESIRED_CUDA" ]]; then + # cu90, cu92, cu100, cu101 + if [[ ${#DESIRED_CUDA} -eq 4 ]]; then + CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}" + elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then + CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}" + fi + echo "Using CUDA $CUDA_VERSION as determined by DESIRED_CUDA" +else + CUDA_VERSION=$(nvcc --version|tail -n1|cut -f5 -d" "|cut -f1 -d",") + echo "CUDA $CUDA_VERSION Detected" +fi export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX" if [[ $CUDA_VERSION == "9.0" ]]; then @@ -60,13 +74,20 @@ if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then fi mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true +OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` +if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then + LIBGOMP_PATH="/usr/lib64/libgomp.so.1" +elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then + LIBGOMP_PATH="/usr/lib/gcc/x86_64-linux-gnu/5/libgomp.so" +fi + if [[ $CUDA_VERSION == "9.0" ]]; then DEPS_LIST=( "/usr/local/cuda/lib64/libcudart.so.9.0" "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libnvrtc.so.9.0" "/usr/local/cuda/lib64/libnvrtc-builtins.so" - "/usr/lib64/libgomp.so.1" + "$LIBGOMP_PATH" ) DEPS_SONAME=( @@ -82,7 +103,7 @@ DEPS_LIST=( "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libnvrtc.so.9.2" "/usr/local/cuda/lib64/libnvrtc-builtins.so" - "/usr/lib64/libgomp.so.1" + "$LIBGOMP_PATH" ) DEPS_SONAME=( @@ -98,7 +119,7 @@ DEPS_LIST=( "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libnvrtc.so.10.0" "/usr/local/cuda/lib64/libnvrtc-builtins.so" - "/usr/lib64/libgomp.so.1" + "$LIBGOMP_PATH" ) DEPS_SONAME=( @@ -114,7 +135,7 @@ DEPS_LIST=( "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libnvrtc.so.10.1" "/usr/local/cuda/lib64/libnvrtc-builtins.so" - "/usr/lib64/libgomp.so.1" + "$LIBGOMP_PATH" ) DEPS_SONAME=( @@ -132,6 +153,12 @@ fi # builder/test.sh requires DESIRED_CUDA to know what tests to exclude export DESIRED_CUDA="$cuda_version_nodot" +# Switch `/usr/local/cuda` to the desired CUDA version +rm -rf /usr/local/cuda || true +ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda +export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130 +export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0 +export CUDNN_VERSION=$(ls /usr/local/cuda/lib64/libcudnn.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" source $SCRIPTPATH/build_common.sh diff --git a/manywheel/build_common.sh b/manywheel/build_common.sh index 60f91be84..74d7bb7e8 100644 --- a/manywheel/build_common.sh +++ b/manywheel/build_common.sh @@ -20,7 +20,13 @@ retry () { } # TODO move this into the Docker images -retry yum install -q -y zip openssl +OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` +if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then + retry yum install -q -y zip openssl +elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then + retry apt-get update + retry apt-get -y install zip openssl +fi # We use the package name to test the package by passing this to 'pip install' # This is the env variable that setup.py uses to name the package. Note that @@ -97,6 +103,13 @@ if [[ "$DESIRED_PYTHON" == "cp37-cp37m" ]]; then else retry pip install -q numpy==1.11 fi + +if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then + export _GLIBCXX_USE_CXX11_ABI=1 +else + export _GLIBCXX_USE_CXX11_ABI=0 +fi + echo "Calling setup.py bdist at $(date)" time CMAKE_ARGS=${CMAKE_ARGS[@]} \ EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ @@ -142,9 +155,16 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then echo "$(pushd $pytorch_rootdir && git rev-parse HEAD)" > libtorch/build-hash mkdir -p /tmp/$LIBTORCH_HOUSE_DIR - zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch - cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \ - /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_VARIANT-latest.zip + + if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then + LIBTORCH_ABI="cxx11-abi-" + else + LIBTORCH_ABI= + fi + + zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch + cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \ + /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip fi popd diff --git a/manywheel/build_cpu.sh b/manywheel/build_cpu.sh index cfd3b21d2..8fb407dbf 100755 --- a/manywheel/build_cpu.sh +++ b/manywheel/build_cpu.sh @@ -26,9 +26,15 @@ if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then fi mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true +OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` +if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then + LIBGOMP_PATH="/usr/lib64/libgomp.so.1" +elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then + LIBGOMP_PATH="/usr/lib/gcc/x86_64-linux-gnu/5/libgomp.so" +fi DEPS_LIST=( - "/usr/lib64/libgomp.so.1" + "$LIBGOMP_PATH" ) DEPS_SONAME=( diff --git a/smoke_test.sh b/smoke_test.sh index 7d035fce1..9f207de72 100755 --- a/smoke_test.sh +++ b/smoke_test.sh @@ -48,7 +48,12 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then else libtorch_variant="$LIBTORCH_VARIANT" fi - package_name="libtorch-$libtorch_variant-${NIGHTLIES_DATE_PREAMBLE}${DATE}.zip" + if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then + LIBTORCH_ABI="cxx11-abi-" + else + LIBTORCH_ABI= + fi + package_name="libtorch-$LIBTORCH_ABI$libtorch_variant-${NIGHTLIES_DATE_PREAMBLE}${DATE}.zip" elif [[ "$PACKAGE_TYPE" == *wheel ]]; then package_name='torch' elif [[ "$DESIRED_CUDA" == 'cpu' && "$(uname)" != 'Darwin' ]]; then diff --git a/update_compiler.sh b/update_compiler.sh index 22f7721e1..d04fa1517 100755 --- a/update_compiler.sh +++ b/update_compiler.sh @@ -1,6 +1,8 @@ #!/bin/bash set -ex +# NOTE: This script is called by default on all nightlies. + # Expected to be run on a Docker image built from # https://github.com/pytorch/builder/blob/master/conda/Dockerfile (or the # manywheel equivalent) @@ -10,34 +12,19 @@ set -ex # Why does this file exist? Why not just update the compiler on the base docker # images? # -# So, all the nightlies used to be built on devtoolset3 with the old gcc ABI. -# These packages worked well for most people, but could not be linked against -# by client c++ libraries that were compiled using the new gcc ABI. Since both -# gcc ABIs are still common in the wild, we should be able to support both -# ABIs. Hence, we need a script to alter the compiler on the root docker images -# to configure which ABI we want to build with. +# Answer: Yes we should just update the compiler to devtoolset7 on all the CentOS +# base docker images. There's no reason to keep around devtoolset3 because it's +# not used anymore. # -# So then this script was written to change from devtoolset3 to devtoolset7. It -# turns out that this doesn't actually fix the problem, since devtoolset7 is -# incapable of building with the new gcc ABI. BUT, devtoolset7 /is/ able to +# We use devtoolset7 instead of devtoolset3 because devtoolset7 /is/ able to # build with avx512 instructions, which are needed for fbgemm to get good -# performance. So now this script is called by default on all nightlies. -# -# But we still don't have the new gcc ABI. So what should happen next is -# - Upgrade the compiler on all the base docker images to be devtoolset7. -# There's no reason to keep around devtoolset3. It will never be used. -# - Alter this script to install another compiler toolchain, not a devtoolset#, -# which can build with the new gcc ABI. Then use this script as intended, in -# a parallel suite of new-gcc-ABI nightlies. +# performance. # -# When this script is finally changed to build with the new gcc ABI, then we'll -# need to set this variable manually because -# https://github.com/pytorch/pytorch/blob/master/torch/abi-check.cpp sets the -# ABI to 0 by default. -# ``` export _GLIBCXX_USE_CXX11_ABI=1 ``` -# Note that this probably needs to get set in the .circleci infra that's -# running this, since env variables set in this file are probably discarded. -# ~~~ +# Note that devtoolset7 still *cannot* build with the new gcc ABI +# (see https://bugzilla.redhat.com/show_bug.cgi?id=1546704). Instead, we use +# Ubuntu 16.04 + gcc 5.4 to build with the new gcc ABI, using an Ubuntu 16.04 +# base docker image. +# For details, see NOTE [ Building libtorch with old vs. new gcc ABI ]. # The gcc version should be 4.9.2 right now echo "Initial gcc version is $(gcc --version)"