Commit a38b572

iotamudelta authored and facebook-github-bot committed
enable unit tests and other changes (pytorch#10266)
Summary: This PR for the ROCm target does the following:

* enable some unit tests on ROCm
* fix a missing static_cast that breaks the BatchNorm call on ROCm
* fix BatchNorm to work on ROCm with ROCm warp sizes etc.
* improve the pyHIPIFY script by introducing kernel scope to some transpilations, and other improvements
* fix a linking issue on ROCm
* for more unit test sets: mark currently broken tests as broken (to be fixed)
* enable THINLTO (phase one) to parallelize linking
* address the first failure of the elementwise kernel by removing a non-working ROCm specialization

Pull Request resolved: pytorch#10266
Differential Revision: D9184178
Pulled By: ezyang
fbshipit-source-id: 03bcd1fe4ca4dd3241f09634dbd42b6a4c350297
Parent: e0d4357
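One of the summary bullets gives pyHIPIFY a notion of kernel scope: certain CUDA-to-HIP rewrites are only valid inside __global__ kernel bodies, not across the whole translation unit. The script itself is not part of the excerpt below, so here is a minimal Python sketch of the idea; the function names, the regex, and the mapping entry are all illustrative, not taken from pyHIPIFY.

import re

# Toy kernel matcher: assumes no templates or nested parentheses in the signature.
KERNEL_RE = re.compile(r"__global__\s+\w+\s+\w+\s*\([^)]*\)\s*\{")

def find_matching_brace(src, open_idx):
    """Return the index of the '}' matching the '{' at open_idx."""
    depth = 0
    for i in range(open_idx, len(src)):
        if src[i] == "{":
            depth += 1
        elif src[i] == "}":
            depth -= 1
            if depth == 0:
                return i
    raise ValueError("unbalanced braces")

def hipify_kernel_scoped(src, mapping):
    """Apply `mapping` only inside __global__ kernel bodies."""
    out, pos = [], 0
    for m in KERNEL_RE.finditer(src):
        body_end = find_matching_brace(src, m.end() - 1)
        out.append(src[pos:m.end()])          # keep everything up to the '{'
        body = src[m.end():body_end]
        for cuda_name, hip_name in mapping.items():
            body = body.replace(cuda_name, hip_name)  # device-only rewrite
        out.append(body)
        pos = body_end
    out.append(src[pos:])
    return "".join(out)

# Host code keeps std::abs; only the kernel body is rewritten.
src = "__global__ void k(float* x) { x[0] = std::abs(x[0]); }\nfloat h(float x) { return std::abs(x); }"
print(hipify_kernel_scoped(src, {"std::abs": "fabsf"}))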

File tree: 19 files changed, +216 −35 lines


.jenkins/pytorch/build.sh

Lines changed: 7 additions & 1 deletion
@@ -30,6 +30,7 @@ cmake --version
 pip install -r requirements.txt || true
 
 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
+  # This is necessary in order to cross compile (or else we'll have missing GPU device).
   export MAX_JOBS=4
   # This is necessary in order to cross compile (or else we'll have missing GPU device).
   export HCC_AMDGPU_TARGET=gfx900

@@ -41,7 +42,12 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
 
   # This environment variable enabled HCC Optimizations that speed up the linking stage.
   # https://github.com/RadeonOpenCompute/hcc#hcc-with-thinlto-linking
-  # export KMTHINLTO=1
+  export KMTHINLTO=1
+
+  # Need the libc++1 and libc++abi1 libraries to allow torch._C to load at runtime
+  sudo apt-get install libc++1
+  sudo apt-get install libc++abi1
+
   python tools/amd_build/build_pytorch_amd.py
   USE_ROCM=1 python setup.py install --user
   exit 0

.jenkins/pytorch/disabled-configs.txt

Lines changed: 0 additions & 2 deletions
@@ -3,5 +3,3 @@
 # fail. You can use this to temporarily reserve a test name to
 # turn on CI side before PyTorch repository supports it. This
 # file has the same format as .jenkins/enabled-configs.txt
-
-py2-clang3.8-rocm1.7.1-ubuntu16.04-test

.jenkins/pytorch/enabled-configs.txt

Lines changed: 1 addition & 0 deletions
@@ -41,3 +41,4 @@ pytorch-docker-build-test
 short-perf-test-cpu
 short-perf-test-gpu
 py2-clang3.8-rocm1.7.1-ubuntu16.04-build
+py2-clang3.8-rocm1.7.1-ubuntu16.04-test

.jenkins/pytorch/test.sh

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ test_python_all_except_nn() {
 
 test_aten() {
   # Test ATen
-  if [[ "$BUILD_ENVIRONMENT" != *asan* ]]; then
+  if ([[ "$BUILD_ENVIRONMENT" != *asan* ]] && [[ "$BUILD_ENVIRONMENT" != *rocm* ]]); then
     echo "Running ATen tests with pytorch lib"
     TORCH_LIB_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/torch/lib
     # NB: the ATen test binaries don't have RPATH set, so it's necessary to

@@ -101,7 +101,7 @@ test_torchvision() {
   # this should be a transient requirement...)
   # See https://github.com/pytorch/pytorch/issues/7525
   #time python setup.py install
-  pip install .
+  pip install --user .
   popd
 }
 

aten/src/ATen/native/cuda/Loops.cuh

Lines changed: 0 additions & 4 deletions
@@ -28,11 +28,7 @@ namespace at { namespace native {
 
 template<int nt, int vt, typename func_t>
 __launch_bounds__(nt, 4)
-#ifdef __HIP_PLATFORM_HCC__
-__global__ void elementwise_kernel(int N, const func_t& f) {
-#else
 __global__ void elementwise_kernel(int N, func_t f) {
-#endif
   int tid = threadIdx.x;
   int nv = nt * vt;
   int idx = nv * blockIdx.x + tid;

aten/src/THCUNN/BatchNormalization.cu

Lines changed: 13 additions & 1 deletion
@@ -7,14 +7,26 @@
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
+#if defined(__HIP_PLATFORM_HCC__)
+const int WARP_SIZE = 64;
+#else
 const int WARP_SIZE = 32;
+#endif
 
 // The maximum number of threads in a block
+#if defined(__HIP_PLATFORM_HCC__)
+const int MAX_BLOCK_SIZE = 256;
+#else
 const int MAX_BLOCK_SIZE = 512;
+#endif
 
 // Number of threads in a block given an input size up to MAX_BLOCK_SIZE
 static int getNumThreads(int nElem) {
+#if defined(__HIP_PLATFORM_HCC__)
+  int threadSizes[5] = { 16, 32, 64, 128, MAX_BLOCK_SIZE };
+#else
   int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
+#endif
   for (int i = 0; i != 5; ++i) {
     if (nElem <= threadSizes[i]) {
       return threadSizes[i];

@@ -116,7 +128,7 @@ __device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
   sum = warpSum(sum);
 
   // 'transpose', and reduce within warp again
-  __shared__ T shared[32];
+  __shared__ T shared[WARP_SIZE];
   __syncthreads();
   if (threadIdx.x % WARP_SIZE == 0) {
     shared[threadIdx.x / WARP_SIZE] = sum;
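For intuition about why the staging buffer changes from shared[32] to shared[WARP_SIZE]: in the usual two-pass block reduction, each warp first reduces its own lanes, one partial per warp is staged in shared memory, and the first warp then reads a full warp's worth of slots (unused ones zeroed) to finish the job. With 64-wide ROCm wavefronts that second pass touches indices up to 63, so a hard-coded 32-entry buffer under-allocates. A small Python model of the scheme (illustrative only, not the CUDA source):

def block_reduce(values, warp_size):
    """Model a two-pass block reduction; len(values) = threads per block."""
    num_warps = len(values) // warp_size  # assumes num_warps <= warp_size
    # Pass 1: each warp reduces its own lanes.
    partials = [sum(values[w * warp_size:(w + 1) * warp_size])
                for w in range(num_warps)]
    # Staging buffer: pass 2 reads warp_size slots (unused ones are zeroed),
    # so it must hold warp_size entries. A fixed shared[32] under-allocates
    # once warp_size is 64, as on ROCm.
    shared = partials + [0] * (warp_size - num_warps)
    # Pass 2: the first warp reduces the staged partials.
    return sum(shared[:warp_size])

assert block_reduce(list(range(256)), warp_size=64) == sum(range(256))
assert block_reduce(list(range(512)), warp_size=32) == sum(range(512))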

aten/src/THCUNN/generic/BatchNormalization.cu

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ void THNN_(BatchNormalization_updateOutput)(
     dim3 blocks(input.getSize(1));
     dim3 threads(getNumThreads(input.getSize(2)));
     BatchNormalizationUpdateOutput_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
-      input, output, weight, bias, eps, momentum, runningMean, runningVar,
+      input, output, weight, bias, static_cast<accreal>(eps), static_cast<accreal>(momentum), runningMean, runningVar,
       saveMean, saveStd);
   }
   THCudaCheck(cudaGetLastError());

caffe2/CMakeLists.txt

Lines changed: 15 additions & 4 deletions
@@ -341,13 +341,24 @@ endif()
 # ---[ Caffe2 HIP sources.
 if(USE_ROCM)
   # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs.
-  IF(BUILD_ATEN)
-    HIP_INCLUDE_DIRECTORIES(${Caffe2_HIP_INCLUDES})
-  ENDIF()
+  if(BUILD_ATEN)
+    # Get Compile Definitions from the directory (FindHIP.CMake bug)
+    get_directory_property(MY_DEFINITIONS COMPILE_DEFINITIONS)
+    if(MY_DEFINITIONS)
+      foreach(_item ${MY_DEFINITIONS})
+        LIST(APPEND HIP_HCC_FLAGS "-D${_item}")
+      endforeach()
+    endif()
+
+    # Call again since Caffe2_HIP_INCLUDES is extended with ATen include dirs.
+    hip_include_directories(${Caffe2_HIP_INCLUDES})
+  endif()
   IF(BUILD_CAFFE2)
     set_source_files_properties(${Caffe2_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
   ENDIF()
-  hip_add_library(caffe2_hip ${Caffe2_HIP_SRCS})
+
+  # FindHIP.CMake checks if the SHARED flag is set and adds extra logic accordingly.
+  hip_add_library(caffe2_hip SHARED ${Caffe2_HIP_SRCS})
 
   # Since PyTorch files contain HIP headers, these flags are required for the necessary definitions to be added.
   set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${HIP_HIPCC_FLAGS})
cmake/public/LoadHIP.cmake

Lines changed: 2 additions & 0 deletions
@@ -111,6 +111,8 @@ IF(HIP_FOUND)
   set(CMAKE_HIP_ARCHIVE_CREATE ${CMAKE_CXX_ARCHIVE_CREATE})
   set(CMAKE_HIP_ARCHIVE_APPEND ${CMAKE_CXX_ARCHIVE_APPEND})
   set(CMAKE_HIP_ARCHIVE_FINISH ${CMAKE_CXX_ARCHIVE_FINISH})
+  SET(CMAKE_HCC_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
+  SET(CMAKE_HCC_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
   ### Remove setting of Flags when FindHIP.CMake PR #558 is accepted.###
 
   set(rocrand_DIR ${ROCRAND_PATH}/lib/cmake/rocrand)

test/common.py

Lines changed: 11 additions & 1 deletion
@@ -98,13 +98,23 @@ def _check_module_exists(name):
     import numpy
 
 
+def skipIfRocm(fn):
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        if TEST_WITH_ROCM:
+            raise unittest.SkipTest("test doesn't currently work on the ROCm stack")
+        else:
+            fn(*args, **kwargs)
+    return wrapper
+
+
 def skipIfNoLapack(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
         try:
             fn(*args, **kwargs)
         except Exception as e:
-            if 'Lapack library not found' in e.args[0]:
+            if 'Lapack library not found' in repr(e):
                 raise unittest.SkipTest('Compiled without Lapack')
             raise
     return wrapper
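For context, the new decorator is used the same way as skipIfNoLapack below it: stack it on a test method and the test is skipped on ROCm builds instead of failing. A hypothetical example (the test class and method are made up for illustration):

import unittest

from common import skipIfRocm  # the helper added above in test/common.py

class TestBatchNorm(unittest.TestCase):  # hypothetical test case
    @skipIfRocm
    def test_not_yet_working_on_rocm(self):
        # On a ROCm build, skipIfRocm raises unittest.SkipTest before the
        # body runs; on every other build the test executes normally.
        self.assertEqual(2 + 2, 4)

if __name__ == '__main__':
    unittest.main()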
