Commit 8a4d1e2

jataylo authored and dnikolaev-amd committed
Add skipIfRocmArch decorator for Navi skips (#1356)
1 parent d98149c commit 8a4d1e2

File tree: 4 files changed, +64 -1 lines changed

test/inductor/test_cuda_repro.py

Lines changed: 5 additions & 1 deletion
@@ -31,6 +31,7 @@
     freeze_rng_state,
     IS_FBCODE,
     skipIfRocm,
+    skipIfRocmArch,
     TEST_WITH_ASAN,
 )
 from torch.testing._internal.inductor_utils import skipCUDAIf
@@ -52,7 +53,7 @@
         sys.exit(0)
     raise
 
-
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm
 TestCase = test_torchinductor.TestCase
 ToTuple = test_torchinductor.ToTuple
 check_model_cuda = test_torchinductor.check_model_cuda
@@ -336,6 +337,7 @@ def foo(x):
         out_ref.add_(2)
         # self.assertEqual(out_ref, out)
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_accuracy_issue1(self):
         class Repro(torch.nn.Module):
             def __init__(self) -> None:
@@ -372,6 +374,7 @@ def forward(self, start_positions: torch.Tensor, x: torch.Tensor):
         assert same_two_models(mod, opt_mod, args), "Dynamo failed"
 
     @config.patch(allow_buffer_reuse=False)
+    @skipIfRocmArch(NAVI_ARCH)
     def test_issue103461(self):
         def forward(add_1):
             var_mean = torch.ops.aten.var_mean.correction(
@@ -870,6 +873,7 @@ def forward(self, x):
         res2 = jit_func(x)
         self.assertEqual(res1, res2)
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_issue103481(self):
         def fn(x, y):
             # NOTE: 6 dimensions is important! does not fail for 5 dimensions

test/inductor/test_torchinductor.py

Lines changed: 25 additions & 0 deletions
@@ -85,6 +85,7 @@
     skipIfWindows,
     skipIfXpu,
     subtest,
+    skipIfRocmArch,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
 )
@@ -119,6 +120,10 @@
 
 
 HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
+_desired_test_bases = get_desired_device_type_test_bases()
+RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
+RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm
 
 aten = torch.ops.aten
 
@@ -1794,6 +1799,7 @@ def fn(x):
         # make sure things also work if they aren't unrolled
         self.common(fn, (torch.randn(8, 3),))
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_multilayer_sum_low_prec(self):
         # fp16 nyi for cpu
         if self.device == "cpu":
@@ -1804,6 +1810,7 @@ def fn(a):
 
         self.common(fn, ((torch.rand((10, 3, 352, 352), dtype=torch.float16),)))
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_multilayer_prime_size(self):
         def fn(a):
             return torch.max(a), torch.sum(a)
@@ -1815,6 +1822,7 @@ def fn(a):
 
     @skip_if_gpu_halide
     @skipCPUIf(IS_MACOS, "fails on macos")
+    @skipIfRocmArch(NAVI_ARCH)
     def test_multilayer_var(self):
         def fn(a):
             return torch.var(a)
@@ -2966,6 +2974,7 @@ def fn(a, b):
         self.common(fn, (torch.randn(8, 8), torch.randn(8, 8)))
 
     @skip_if_halide  # only 32-bit indexing
+    @skipIfRocmArch(NAVI_ARCH)
     def test_large_tensor_reduction(self):
         if not _has_sufficient_memory(self.device, 4.5 * 1024**3):  # 4.5 GiB
             raise unittest.SkipTest("insufficient memory")
@@ -2987,6 +2996,7 @@ def fn(a):
         self.assertEqual(actual, expect)
 
     @skip_if_gpu_halide  # only 32-bit indexing
+    @skipIfRocmArch(NAVI_ARCH)
     def test_large_broadcast_reduction(self):
         if self.device == "cpu":
             raise unittest.SkipTest("Fails on CPU")
@@ -4148,6 +4158,7 @@ def test_conv2d_channels_last(self):
             check_lowp=False,
         )
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_conv2d_backward_channels_last(self):
         def fn(grad_output, inp, weight):
             convolution_backward_8 = torch.ops.aten.convolution_backward.default(
@@ -4932,6 +4943,7 @@ def fn(x, y):
         self.assertEqual(c.stride()[2], 1)
 
     @skip_if_gpu_halide
+    @skipIfRocmArch(NAVI_ARCH)
     def test_std(self):
         def fn(x):
             return (
@@ -4974,6 +4986,7 @@ def test_batch_norm_2d(self):
 
     # From yolov3
     @with_tf32_off
+    @skipIfRocmArch(NAVI_ARCH)
     def test_batch_norm_2d_2(self):
         if self.device == "cpu":
             raise unittest.SkipTest(f"requires {GPU_TYPE}")
@@ -5120,6 +5133,7 @@ def fn(dist, angle):
         self.common(fn, (*inp,))
 
     @skip_if_gpu_halide  # incorrect result on CUDA
+    @skipIfRocmArch(NAVI_ARCH)
     def test_cauchy(self):
         def fn(x, y):
             return torch.sum(1 / (torch.unsqueeze(x, -1) - y))
@@ -6520,6 +6534,7 @@ def fn(a):
         y = fn_compiled(x)
         self.assertTrue(y is not x)
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_l1_loss(self):
         def fn(a, b):
             return torch.nn.functional.l1_loss(a, b), torch.nn.functional.mse_loss(a, b)
@@ -6920,6 +6935,7 @@ def fn(x):
             fn, (torch.tensor([1, float("inf"), 2, float("-inf"), float("nan")]),)
         )
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_any(self):
         def fn(x):
             return (
@@ -7686,6 +7702,8 @@ def fn(a, dim, index, b, reduce):
         )
 
     @skip_if_gpu_halide
+    # issue #1150
+    @skipIfRocmArch(NAVI_ARCH)
     def test_dense_mask_index(self):
         r"""
         There will be a little difference for reduce order between aten and inductor
@@ -8693,6 +8711,7 @@ def fn(a, b):
         b = torch.rand(2, 2, 1, 4, 1).int()
         self.common(fn, (a, b))
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_argmax_argmin1(self):
         def fn(x):
             return (aten.argmax(x), aten.argmin(x))
@@ -8704,6 +8723,7 @@ def fn(x):
             ],
         )
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_argmax_argmin2(self):
         def fn(x):
             return (
@@ -8715,6 +8735,7 @@ def fn(x):
 
         self.common(fn, (torch.randn([144, 144]),))
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_argmax_argmin_with_duplicates(self):
         def fn(x):
             return (
@@ -8737,6 +8758,7 @@ def fn(x):
         self.common(fn, (t1,))
 
     @skip_if_halide  # nan behavior
+    @skipIfRocmArch(NAVI_ARCH)
     def test_argmax_argmin_with_nan(self):
         def fn(x):
             return (
@@ -8860,6 +8882,7 @@ def fn(x):
             ],
         )
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_tmp_not_defined_issue1(self):
         def forward(
             primals_3,
@@ -9259,6 +9282,7 @@ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
         else:
             self.assertEqual(len(inps), 0)
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_dtype_mismatch_issue(self):
         def fn(x):
             attn = torch.nn.functional.pad(x, [0, 1])
@@ -12349,6 +12373,7 @@ def test_rnn_compile_safe(self):
 
 class NanCheckerTest(TestCase):
     @config.patch("nan_asserts", True)
+    @skipIfRocmArch(NAVI_ARCH)
     def test_nan_checker_pass(self):
         def f(x):
             return torch.softmax(x, dim=-1)

test/inductor/test_torchinductor_opinfo.py

Lines changed: 14 additions & 0 deletions
@@ -30,6 +30,7 @@
 from torch.testing._internal.common_methods_invocations import op_db, skipOps
 from torch.testing._internal.common_utils import (
     dtype_abbrs,
+    IS_NAVI,
     IS_MACOS,
     IS_X86,
     skipCUDAMemoryLeakCheckIf,
@@ -203,6 +204,19 @@ def format_op(op):
 # Tensors are not alike
 inductor_skips["cuda"]["logcumsumexp"] = {f32}
 inductor_skips["cuda"]["special.modified_bessel_i1"] = {f64}
+if IS_NAVI:
+    inductor_skips["cuda"]["aminmax"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["dist"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["kron"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["masked.std"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["masked.var"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"][("max", "reduction_no_dim")] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"][("min", "reduction_no_dim")] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["nn.functional.conv_transpose3d"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["std"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["std_mean"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["var"] = {b8, f16, f32, f64, i32, i64}
+    inductor_skips["cuda"]["var_mean"] = {b8, f16, f32, f64, i32, i64}
 
 inductor_expected_failures_single_sample = defaultdict(dict)
 
torch/testing/_internal/common_utils.py

Lines changed: 20 additions & 0 deletions
@@ -1279,6 +1279,13 @@ def printErrors(self) -> None:
 IS_X86 = platform.machine() in ('x86_64', 'i386')
 IS_ARM64 = platform.machine() in ('arm64', 'aarch64')
 
+IS_NAVI=False
+if torch.cuda.is_available():
+    prop = torch.cuda.get_device_properties(0)
+    gfx_arch = prop.gcnArchName.split(":")[0]
+    if gfx_arch in ["gfx1100", "gfx1101", "gfx1102"]:
+        IS_NAVI = True
+
 def is_avx512_vnni_supported():
     if sys.platform != 'linux':
         return False
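
For context (this note is not part of the commit's diff), the detection above keeps only the part of gcnArchName before the first colon because on some ROCm devices the reported arch string carries feature suffixes (for example a value like "gfx90a:sramecc+:xnack-"; suffixes vary by device and may be absent). A minimal, illustrative check, assuming a ROCm build of PyTorch:

import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    # On ROCm builds gcnArchName reports the GPU ISA; anything after the first
    # colon is a feature flag, so only the leading token is compared.
    arch = getattr(props, "gcnArchName", "").split(":")[0]
    print(arch)  # e.g. "gfx1100" on a Navi 31 GPU
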
@@ -1754,6 +1761,19 @@ def wrapper(*args, **kwargs):
         return dec_fn(func)
     return dec_fn
 
+def skipIfRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
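
For reference, a minimal sketch (a hypothetical test file, not part of this commit) of how the new decorator is intended to be used, mirroring the NAVI_ARCH tuple defined in the inductor tests; it assumes a PyTorch build that includes this patch:

import torch
from torch.testing._internal.common_utils import run_tests, skipIfRocmArch, TestCase

NAVI_ARCH = ("gfx1100", "gfx1101")  # same tuple as in the inductor test files

class ExampleRocmTests(TestCase):
    @skipIfRocmArch(NAVI_ARCH)  # skipped only on ROCm when the GPU arch is in NAVI_ARCH
    def test_sum_matches_flattened_sum(self):
        x = torch.randn(8, 8)
        self.assertEqual(x.sum(), x.flatten().sum())

if __name__ == "__main__":
    run_tests()
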
