
Commit b9c976d

pruthvistony authored and jithunnair-amd committed
CONSOLIDATED COMMITS: unit test skips and unskips
=================================================

Temporarily skip test_conv3d_64bit_indexing - rocBLAS API support is requested - SWDEV-383635 & sub-task SWDEV-390218

Skip ddp apply_optim_in_bwd tests for gloo (#1302)
  To resolve https://ontrack-internal.amd.com/browse/SWDEV-403530 and https://ontrack-internal.amd.com/browse/SWDEV-419837.
  For more context, see upstream issue pytorch#111834

Add skipIfRocmArch decorator for Navi skips (#1356)

Converted NAVI check into a function (#1364)
  * Moved NAVI check to the test file
  * Revised NAVI check as a function

[Navi] [Inductor] Unskip Navi inductor UTs (#1514)
  Relates to https://ontrack-internal.amd.com/browse/SWDEV-461590

Fix bad import in test_torchinductor and skip torchvision-related UT (#1374)

Skip failing test_inductor_freezing UTs (#1375)

Skip test_mm_triton_kernel_benchmark (#1376)
  * Running the Triton kernel on ROCm reports only one GB/s metric
  * Update test_kernel_benchmark.py

Skip vmapvjpvjp_linalg_householder_product_cuda_float32 (#1420)
  skipIfRocm needs the msg parameter

[NO CP] Updated changes to skip a few UTs

Imported skipIfRocm in certain test suites (#1577)
  Fixes SWDEV-472397

Added function imports (#1521)
  Fixes inductor.test_torchinductor_dynamic_shapes::TestInductorDynamicCUDA::test_item_unbacked_stride_nobreak_cuda

Enable test_public_api_surface (#1601)
  Fixes SWDEV-462410. Enable this unit test since PyTorch issue pytorch#104012 has been closed. This unit test runs fine on MI100/MI300 and upstream.
  (cherry picked from commit 0001d4ab5070635cfecc146ee299bbb9fa45ca67)

[rocm6.3_internal_testing] Fixed error string assertion in test_invalid_devices (#1607)
  Fixes pytorch#8974
  (cherry picked from commit a688e0a)
1 parent d260fe8 · commit b9c976d

14 files changed (+69, −12 lines)
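Several of the hunks below stack @skipIfRocm onto existing tests, and the commit message notes that skipIfRocm needs the msg parameter when a reason is attached. As a reading aid only, here is a minimal, hypothetical sketch of that pattern; the test class and test names are illustrative and not part of this commit.

import torch
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TestCase


class ExampleRocmSkips(TestCase):  # hypothetical class, not part of this commit
    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")  # skipped on ROCm builds, runs elsewhere
    def test_temporarily_skipped_on_rocm(self):
        self.assertEqual(torch.ones(2, 2).sum().item(), 4.0)

    @skipIfRocm  # bare form: skipped on ROCm with the default message
    def test_also_skipped_on_rocm(self):
        self.assertTrue(bool(torch.zeros(1).eq(0).all()))


if __name__ == "__main__":
    run_tests()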

test/dynamo/test_structured_trace.py

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,7 @@
 from torch._inductor.test_case import TestCase
 from torch._logging._internal import TorchLogsFormatter
 from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.testing._internal.common_utils import find_free_port
+from torch.testing._internal.common_utils import find_free_port, skipIfRocm
 from torch.testing._internal.inductor_utils import HAS_CUDA

@@ -208,6 +208,7 @@ def test_schedule(self):
         self.assertParses()

     @requires_cuda
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_cudagraphs(self):
         fn_opt = torch.compile(mode="reduce-overhead")(inductor_schedule_fn)
         fn_opt(torch.ones(1000, 1000, device="cuda"))

test/functorch/test_ops.py

Lines changed: 1 addition & 1 deletion
@@ -951,7 +951,7 @@ def fn(inp, *args, **kwargs):
     # (3) encountering this error in PyTorch internals.
     xfail("index_reduce", "prod"),
     decorate(
-        "linalg.householder_product", decorator=runOnRocm
+        "linalg.householder_product", decorator=skipIfRocm
     ),  # works on ROCm
     xfail(
         # nans

test/inductor/test_cuda_repro.py

Lines changed: 2 additions & 1 deletion
@@ -33,6 +33,7 @@
     freeze_rng_state,
     IS_FBCODE,
     skipIfRocm,
+    skipIfRocmArch,
     TEST_WITH_ASAN,
 )

@@ -59,7 +60,7 @@
         sys.exit(0)
     raise

-
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm
 TestCase = test_torchinductor.TestCase
 ToTuple = test_torchinductor.ToTuple
 check_model_cuda = test_torchinductor.check_model_cuda

test/inductor/test_inductor_freezing.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
     check_model_cuda,
     copy_tests,
 )
-from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM
+from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM, skipIfRocm


 importlib.import_module("functorch")

test/inductor/test_kernel_benchmark.py

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,7 @@
 from torch.testing import FileCheck
 from torch.testing._internal.common_device_type import expectedFailureXPU
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
-
+from torch.testing._internal.common_utils import skipIfRocm

 class TestKernelBenchmark(TestCase):
     device_type = GPU_TYPE

@@ -151,6 +151,7 @@ def f(a, b):
     @expectedFailureXPU
     @config.patch(max_autotune=True, max_autotune_gemm_backends="TRITON")
     @fresh_inductor_cache()
+    @skipIfRocm  # This seems to be disabled upstream https://github.com/pytorch/pytorch/issues/118346
     def test_mm_triton_kernel_benchmark(self):
         M = 2048
         N = 2432

test/inductor/test_torchinductor.py

Lines changed: 10 additions & 0 deletions
@@ -70,6 +70,7 @@
 from torch.testing._internal.common_device_type import (
     _has_sufficient_memory,
     expectedFailureXPU,
+    get_desired_device_type_test_bases,
 )
 from torch.testing._internal.common_dtype import all_types, get_all_dtypes
 from torch.testing._internal.common_utils import (

@@ -85,6 +86,8 @@
     skipIfWindows,
     skipIfXpu,
     subtest,
+    skipIfRocmArch,
+    subtest,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
 )

@@ -119,6 +122,10 @@


 HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
+_desired_test_bases = get_desired_device_type_test_bases()
+RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
+RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm

 aten = torch.ops.aten

@@ -6943,6 +6950,8 @@ def fn(in_ptr0, in_ptr1, in_ptr2):
             ),
         )

+    @skipIfWindows
+    @skipIfRocm
     def test_roi_align(self):
         if not has_torchvision_roi_align():
             raise unittest.SkipTest("requires torchvision")

@@ -7787,6 +7796,7 @@ def fn(a, dim, index, b, reduce):
         )

     @skip_if_gpu_halide
+    # issue #1150
     def test_dense_mask_index(self):
         r"""
         There will be a little difference for reduce order between aten and inductor

test/inductor/test_torchinductor_dynamic_shapes.py

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,7 @@
     TEST_CUDA_MEM_LEAK_CHECK,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
+    skipIfRocm,
 )
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU

@@ -241,6 +242,7 @@ def fn(x, y):
         self.assertEqual(r, opt_r)

     @torch._dynamo.config.patch(capture_scalar_outputs=True)
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_unwrap_storage_didnt_work_repro(self, device):
         def f():
             full = torch.full((), 11)

test/nn/test_convolution.py

Lines changed: 5 additions & 2 deletions
@@ -57,6 +57,7 @@
     parametrize as parametrize_test,
     run_tests,
     set_default_dtype,
+    skipIfRocm,
     skipIfNotMiopenSuggestNHWC,
     skipIfRocmVersionLessThan,
     subtest,

@@ -4081,8 +4082,10 @@ def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):
         self.assertEqual(grad_weight.shape, weight.shape)

     @onlyCUDA
-    @largeTensorTest("40GB")
-    @largeTensorTest("24GB", "cpu")
+    @largeTensorTest('40GB')
+    @largeTensorTest('24GB', 'cpu')
+    # Skipped for ROCm temp - https://ontrack-internal.amd.com/browse/SWDEV-383635
+    @skipIfRocm
     def test_conv3d_64bit_indexing(self, device):
         x = torch.rand(1, 32, 512, 512, 256)
         m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)

test/run_test.py

Lines changed: 3 additions & 0 deletions
@@ -185,6 +185,9 @@ def __contains__(self, item):
     "distributed/_tensor/test_attention",
 ]

+if sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor <= 9):
+    ROCM_BLOCKLIST.append("test_typing")
+
 XPU_BLOCKLIST = [
     "test_autograd",
     "profiler/test_cpp_thread",

test/test_cuda.py

Lines changed: 3 additions & 3 deletions
@@ -1920,9 +1920,8 @@ def test_graph_capture_oom(self):
         with torch.cuda.graph(torch.cuda.CUDAGraph()):
             torch.zeros(2**40, device="cuda")

-    @unittest.skipIf(
-        not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
-    )
+    @unittest.skipIf(not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     @serialTest()
     def test_repeat_graph_capture_cublas_workspace_memory(self):
         (x, y, z) = 1024, 512, 64

@@ -2878,6 +2877,7 @@ def forward(self, input_dict: dict):
     @unittest.skipIf(
         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
     )
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_graph_make_graphed_callables_same_pool(self):
         torch.manual_seed(5)
         torch.cuda.manual_seed(5)

test/test_fx.py

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@
     IS_WINDOWS,
     find_library_location,
     run_tests,
+    skipIfRocm,
     skipIfTorchDynamo,
 )
 from torch.testing._internal.jit_utils import JitTestCase

torch/testing/_internal/common_utils.py

Lines changed: 21 additions & 0 deletions
@@ -1279,6 +1279,14 @@ def printErrors(self) -> None:
 IS_X86 = platform.machine() in ('x86_64', 'i386')
 IS_ARM64 = platform.machine() in ('arm64', 'aarch64')

+def is_navi_arch():
+    if torch.cuda.is_available():
+        prop = torch.cuda.get_device_properties(0)
+        gfx_arch = prop.gcnArchName.split(":")[0]
+        if gfx_arch in ["gfx1100", "gfx1101", "gfx1102"]:
+            return True
+    return False
+
 def is_avx512_vnni_supported():
     if sys.platform != 'linux':
         return False

@@ -1758,6 +1766,19 @@ def wrapper(*args, **kwargs):
         return dec_fn(func)
     return dec_fn

+def skipIfRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
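As a usage illustration for the two helpers added above (is_navi_arch and skipIfRocmArch), the sketch below shows one decorator-based skip and one in-test check. It is a hypothetical example, assuming both helpers are importable from torch.testing._internal.common_utils as defined in this hunk; the test class and test names are illustrative only.

import unittest

import torch
from torch.testing._internal.common_utils import (
    TestCase,
    is_navi_arch,
    run_tests,
    skipIfRocmArch,
)

NAVI_ARCH = ("gfx1100", "gfx1101")  # mirrors the tuple the inductor tests define


class NaviSkipExamples(TestCase):  # hypothetical class, not part of this commit
    @skipIfRocmArch(NAVI_ARCH)  # raises unittest.SkipTest on the listed gfx archs under ROCm
    def test_decorator_skip(self):
        self.assertEqual(torch.arange(4).sum().item(), 6)

    def test_runtime_check(self):
        if is_navi_arch():  # skip at runtime instead of via decorator
            raise unittest.SkipTest("not supported on Navi yet")
        self.assertTrue(torch.tensor([1.0]).is_floating_point())


if __name__ == "__main__":
    run_tests()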

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 9 additions & 1 deletion
@@ -4863,7 +4863,11 @@ def _test_ddp_apply_optim_in_backward(
             # set_to_none for regular optimizer to match in backward
             # case.
             optim.zero_grad(set_to_none=True)
-
+
+    @skip_but_pass_in_sandcastle_if(
+        BACKEND == "gloo" and HAS_TORCHVISION,
+        "Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
+    )
     @skip_if_lt_x_gpu(2)
     def test_ddp_apply_optim_in_backward(self):
         for optim_cls, init_before in itertools.product(

@@ -4876,6 +4880,10 @@ def test_ddp_apply_optim_in_backward(self):
                 init_before=init_before,
             )

+    @skip_but_pass_in_sandcastle_if(
+        BACKEND == "gloo" and HAS_TORCHVISION,
+        "Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
+    )
     @skip_if_lt_x_gpu(2)
     def test_ddp_apply_optim_in_backward_grad_as_bucket_view_false(self):
         for init_before in [True, False]:
for init_before in [True, False]:

torch/testing/_internal/distributed/nn/api/remote_module_test.py

Lines changed: 7 additions & 1 deletion
@@ -613,8 +613,14 @@ def test_invalid_devices(self):
             )
         ]

+        if TEST_WITH_ROCM:
+            errorString = (r"HIP error: invalid device ordinal\n"
+                           r"HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\n"
+                           r"For debugging consider passing AMD_SERIALIZE_KERNEL=3")
+        else:
+            errorString = r"CUDA error: invalid device ordinal"
         with self.assertRaisesRegex(
-            RuntimeError, r"CUDA error: invalid device ordinal"
+            RuntimeError, errorString
         ):
             [
                 m.forward()
