CONSOLIDATED COMMITS: unit test skips and unskips

pruthvistony · dnikolaev-amd · commit 97f3d54014b2 · 2025-04-24T17:03:00.000Z
=================================================

Temporarily skip test_conv3d_64bit_indexing
- Rocblas API support is requested
- SWDEV-383635 & sub task - SWDEV-390218

Skip ddp apply_optim_in_bwd tests for gloo (#1302)
To resolve https://ontrack-internal.amd.com/browse/SWDEV-403530 and https://ontrack-internal.amd.com/browse/SWDEV-419837. For more context check upstream issue pytorch#111834

Add skipIfRocmArch decorator for Navi skips (#1356)

Converted NAVI check as a function (#1364)
* Moved NAVI check to the test file
* Revised NAVI check as a function

[Navi] [Inductor] Unskip Navi inductor UTs (#1514)
Relates to https://ontrack-internal.amd.com/browse/SWDEV-461590

Bad import in test_torchinductor and skip torchvision related UT (#1374)

skip test_inductor_freezing failing UTs (#1375)

Skip test_mm_triton_kernel_benchmark (#1376)
* Running triton kernel on ROCM only has one GB/s metric reported
* Update test_kernel_benchmark.py

skip vmapvjpvjp_linalg_householder_product_cuda_float32 (#1420)

skipIfRocm needs msg parameter

[NO CP] Updated changes to skip few UTs

Imported skipIfRocm in certain test suites (#1577)
Fixes SWDEV-472397

Added functions imports (#1521)
Fixes inductor.test_torchinductor_dynamic_shapes::TestInductorDynamicCUDA::test_item_unbacked_stride_nobreak_cuda

Enable test_public_api_surface (#1601)
Fixes SWDEV-462410.
Enable this unit test since PyTorch issue pytorch#104012 has been closed. This unit test runs fine on MI100/MI300 and upstream.
(cherry picked from commit 0001d4ab5070635cfecc146ee299bbb9fa45ca67)

[rocm6.3_internal_testing] Fixed error string assertion in test_invalid_devices (#1607)
Fixes pytorch#8974
(cherry picked from commit a688e0a)
(cherry picked from commit b966e44)

[rocm6.4_internal_testing] Skip non_standard_bool_values tests (#1880)
Fixes SWDEV-509757
(cherry picked from commit 80b4c41)

[rocm6.4_internal_testing] [NAVI32] Skipped sdpa_2 test in test_aot_inductor for Navi32 (#1882)
The test fails with assertion error "Tensors are not close".
After testing I can confirm that this issue is caused by eager mode execution specific to navi32 during the test_sdpa_2 run. Made a cross reference between navi31, navi32 and mi300. AOTInductor results are all the exact same for all of the archs, only the eager mode fails here for navi32 with 1.5% difference in tensor values from the gpu run. I assume that this happens due to fp16-32-16 conversions in eager mode or missing some if-statements for navi32 specifically.
Simple reproducer to check the values for cpu/gpu/eager/aoti runs. [gfx1101_test_sdpa_2_issue_reproducer.txt](https://github.com/user-attachments/files/18676367/gfx1101_test_sdpa_2_issue_reproducer.txt)
(cherry picked from commit 896c789)

Fixed rocm skip import issue (#1949)
skip_if_rocm does not exist in torch/testing/_internal/common_distributed.py. Use skipIfRocm from torch/testing/_internal/common_utils.py instead.
(cherry picked from commit cfb673e)

Skip certain unit tests on NAVI (#1950)
This PR is to skip certain unit tests on NAVI only.
Fixes SWDEV-509011 - test_sac_ilp.py::TestSACILP::test_sac_ilp_case1
Fixes SWDEV-509311 - test_max_autotune.py::TestMaxAutotune::test_non_contiguous_input_addmm
Fixes SWDEV-510738 - test_fsdp_sharded_grad_scaler.py::TestShardedGradScalerParityWithDDP::test_sharded_grad_scaler_found_inf
(cherry picked from commit e86291a)
diff --git a/test/distributed/_tools/test_sac_ilp.py b/test/distributed/_tools/test_sac_ilp.py
@@ -20,6 +20,7 @@
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_utils import (
     MI300_ARCH,
+    NAVI_ARCH,
     run_tests,
     skipIfRocmArch,
     skipIfTorchDynamo,
@@ -137,6 +138,7 @@ def _collect_module_info_with_fake_tensor_mode(self) -> ModuleInfo:
     @skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
     @unittest.skipIf(not TEST_CUDA, "CUDA not available")
     @skipIfRocmArch(MI300_ARCH)
+    @skipIfRocmArch(NAVI_ARCH)
     def test_sac_ilp_case1(self):
         """
         This is a case where the memory budget is either binding or too tight,
diff --git a/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py b/test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py
@@ -36,6 +36,8 @@
     run_tests,
     TEST_WITH_DEV_DBG_ASAN,
     TestCase,
+    NAVI_ARCH,
+    skipIfRocmArch,
 )
 
 
@@ -236,6 +238,7 @@ def _build_model_and_optim(
         return model, optim, ref_model, ref_optim
 
     @skip_if_lt_x_gpu(2)
+    @skipIfRocmArch(NAVI_ARCH)
     def test_sharded_grad_scaler_found_inf(self):
         self.run_subtests(
             {
diff --git a/test/dynamo/test_structured_trace.py b/test/dynamo/test_structured_trace.py
@@ -20,7 +20,7 @@
 from torch._inductor.test_case import TestCase
 from torch._logging._internal import TorchLogsFormatter
 from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.testing._internal.common_utils import find_free_port
+from torch.testing._internal.common_utils import find_free_port, skipIfRocm
 from torch.testing._internal.inductor_utils import HAS_CUDA
 
 
@@ -267,6 +267,7 @@ def test_schedule(self):
         self.assertParses()
 
     @requires_cuda
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_cudagraphs(self):
         fn_opt = torch.compile(mode="reduce-overhead")(inductor_schedule_fn)
         fn_opt(torch.ones(1000, 1000, device="cuda"))
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
@@ -940,7 +940,7 @@ def fn(inp, *args, **kwargs):
                 # (3) encountering this error in PyTorch internals.
                 xfail("index_reduce", "prod"),
                 decorate(
-                    "linalg.householder_product", decorator=runOnRocm
+                    "linalg.householder_product", decorator=skipIfRocm
                 ),  # works on ROCm
                 xfail(
                     # nans
diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py
@@ -45,8 +45,10 @@
     IS_WINDOWS,
     parametrize,
     skipIfRocm,
+    skipIfRocmArch,
     skipIfXpu,
     TEST_WITH_ROCM,
+    NAVI32_ARCH,
 )
 from torch.testing._internal.custom_tensor import CustomTensorPlainOut
 from torch.testing._internal.inductor_utils import GPU_TYPE
@@ -1016,6 +1018,8 @@ def forward(self, q, k, v):
         )
         self.check_model(Model(), example_inputs)
 
+    # Eager mode produces incorrect tensor values for navi32 during this test
+    @skipIfRocmArch(NAVI32_ARCH)
     @unittest.skipIf(IS_FBCODE, "Not yet runnable in fbcode")
     @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
     def test_sdpa_2(self):
diff --git a/test/inductor/test_cuda_repro.py b/test/inductor/test_cuda_repro.py
@@ -37,6 +37,8 @@
     DeterministicGuard,
     freeze_rng_state,
     IS_FBCODE,
+    skipIfRocm,
+    skipIfRocmArch,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
     xfailIfPy312Plus,
@@ -73,7 +75,7 @@
         sys.exit(0)
     raise
 
-
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm. NOTE(review): narrower than the NAVI_ARCH tuple added to common_utils in this patch - confirm the local list is intended
 TestCase = test_torchinductor.TestCase
 ToTuple = test_torchinductor.ToTuple
 check_model_cuda = test_torchinductor.check_model_cuda
diff --git a/test/inductor/test_inductor_freezing.py b/test/inductor/test_inductor_freezing.py
@@ -28,7 +28,7 @@
     check_model_gpu,
     copy_tests,
 )
-from torch.testing._internal.common_utils import TEST_WITH_ROCM
+from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM, skipIfRocm
 
 
 importlib.import_module("functorch")
diff --git a/test/inductor/test_kernel_benchmark.py b/test/inductor/test_kernel_benchmark.py
@@ -16,7 +16,7 @@
 from torch.testing import FileCheck
 from torch.testing._internal.common_cuda import xfailIfSM89
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
-
+from torch.testing._internal.common_utils import skipIfRocm
 
 class TestKernelBenchmark(TestCase):
     device_type = GPU_TYPE
@@ -167,6 +167,7 @@ def f(a, b):
         max_autotune=True, max_autotune_gemm_backends="TRITON", shape_padding=False
     )
     @fresh_inductor_cache()
+    @skipIfRocm #This seems to be disabled upstream https://github.com/pytorch/pytorch/issues/118346
     def test_mm_triton_kernel_benchmark(self):
         M = 2048
         N = 2432
diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
@@ -34,6 +34,8 @@
     IS_WINDOWS,
     parametrize,
     TEST_WITH_ROCM,
+    NAVI_ARCH,
+    skipIfRocmArch,
 )
 from torch.utils._triton import has_triton_tma_device
 
@@ -1000,6 +1002,7 @@ def f(x, y):
         act = f(x, y)
         torch.testing.assert_close(act, ref, atol=2e-2, rtol=1e-2)
 
+    @skipIfRocmArch(NAVI_ARCH)
     def test_non_contiguous_input_addmm(self):
         b = torch.randn((768), dtype=torch.bfloat16, device=GPU_TYPE)
         x = rand_strided(
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
@@ -79,6 +79,7 @@
 from torch.testing._internal.common_device_type import (
     expectedFailureXPU,
     largeTensorTest,
+    get_desired_device_type_test_bases,
 )
 from torch.testing._internal.common_dtype import all_types, get_all_dtypes
 from torch.testing._internal.common_quantization import (
@@ -98,6 +99,8 @@
     skipIfWindows,
     skipIfXpu,
     subtest,
+    skipIfRocmArch,
+    subtest,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
     xfailIfS390X,
@@ -141,6 +144,10 @@
 
 
 HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
+_desired_test_bases = get_desired_device_type_test_bases()
+RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)  # True when some desired test base has device_type "cpu"
+RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)  # True when some desired test base has device_type GPU_TYPE
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm. NOTE(review): narrower than the NAVI_ARCH tuple added to common_utils in this patch - confirm the local list is intended
 
 if TEST_WITH_ROCM:
     torch._inductor.config.force_layout_optimization = 1
@@ -7579,6 +7586,8 @@ def fn(in_ptr0, in_ptr1, in_ptr2):
             ),
         )
 
+    @skipIfWindows
+    @skipIfRocm
     def test_roi_align(self):
         if not has_torchvision_roi_align():
             raise unittest.SkipTest("requires torchvision")
@@ -8428,6 +8437,7 @@ def fn(a, dim, index, b, reduce):
             )
 
     @skip_if_gpu_halide
+    # issue #1150
     def test_dense_mask_index(self):
         r"""
         There will be a little difference for reduce order between aten and inductor
diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py
@@ -33,6 +33,7 @@
     TEST_CUDA_MEM_LEAK_CHECK,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
+    skipIfRocm,
 )
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU
 
@@ -246,6 +247,7 @@ def fn(x, y):
         self.assertEqual(r, opt_r)
 
     @torch._dynamo.config.patch(capture_scalar_outputs=True)
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_unwrap_storage_didnt_work_repro(self, device):
         def f():
             full = torch.full((), 11)
diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py
@@ -50,6 +50,7 @@
     parametrize as parametrize_test,
     run_tests,
     set_default_dtype,
+    skipIfRocm,
     skipIfNotMiopenSuggestNHWC,
     skipIfRocmVersionLessThan,
     subtest,
@@ -4046,8 +4047,10 @@ def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):
         self.assertEqual(grad_weight.shape, weight.shape)
 
     @onlyCUDA
-    @largeTensorTest("40GB")
-    @largeTensorTest("24GB", "cpu")
+    @largeTensorTest('40GB')
+    @largeTensorTest('24GB', 'cpu')
+    # Skipped for ROCm temp - https://ontrack-internal.amd.com/browse/SWDEV-383635
+    @skipIfRocm
     def test_conv3d_64bit_indexing(self, device):
         x = torch.rand(1, 32, 512, 512, 256)
         m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)
diff --git a/test/run_test.py b/test/run_test.py
@@ -168,7 +168,6 @@ def __contains__(self, item):
 
 ROCM_BLOCKLIST = [
     "distributed/rpc/test_faulty_agent",
-    "distributed/rpc/test_tensorpipe_agent",
     "distributed/rpc/test_share_memory",
     "distributed/rpc/cuda/test_tensorpipe_agent",
     "test_determination",
@@ -177,6 +176,9 @@ def __contains__(self, item):
     "test_jit_cuda_fuser",
 ]
 
+if sys.version_info < (3, 10):  # Python 3.9 and older: also block test_typing on ROCm
+    ROCM_BLOCKLIST.append("test_typing")
+
 S390X_BLOCKLIST = [
     # these tests fail due to various reasons
     "dynamo/test_misc",
diff --git a/test/test_cuda.py b/test/test_cuda.py
@@ -2247,9 +2247,8 @@ def test_graph_capture_oom(self):
             with torch.cuda.graph(torch.cuda.CUDAGraph()):
                 torch.zeros(2**40, device="cuda")
 
-    @unittest.skipIf(
-        not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
-    )
+    @unittest.skipIf(not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     @serialTest()
     @setBlasBackendsToDefaultFinally
     def test_repeat_graph_capture_cublas_workspace_memory(self):
@@ -3206,6 +3205,7 @@ def forward(self, input_dict: dict):
     @unittest.skipIf(
         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
     )
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_graph_make_graphed_callables_same_pool(self):
         torch.manual_seed(5)
         torch.cuda.manual_seed(5)
diff --git a/test/test_fx.py b/test/test_fx.py
@@ -67,6 +67,7 @@
     IS_MACOS,
     IS_WINDOWS,
     run_tests,
+    skipIfRocm,
     skipIfTorchDynamo,
 )
 from torch.testing._internal.jit_utils import JitTestCase
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -1422,6 +1422,20 @@ def test_complex_half_reference_testing(self, device, dtype, op):
 
     @ops(op_db, allowed_dtypes=(torch.bool,))
     def test_non_standard_bool_values(self, device, dtype, op):
+        if TEST_WITH_ROCM and "cuda" in device:
+            rocm_blocklist = [
+                "test_non_standard_bool_values_masked_scatter_cuda_bool",
+                "test_non_standard_bool_values_nn_functional_unfold_cuda_bool",  # only in rocm6.4_internal_testing
+                "test_non_standard_bool_values_put_cuda_bool",
+                "test_non_standard_bool_values_scatter_add_cuda_bool",
+                "test_non_standard_bool_values_scatter_cuda_bool",
+                "test_non_standard_bool_values_scatter_reduce_sum_cuda_bool",
+                "test_non_standard_bool_values_tril_cuda_bool",
+                "test_non_standard_bool_values_triu_cuda_bool",
+            ]
+            if self._testMethodName in rocm_blocklist:
+                self.skipTest("Failed on ROCm")
+
         # Test boolean values other than 0x00 and 0x01 (gh-54789)
         def convert_boolean_tensors(x):
             if not isinstance(x, torch.Tensor) or x.dtype != torch.bool:
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
@@ -102,7 +102,8 @@
     has_pytest = False
 
 
-MI300_ARCH = ("gfx942",)
+MI300_ARCH = ("gfx940", "gfx941", "gfx942")  # gfx arch names handed to skipIfRocmArch for MI300-specific skips
+NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201")  # gfx arch names handed to skipIfRocmArch for Navi-specific skips
 
 
 def freeze_rng_state(*args, **kwargs):
@@ -1349,6 +1350,16 @@ def printErrors(self) -> None:
 IS_ARM64 = platform.machine() in ('arm64', 'aarch64')
 IS_S390X = platform.machine() == "s390x"
 
+NAVI32_ARCH = ("gfx1101",)  # 1-tuple like MI300_ARCH/NAVI_ARCH so `arch_name in NAVI32_ARCH` is an exact match, not a substring test
+
+def is_navi_arch():
+    """Return True if CUDA is available and device 0's gcnArchName is gfx1100, gfx1101 or gfx1102."""
+    if not torch.cuda.is_available():
+        return False
+    # Only the text before the first ":" of gcnArchName is compared.
+    base_arch = torch.cuda.get_device_properties(0).gcnArchName.split(":")[0]
+    return base_arch in ["gfx1100", "gfx1101", "gfx1102"]
+
 def is_avx512_vnni_supported():
     if sys.platform != 'linux':
         return False
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
@@ -4816,7 +4816,11 @@ def _test_ddp_apply_optim_in_backward(
                         # set_to_none for regular optimizer to match in backward
                         # case.
                         optim.zero_grad(set_to_none=True)
-
+        
+        @skip_but_pass_in_sandcastle_if(
+            BACKEND == "gloo" and HAS_TORCHVISION,
+            "Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
+        )
         @skip_if_lt_x_gpu(2)
         def test_ddp_apply_optim_in_backward(self):
             for optim_cls, init_before in itertools.product(
@@ -4829,6 +4833,10 @@ def test_ddp_apply_optim_in_backward(self):
                         init_before=init_before,
                     )
 
+        @skip_but_pass_in_sandcastle_if(
+            BACKEND == "gloo" and HAS_TORCHVISION,
+            "Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
+        )
         @skip_if_lt_x_gpu(2)
         def test_ddp_apply_optim_in_backward_grad_as_bucket_view_false(self):
             for init_before in [True, False]:
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -38,6 +38,7 @@
     load_tests,
     skip_but_pass_in_sandcastle_if,
     get_cycles_per_ms,
+    skipIfRocm,
 )
 
 from torch.testing._internal.dist_utils import (
@@ -5052,6 +5053,7 @@ def test_dynamic_rpc_existing_rank_can_communicate_with_new_rank(self):
 
     # Dynamic RPC existing ranks can communicate with new ranks using CUDA rpc
     @skip_if_lt_x_gpu(2)
+    @skipIfRocm
     @dist_init(setup_rpc=False)
     def test_dynamic_rpc_existing_rank_can_communicate_with_new_rank_cuda(self):
         initialize_pg(self.file_init_method, self.rank, self.world_size)

Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,8 @@`
`36`	`36`	`run_tests,`
`37`	`37`	`TEST_WITH_DEV_DBG_ASAN,`
`38`	`38`	`TestCase,`
	`39`	`+ NAVI_ARCH,`
	`40`	`+ skipIfRocmArch,`
`39`	`41`	`)`
`40`	`42`
`41`	`43`
`@@ -236,6 +238,7 @@ def _build_model_and_optim(`
`236`	`238`	`return model, optim, ref_model, ref_optim`
`237`	`239`
`238`	`240`	`@skip_if_lt_x_gpu(2)`
	`241`	`+ @skipIfRocmArch(NAVI_ARCH)`
`239`	`242`	`def test_sharded_grad_scaler_found_inf(self):`
`240`	`243`	`self.run_subtests(`
`241`	`244`	`{`
Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@`
`28`	`28`	`check_model_gpu,`
`29`	`29`	`copy_tests,`
`30`	`30`	`)`
`31`		`-from torch.testing._internal.common_utils import TEST_WITH_ROCM`
	`31`	`+from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM, skipIfRocm`
`32`	`32`
`33`	`33`
`34`	`34`	`importlib.import_module("functorch")`
Original file line number	Diff line number	Diff line change
`@@ -67,6 +67,7 @@`
`67`	`67`	`IS_MACOS,`
`68`	`68`	`IS_WINDOWS,`
`69`	`69`	`run_tests,`
	`70`	`+ skipIfRocm,`
`70`	`71`	`skipIfTorchDynamo,`
`71`	`72`	`)`
`72`	`73`	`from torch.testing._internal.jit_utils import JitTestCase`