Skip to content

Commit 97f3d54

Browse files
pruthvistony authored and dnikolaev-amd committed
CONSOLIDATED COMMITS: unit test skips and unskips
=================================================

Temporarily skip test_conv3d_64bit_indexing
- Rocblas API support is requested
- SWDEV-383635 & sub task - SWDEV-390218

Skip ddp apply_optim_in_bwd tests for gloo (#1302)
To resolve https://ontrack-internal.amd.com/browse/SWDEV-403530 and https://ontrack-internal.amd.com/browse/SWDEV-419837. For more context check upstream issue pytorch#111834

Add skipIfRocmArch decorator for Navi skips (#1356)

Converted NAVI check as a function (#1364)
* Moved NAVI check to the test file
* Revised NAVI check as a function

[Navi] [Inductor] Unskip Navi inductor UTs (#1514)
Relates to https://ontrack-internal.amd.com/browse/SWDEV-461590

Bad import in test_torchinductor and skip torchvision related UT (#1374)

skip test_inductor_freezing failing UTs (#1375)

Skip test_mm_triton_kernel_benchmark (#1376)
* Running triton kernel on ROCM only has one GB/s metric reported
* Update test_kernel_benchmark.py

skip vmapvjpvjp_linalg_householder_product_cuda_float32 (#1420)

skipIfRocm needs msg parameter

[NO CP] Updated changes to skip few UTs

Imported skipIfRocm in certain test suites (#1577)
Fixes SWDEV-472397

Added functions imports (#1521)
Fixes inductor.test_torchinductor_dynamic_shapes::TestInductorDynamicCUDA::test_item_unbacked_stride_nobreak_cuda

Enable test_public_api_surface (#1601)
Fixes SWDEV-462410. Enable this unit test since PyTorch issue pytorch#104012 has been closed. This unit test runs fine on MI100/MI300 and upstream.
(cherry picked from commit 0001d4ab5070635cfecc146ee299bbb9fa45ca67)

[rocm6.3_internal_testing] Fixed error string assertion in test_invalid_devices (#1607)
Fixes pytorch#8974
(cherry picked from commit a688e0a)
(cherry picked from commit b966e44)

[rocm6.4_internal_testing] Skip non_standard_bool_values tests (#1880)
Fixes SWDEV-509757
(cherry picked from commit 80b4c41)

[rocm6.4_internal_testing] [NAVI32] Skipped sdpa_2 test in test_aot_inductor for Navi32 (#1882)
The test fails with assertion error "Tensors are not close". After testing I can confirm that this issue is caused by eager mode execution specific to navi32 during the test_sdpa_2 run. Made a cross reference between navi31, navi32 and mi300. AOTInductor results are all the exact same for all of the archs, only the eager mode fails here for navi32 with 1.5% difference in tensor values from the gpu run. I assume that this happens due to fp16-32-16 conversions in eager mode or missing some if-statements for navi32 specifically. Simple reproducer to check the values for cpu/gpu/eager/aoti runs. [gfx1101_test_sdpa_2_issue_reproducer.txt](https://github.com/user-attachments/files/18676367/gfx1101_test_sdpa_2_issue_reproducer.txt)
(cherry picked from commit 896c789)

Fixed rocm skip import issue (#1949)
skip_if_rocm does not exist in torch/testing/_internal/common_distributed.py. Use skipIfRocm from torch/testing/_internal/common_utils.py instead.
(cherry picked from commit cfb673e)

Skip certain unit tests on NAVI (#1950)
This PR is to skip certain unit tests on NAVI only.
Fixes SWDEV-509011 - test_sac_ilp.py::TestSACILP::test_sac_ilp_case1
Fixes SWDEV-509311 - test_max_autotune.py::TestMaxAutotune::test_non_contiguous_input_addmm
Fixes SWDEV-510738 - test_fsdp_sharded_grad_scaler.py::TestShardedGradScalerParityWithDDP::test_sharded_grad_scaler_found_inf
(cherry picked from commit e86291a)
1 parent 8a7fd64 commit 97f3d54

19 files changed

+82
-13
lines changed

test/distributed/_tools/test_sac_ilp.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from torch.testing._internal.common_cuda import TEST_CUDA
2121
from torch.testing._internal.common_utils import (
2222
MI300_ARCH,
23+
NAVI_ARCH,
2324
run_tests,
2425
skipIfRocmArch,
2526
skipIfTorchDynamo,
@@ -137,6 +138,7 @@ def _collect_module_info_with_fake_tensor_mode(self) -> ModuleInfo:
137138
@skipIfTorchDynamo("https://github.com/pytorch/pytorch/issues/115653")
138139
@unittest.skipIf(not TEST_CUDA, "CUDA not available")
139140
@skipIfRocmArch(MI300_ARCH)
141+
@skipIfRocmArch(NAVI_ARCH)
140142
def test_sac_ilp_case1(self):
141143
"""
142144
This is a case where the memory budget is either binding or too tight,

test/distributed/fsdp/test_fsdp_sharded_grad_scaler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636
run_tests,
3737
TEST_WITH_DEV_DBG_ASAN,
3838
TestCase,
39+
NAVI_ARCH,
40+
skipIfRocmArch,
3941
)
4042

4143

@@ -236,6 +238,7 @@ def _build_model_and_optim(
236238
return model, optim, ref_model, ref_optim
237239

238240
@skip_if_lt_x_gpu(2)
241+
@skipIfRocmArch(NAVI_ARCH)
239242
def test_sharded_grad_scaler_found_inf(self):
240243
self.run_subtests(
241244
{

test/dynamo/test_structured_trace.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from torch._inductor.test_case import TestCase
2121
from torch._logging._internal import TorchLogsFormatter
2222
from torch.nn.parallel import DistributedDataParallel as DDP
23-
from torch.testing._internal.common_utils import find_free_port
23+
from torch.testing._internal.common_utils import find_free_port, skipIfRocm
2424
from torch.testing._internal.inductor_utils import HAS_CUDA
2525

2626

@@ -267,6 +267,7 @@ def test_schedule(self):
267267
self.assertParses()
268268

269269
@requires_cuda
270+
@skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
270271
def test_cudagraphs(self):
271272
fn_opt = torch.compile(mode="reduce-overhead")(inductor_schedule_fn)
272273
fn_opt(torch.ones(1000, 1000, device="cuda"))

test/functorch/test_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -940,7 +940,7 @@ def fn(inp, *args, **kwargs):
940940
# (3) encountering this error in PyTorch internals.
941941
xfail("index_reduce", "prod"),
942942
decorate(
943-
"linalg.householder_product", decorator=runOnRocm
943+
"linalg.householder_product", decorator=skipIfRocm
944944
), # works on ROCm
945945
xfail(
946946
# nans

test/inductor/test_aot_inductor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,10 @@
4545
IS_WINDOWS,
4646
parametrize,
4747
skipIfRocm,
48+
skipIfRocmArch,
4849
skipIfXpu,
4950
TEST_WITH_ROCM,
51+
NAVI32_ARCH,
5052
)
5153
from torch.testing._internal.custom_tensor import CustomTensorPlainOut
5254
from torch.testing._internal.inductor_utils import GPU_TYPE
@@ -1016,6 +1018,8 @@ def forward(self, q, k, v):
10161018
)
10171019
self.check_model(Model(), example_inputs)
10181020

1021+
# Eager mode produces incorrect tensor values for navi32 during this test
1022+
@skipIfRocmArch(NAVI32_ARCH)
10191023
@unittest.skipIf(IS_FBCODE, "Not yet runnable in fbcode")
10201024
@unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
10211025
def test_sdpa_2(self):

test/inductor/test_cuda_repro.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
DeterministicGuard,
3838
freeze_rng_state,
3939
IS_FBCODE,
40+
skipIfRocm,
41+
skipIfRocmArch,
4042
TEST_WITH_ASAN,
4143
TEST_WITH_ROCM,
4244
xfailIfPy312Plus,
@@ -73,7 +75,7 @@
7375
sys.exit(0)
7476
raise
7577

76-
78+
NAVI_ARCH = ("gfx1100", "gfx1101") # Used for navi exclusive skips on ROCm
7779
TestCase = test_torchinductor.TestCase
7880
ToTuple = test_torchinductor.ToTuple
7981
check_model_cuda = test_torchinductor.check_model_cuda

test/inductor/test_inductor_freezing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
check_model_gpu,
2929
copy_tests,
3030
)
31-
from torch.testing._internal.common_utils import TEST_WITH_ROCM
31+
from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM, skipIfRocm
3232

3333

3434
importlib.import_module("functorch")

test/inductor/test_kernel_benchmark.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from torch.testing import FileCheck
1717
from torch.testing._internal.common_cuda import xfailIfSM89
1818
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
19-
19+
from torch.testing._internal.common_utils import skipIfRocm
2020

2121
class TestKernelBenchmark(TestCase):
2222
device_type = GPU_TYPE
@@ -167,6 +167,7 @@ def f(a, b):
167167
max_autotune=True, max_autotune_gemm_backends="TRITON", shape_padding=False
168168
)
169169
@fresh_inductor_cache()
170+
@skipIfRocm #This seems to be disabled upstream https://github.com/pytorch/pytorch/issues/118346
170171
def test_mm_triton_kernel_benchmark(self):
171172
M = 2048
172173
N = 2432

test/inductor/test_max_autotune.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434
IS_WINDOWS,
3535
parametrize,
3636
TEST_WITH_ROCM,
37+
NAVI_ARCH,
38+
skipIfRocmArch,
3739
)
3840
from torch.utils._triton import has_triton_tma_device
3941

@@ -1000,6 +1002,7 @@ def f(x, y):
10001002
act = f(x, y)
10011003
torch.testing.assert_close(act, ref, atol=2e-2, rtol=1e-2)
10021004

1005+
@skipIfRocmArch(NAVI_ARCH)
10031006
def test_non_contiguous_input_addmm(self):
10041007
b = torch.randn((768), dtype=torch.bfloat16, device=GPU_TYPE)
10051008
x = rand_strided(

test/inductor/test_torchinductor.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
from torch.testing._internal.common_device_type import (
8080
expectedFailureXPU,
8181
largeTensorTest,
82+
get_desired_device_type_test_bases,
8283
)
8384
from torch.testing._internal.common_dtype import all_types, get_all_dtypes
8485
from torch.testing._internal.common_quantization import (
@@ -98,6 +99,8 @@
9899
skipIfWindows,
99100
skipIfXpu,
100101
subtest,
102+
skipIfRocmArch,
103+
subtest,
101104
TEST_WITH_ASAN,
102105
TEST_WITH_ROCM,
103106
xfailIfS390X,
@@ -141,6 +144,10 @@
141144

142145

143146
HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
147+
_desired_test_bases = get_desired_device_type_test_bases()
148+
RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
149+
RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)
150+
NAVI_ARCH = ("gfx1100", "gfx1101") # Used for navi exclusive skips on ROCm
144151

145152
if TEST_WITH_ROCM:
146153
torch._inductor.config.force_layout_optimization = 1
@@ -7579,6 +7586,8 @@ def fn(in_ptr0, in_ptr1, in_ptr2):
75797586
),
75807587
)
75817588

7589+
@skipIfWindows
7590+
@skipIfRocm
75827591
def test_roi_align(self):
75837592
if not has_torchvision_roi_align():
75847593
raise unittest.SkipTest("requires torchvision")
@@ -8428,6 +8437,7 @@ def fn(a, dim, index, b, reduce):
84288437
)
84298438

84308439
@skip_if_gpu_halide
8440+
# issue #1150
84318441
def test_dense_mask_index(self):
84328442
r"""
84338443
There will be a little difference for reduce order between aten and inductor

test/inductor/test_torchinductor_dynamic_shapes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
TEST_CUDA_MEM_LEAK_CHECK,
3434
TEST_WITH_ASAN,
3535
TEST_WITH_ROCM,
36+
skipIfRocm,
3637
)
3738
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU
3839

@@ -246,6 +247,7 @@ def fn(x, y):
246247
self.assertEqual(r, opt_r)
247248

248249
@torch._dynamo.config.patch(capture_scalar_outputs=True)
250+
@skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
249251
def test_unwrap_storage_didnt_work_repro(self, device):
250252
def f():
251253
full = torch.full((), 11)

test/nn/test_convolution.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
parametrize as parametrize_test,
5151
run_tests,
5252
set_default_dtype,
53+
skipIfRocm,
5354
skipIfNotMiopenSuggestNHWC,
5455
skipIfRocmVersionLessThan,
5556
subtest,
@@ -4046,8 +4047,10 @@ def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):
40464047
self.assertEqual(grad_weight.shape, weight.shape)
40474048

40484049
@onlyCUDA
4049-
@largeTensorTest("40GB")
4050-
@largeTensorTest("24GB", "cpu")
4050+
@largeTensorTest('40GB')
4051+
@largeTensorTest('24GB', 'cpu')
4052+
# Skipped for ROCm temp - https://ontrack-internal.amd.com/browse/SWDEV-383635
4053+
@skipIfRocm
40514054
def test_conv3d_64bit_indexing(self, device):
40524055
x = torch.rand(1, 32, 512, 512, 256)
40534056
m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)

test/run_test.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,6 @@ def __contains__(self, item):
168168

169169
ROCM_BLOCKLIST = [
170170
"distributed/rpc/test_faulty_agent",
171-
"distributed/rpc/test_tensorpipe_agent",
172171
"distributed/rpc/test_share_memory",
173172
"distributed/rpc/cuda/test_tensorpipe_agent",
174173
"test_determination",
@@ -177,6 +176,9 @@ def __contains__(self, item):
177176
"test_jit_cuda_fuser",
178177
]
179178

179+
if sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor <= 9):
180+
ROCM_BLOCKLIST.append("test_typing")
181+
180182
S390X_BLOCKLIST = [
181183
# these tests fail due to various reasons
182184
"dynamo/test_misc",

test/test_cuda.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2247,9 +2247,8 @@ def test_graph_capture_oom(self):
22472247
with torch.cuda.graph(torch.cuda.CUDAGraph()):
22482248
torch.zeros(2**40, device="cuda")
22492249

2250-
@unittest.skipIf(
2251-
not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
2252-
)
2250+
@unittest.skipIf(not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
2251+
@skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
22532252
@serialTest()
22542253
@setBlasBackendsToDefaultFinally
22552254
def test_repeat_graph_capture_cublas_workspace_memory(self):
@@ -3206,6 +3205,7 @@ def forward(self, input_dict: dict):
32063205
@unittest.skipIf(
32073206
not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
32083207
)
3208+
@skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
32093209
def test_graph_make_graphed_callables_same_pool(self):
32103210
torch.manual_seed(5)
32113211
torch.cuda.manual_seed(5)

test/test_fx.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
IS_MACOS,
6868
IS_WINDOWS,
6969
run_tests,
70+
skipIfRocm,
7071
skipIfTorchDynamo,
7172
)
7273
from torch.testing._internal.jit_utils import JitTestCase

test/test_ops.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1422,6 +1422,20 @@ def test_complex_half_reference_testing(self, device, dtype, op):
14221422

14231423
@ops(op_db, allowed_dtypes=(torch.bool,))
14241424
def test_non_standard_bool_values(self, device, dtype, op):
1425+
if TEST_WITH_ROCM and "cuda" in device:
1426+
rocm_blocklist = [
1427+
"test_non_standard_bool_values_masked_scatter_cuda_bool",
1428+
"test_non_standard_bool_values_nn_functional_unfold_cuda_bool", # only in rocm6.4_internal_testing
1429+
"test_non_standard_bool_values_put_cuda_bool",
1430+
"test_non_standard_bool_values_scatter_add_cuda_bool",
1431+
"test_non_standard_bool_values_scatter_cuda_bool",
1432+
"test_non_standard_bool_values_scatter_reduce_sum_cuda_bool",
1433+
"test_non_standard_bool_values_tril_cuda_bool",
1434+
"test_non_standard_bool_values_triu_cuda_bool",
1435+
]
1436+
if self._testMethodName in rocm_blocklist:
1437+
self.skipTest("Failed on ROCm")
1438+
14251439
# Test boolean values other than 0x00 and 0x01 (gh-54789)
14261440
def convert_boolean_tensors(x):
14271441
if not isinstance(x, torch.Tensor) or x.dtype != torch.bool:

torch/testing/_internal/common_utils.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,8 @@
102102
has_pytest = False
103103

104104

105-
MI300_ARCH = ("gfx942",)
105+
MI300_ARCH = ("gfx940", "gfx941", "gfx942")
106+
NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201")
106107

107108

108109
def freeze_rng_state(*args, **kwargs):
@@ -1349,6 +1350,16 @@ def printErrors(self) -> None:
13491350
IS_ARM64 = platform.machine() in ('arm64', 'aarch64')
13501351
IS_S390X = platform.machine() == "s390x"
13511352

1353+
NAVI32_ARCH = "gfx1101"
1354+
1355+
def is_navi_arch():
1356+
if torch.cuda.is_available():
1357+
prop = torch.cuda.get_device_properties(0)
1358+
gfx_arch = prop.gcnArchName.split(":")[0]
1359+
if gfx_arch in ["gfx1100", "gfx1101", "gfx1102"]:
1360+
return True
1361+
return False
1362+
13521363
def is_avx512_vnni_supported():
13531364
if sys.platform != 'linux':
13541365
return False

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4816,7 +4816,11 @@ def _test_ddp_apply_optim_in_backward(
48164816
# set_to_none for regular optimizer to match in backward
48174817
# case.
48184818
optim.zero_grad(set_to_none=True)
4819-
4819+
4820+
@skip_but_pass_in_sandcastle_if(
4821+
BACKEND == "gloo" and HAS_TORCHVISION,
4822+
"Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
4823+
)
48204824
@skip_if_lt_x_gpu(2)
48214825
def test_ddp_apply_optim_in_backward(self):
48224826
for optim_cls, init_before in itertools.product(
@@ -4829,6 +4833,10 @@ def test_ddp_apply_optim_in_backward(self):
48294833
init_before=init_before,
48304834
)
48314835

4836+
@skip_but_pass_in_sandcastle_if(
4837+
BACKEND == "gloo" and HAS_TORCHVISION,
4838+
"Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
4839+
)
48324840
@skip_if_lt_x_gpu(2)
48334841
def test_ddp_apply_optim_in_backward_grad_as_bucket_view_false(self):
48344842
for init_before in [True, False]:

torch/testing/_internal/distributed/rpc/rpc_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
load_tests,
3939
skip_but_pass_in_sandcastle_if,
4040
get_cycles_per_ms,
41+
skipIfRocm,
4142
)
4243

4344
from torch.testing._internal.dist_utils import (
@@ -5052,6 +5053,7 @@ def test_dynamic_rpc_existing_rank_can_communicate_with_new_rank(self):
50525053

50535054
# Dynamic RPC existing ranks can communicate with new ranks using CUDA rpc
50545055
@skip_if_lt_x_gpu(2)
5056+
@skipIfRocm
50555057
@dist_init(setup_rpc=False)
50565058
def test_dynamic_rpc_existing_rank_can_communicate_with_new_rank_cuda(self):
50575059
initialize_pg(self.file_init_method, self.rank, self.world_size)

0 commit comments

Comments
 (0)