
Commit b9c976d

pruthvistony authored and jithunnair-amd committed
CONSOLIDATED COMMITS: unit test skips and unskips
=================================================

Temporarily skip test_conv3d_64bit_indexing - rocBLAS API support is requested - SWDEV-383635 & sub-task SWDEV-390218

Skip ddp apply_optim_in_bwd tests for gloo (#1302)
  To resolve https://ontrack-internal.amd.com/browse/SWDEV-403530 and https://ontrack-internal.amd.com/browse/SWDEV-419837.
  For more context, see upstream issue pytorch#111834

Add skipIfRocmArch decorator for Navi skips (#1356)

Converted NAVI check into a function (#1364)
  * Moved NAVI check to the test file
  * Revised NAVI check as a function

[Navi] [Inductor] Unskip Navi inductor UTs (#1514)
  Relates to https://ontrack-internal.amd.com/browse/SWDEV-461590

Fix bad import in test_torchinductor and skip torchvision-related UT (#1374)

Skip failing test_inductor_freezing UTs (#1375)

Skip test_mm_triton_kernel_benchmark (#1376)
  * Running the Triton kernel on ROCm reports only one GB/s metric
  * Update test_kernel_benchmark.py

Skip vmapvjpvjp_linalg_householder_product_cuda_float32 (#1420)
  skipIfRocm needs the msg parameter

[NO CP] Updated changes to skip a few UTs

Imported skipIfRocm in certain test suites (#1577)
  Fixes SWDEV-472397

Added function imports (#1521)
  Fixes inductor.test_torchinductor_dynamic_shapes::TestInductorDynamicCUDA::test_item_unbacked_stride_nobreak_cuda

Enable test_public_api_surface (#1601)
  Fixes SWDEV-462410. Enable this unit test since PyTorch issue pytorch#104012 has been closed. This unit test runs fine on MI100/MI300 and upstream.
  (cherry picked from commit 0001d4ab5070635cfecc146ee299bbb9fa45ca67)

[rocm6.3_internal_testing] Fixed error string assertion in test_invalid_devices (#1607)
  Fixes pytorch#8974
  (cherry picked from commit a688e0a)
1 parent d260fe8 · commit b9c976d

14 files changed (+69, −12 lines)
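Several of the hunks below stack @skipIfRocm onto existing tests, and the commit message notes that skipIfRocm needs the msg parameter when a reason is attached. As a reading aid only, here is a minimal, hypothetical sketch of that pattern; the test class and test names are illustrative and not part of this commit.

import torch
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TestCase


class ExampleRocmSkips(TestCase):  # hypothetical class, not part of this commit
    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")  # skipped on ROCm builds, runs elsewhere
    def test_temporarily_skipped_on_rocm(self):
        self.assertEqual(torch.ones(2, 2).sum().item(), 4.0)

    @skipIfRocm  # bare form: skipped on ROCm with the default message
    def test_also_skipped_on_rocm(self):
        self.assertTrue(bool(torch.zeros(1).eq(0).all()))


if __name__ == "__main__":
    run_tests()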

test/dynamo/test_structured_trace.py

Lines changed: 2 additions & 1 deletion
@@ -19,7 +19,7 @@
 from torch._inductor.test_case import TestCase
 from torch._logging._internal import TorchLogsFormatter
 from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.testing._internal.common_utils import find_free_port
+from torch.testing._internal.common_utils import find_free_port, skipIfRocm
 from torch.testing._internal.inductor_utils import HAS_CUDA

@@ -208,6 +208,7 @@ def test_schedule(self):
         self.assertParses()

     @requires_cuda
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_cudagraphs(self):
         fn_opt = torch.compile(mode="reduce-overhead")(inductor_schedule_fn)
         fn_opt(torch.ones(1000, 1000, device="cuda"))

test/functorch/test_ops.py

Lines changed: 1 addition & 1 deletion
@@ -951,7 +951,7 @@ def fn(inp, *args, **kwargs):
     # (3) encountering this error in PyTorch internals.
     xfail("index_reduce", "prod"),
     decorate(
-        "linalg.householder_product", decorator=runOnRocm
+        "linalg.householder_product", decorator=skipIfRocm
     ),  # works on ROCm
     xfail(
         # nans

test/inductor/test_cuda_repro.py

Lines changed: 2 additions & 1 deletion
@@ -33,6 +33,7 @@
     freeze_rng_state,
     IS_FBCODE,
     skipIfRocm,
+    skipIfRocmArch,
     TEST_WITH_ASAN,
 )

@@ -59,7 +60,7 @@
         sys.exit(0)
     raise

-
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm
 TestCase = test_torchinductor.TestCase
 ToTuple = test_torchinductor.ToTuple
 check_model_cuda = test_torchinductor.check_model_cuda

test/inductor/test_inductor_freezing.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
     check_model_cuda,
     copy_tests,
 )
-from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM
+from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM, skipIfRocm


 importlib.import_module("functorch")

test/inductor/test_kernel_benchmark.py

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,7 @@
 from torch.testing import FileCheck
 from torch.testing._internal.common_device_type import expectedFailureXPU
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
-
+from torch.testing._internal.common_utils import skipIfRocm

 class TestKernelBenchmark(TestCase):
     device_type = GPU_TYPE

@@ -151,6 +151,7 @@ def f(a, b):
     @expectedFailureXPU
     @config.patch(max_autotune=True, max_autotune_gemm_backends="TRITON")
     @fresh_inductor_cache()
+    @skipIfRocm  # This seems to be disabled upstream https://github.com/pytorch/pytorch/issues/118346
     def test_mm_triton_kernel_benchmark(self):
         M = 2048
         N = 2432

test/inductor/test_torchinductor.py

Lines changed: 10 additions & 0 deletions
@@ -70,6 +70,7 @@
 from torch.testing._internal.common_device_type import (
     _has_sufficient_memory,
     expectedFailureXPU,
+    get_desired_device_type_test_bases,
 )
 from torch.testing._internal.common_dtype import all_types, get_all_dtypes
 from torch.testing._internal.common_utils import (

@@ -85,6 +86,8 @@
     skipIfWindows,
     skipIfXpu,
     subtest,
+    skipIfRocmArch,
+    subtest,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
 )

@@ -119,6 +122,10 @@


 HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
+_desired_test_bases = get_desired_device_type_test_bases()
+RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
+RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)
+NAVI_ARCH = ("gfx1100", "gfx1101")  # Used for navi exclusive skips on ROCm

 aten = torch.ops.aten

@@ -6943,6 +6950,8 @@ def fn(in_ptr0, in_ptr1, in_ptr2):
             ),
         )

+    @skipIfWindows
+    @skipIfRocm
     def test_roi_align(self):
         if not has_torchvision_roi_align():
             raise unittest.SkipTest("requires torchvision")

@@ -7787,6 +7796,7 @@ def fn(a, dim, index, b, reduce):
         )

     @skip_if_gpu_halide
+    # issue #1150
     def test_dense_mask_index(self):
         r"""
         There will be a little difference for reduce order between aten and inductor

test/inductor/test_torchinductor_dynamic_shapes.py

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,7 @@
     TEST_CUDA_MEM_LEAK_CHECK,
     TEST_WITH_ASAN,
     TEST_WITH_ROCM,
+    skipIfRocm,
 )
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU

@@ -241,6 +242,7 @@ def fn(x, y):
         self.assertEqual(r, opt_r)

     @torch._dynamo.config.patch(capture_scalar_outputs=True)
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_unwrap_storage_didnt_work_repro(self, device):
         def f():
             full = torch.full((), 11)

test/nn/test_convolution.py

Lines changed: 5 additions & 2 deletions
@@ -57,6 +57,7 @@
     parametrize as parametrize_test,
     run_tests,
     set_default_dtype,
+    skipIfRocm,
     skipIfNotMiopenSuggestNHWC,
     skipIfRocmVersionLessThan,
     subtest,

@@ -4081,8 +4082,10 @@ def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):
         self.assertEqual(grad_weight.shape, weight.shape)

     @onlyCUDA
-    @largeTensorTest("40GB")
-    @largeTensorTest("24GB", "cpu")
+    @largeTensorTest('40GB')
+    @largeTensorTest('24GB', 'cpu')
+    # Skipped for ROCm temp - https://ontrack-internal.amd.com/browse/SWDEV-383635
+    @skipIfRocm
     def test_conv3d_64bit_indexing(self, device):
         x = torch.rand(1, 32, 512, 512, 256)
         m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)

test/run_test.py

Lines changed: 3 additions & 0 deletions
@@ -185,6 +185,9 @@ def __contains__(self, item):
     "distributed/_tensor/test_attention",
 ]

+if sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor <= 9):
+    ROCM_BLOCKLIST.append("test_typing")
+
 XPU_BLOCKLIST = [
     "test_autograd",
     "profiler/test_cpp_thread",

test/test_cuda.py

Lines changed: 3 additions & 3 deletions
@@ -1920,9 +1920,8 @@ def test_graph_capture_oom(self):
         with torch.cuda.graph(torch.cuda.CUDAGraph()):
             torch.zeros(2**40, device="cuda")

-    @unittest.skipIf(
-        not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
-    )
+    @unittest.skipIf(not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     @serialTest()
     def test_repeat_graph_capture_cublas_workspace_memory(self):
         (x, y, z) = 1024, 512, 64

@@ -2878,6 +2877,7 @@ def forward(self, input_dict: dict):
     @unittest.skipIf(
         not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
     )
+    @skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
     def test_graph_make_graphed_callables_same_pool(self):
         torch.manual_seed(5)
         torch.cuda.manual_seed(5)

test/test_fx.py

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@
     IS_WINDOWS,
     find_library_location,
     run_tests,
+    skipIfRocm,
     skipIfTorchDynamo,
 )
 from torch.testing._internal.jit_utils import JitTestCase

torch/testing/_internal/common_utils.py

Lines changed: 21 additions & 0 deletions
@@ -1279,6 +1279,14 @@ def printErrors(self) -> None:
 IS_X86 = platform.machine() in ('x86_64', 'i386')
 IS_ARM64 = platform.machine() in ('arm64', 'aarch64')

+def is_navi_arch():
+    if torch.cuda.is_available():
+        prop = torch.cuda.get_device_properties(0)
+        gfx_arch = prop.gcnArchName.split(":")[0]
+        if gfx_arch in ["gfx1100", "gfx1101", "gfx1102"]:
+            return True
+    return False
+
 def is_avx512_vnni_supported():
     if sys.platform != 'linux':
         return False

@@ -1758,6 +1766,19 @@ def wrapper(*args, **kwargs):
         return dec_fn(func)
     return dec_fn

+def skipIfRocmArch(arch: Tuple[str, ...]):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                prop = torch.cuda.get_device_properties(0)
+                if prop.gcnArchName.split(":")[0] in arch:
+                    reason = f"skipIfRocm: test skipped on {arch}"
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
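As a usage illustration for the two helpers added above (is_navi_arch and skipIfRocmArch), the sketch below shows one decorator-based skip and one in-test check. It is a hypothetical example, assuming both helpers are importable from torch.testing._internal.common_utils as defined in this hunk; the test class and test names are illustrative only.

import unittest

import torch
from torch.testing._internal.common_utils import (
    TestCase,
    is_navi_arch,
    run_tests,
    skipIfRocmArch,
)

NAVI_ARCH = ("gfx1100", "gfx1101")  # mirrors the tuple the inductor tests define


class NaviSkipExamples(TestCase):  # hypothetical class, not part of this commit
    @skipIfRocmArch(NAVI_ARCH)  # raises unittest.SkipTest on the listed gfx archs under ROCm
    def test_decorator_skip(self):
        self.assertEqual(torch.arange(4).sum().item(), 6)

    def test_runtime_check(self):
        if is_navi_arch():  # skip at runtime instead of via decorator
            raise unittest.SkipTest("not supported on Navi yet")
        self.assertTrue(torch.tensor([1.0]).is_floating_point())


if __name__ == "__main__":
    run_tests()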

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 9 additions & 1 deletion
@@ -4863,7 +4863,11 @@ def _test_ddp_apply_optim_in_backward(
             # set_to_none for regular optimizer to match in backward
             # case.
             optim.zero_grad(set_to_none=True)
-
+
+    @skip_but_pass_in_sandcastle_if(
+        BACKEND == "gloo" and HAS_TORCHVISION,
+        "Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
+    )
     @skip_if_lt_x_gpu(2)
     def test_ddp_apply_optim_in_backward(self):
         for optim_cls, init_before in itertools.product(

@@ -4876,6 +4880,10 @@ def test_ddp_apply_optim_in_backward(self):
                 init_before=init_before,
             )

+    @skip_but_pass_in_sandcastle_if(
+        BACKEND == "gloo" and HAS_TORCHVISION,
+        "Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
+    )
     @skip_if_lt_x_gpu(2)
     def test_ddp_apply_optim_in_backward_grad_as_bucket_view_false(self):
         for init_before in [True, False]:
for init_before in [True, False]:

torch/testing/_internal/distributed/nn/api/remote_module_test.py

Lines changed: 7 additions & 1 deletion
@@ -613,8 +613,14 @@ def test_invalid_devices(self):
             )
         ]

+        if TEST_WITH_ROCM:
+            errorString = (r"HIP error: invalid device ordinal\n"
+                           r"HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\n"
+                           r"For debugging consider passing AMD_SERIALIZE_KERNEL=3")
+        else:
+            errorString = r"CUDA error: invalid device ordinal"
         with self.assertRaisesRegex(
-            RuntimeError, r"CUDA error: invalid device ordinal"
+            RuntimeError, errorString
         ):
             [
                 m.forward()
