Skip to content

Commit 991921d

Browse files
committed
CONSOLIDATED COMMITS: unit test skips and unskips
=================================================

Temporarily skip test_conv3d_64bit_indexing
- Rocblas API support is requested - SWDEV-383635 & sub task - SWDEV-390218

Skip ddp apply_optim_in_bwd tests for gloo (#1302)
To resolve https://ontrack-internal.amd.com/browse/SWDEV-403530 and https://ontrack-internal.amd.com/browse/SWDEV-419837. For more context check upstream issue pytorch#111834

Add skipIfRocmArch decorator for Navi skips (#1356)

Converted NAVI check as a function (#1364)
* Moved NAVI check to the test file
* Revised NAVI check as a function

[Navi] [Inductor] Unskip Navi inductor UTs (#1514)
Relates to https://ontrack-internal.amd.com/browse/SWDEV-461590

Bad import in test_torchinductor and skip torchvision related UT (#1374)

skip test_inductor_freezing failing UTs (#1375)

Skip test_mm_triton_kernel_benchmark (#1376)
* Running triton kernel on ROCM only has one GB/s metric reported
* Update test_kernel_benchmark.py

skip vmapvjpvjp_linalg_householder_product_cuda_float32 (#1420)

skipIfRocm needs msg parameter

[NO CP] Updated changes to skip a few UTs

Imported skipIfRocm in certain test suites (#1577)
Fixes SWDEV-472397

Added functions imports (#1521)
Fixes inductor.test_torchinductor_dynamic_shapes::TestInductorDynamicCUDA::test_item_unbacked_stride_nobreak_cuda

Enable test_public_api_surface (#1601)
Fixes SWDEV-462410. Enable this unit test since PyTorch issue pytorch#104012 has been closed. This unit test runs fine on MI100/MI300 and upstream.
(cherry picked from commit 0001d4ab5070635cfecc146ee299bbb9fa45ca67)

[rocm6.3_internal_testing] Fixed error string assertion in test_invalid_devices (#1607)
Fixes pytorch#8974
(cherry picked from commit a688e0a)
1 parent 81928e9 commit 991921d

13 files changed

+62
-11
lines changed

test/dynamo/test_structured_trace.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from torch._inductor.test_case import TestCase
2121
from torch._logging._internal import TorchLogsFormatter
2222
from torch.nn.parallel import DistributedDataParallel as DDP
23-
from torch.testing._internal.common_utils import find_free_port
23+
from torch.testing._internal.common_utils import find_free_port, skipIfRocm
2424
from torch.testing._internal.inductor_utils import HAS_CUDA
2525

2626

@@ -231,6 +231,7 @@ def test_schedule(self):
231231
self.assertParses()
232232

233233
@requires_cuda
234+
@skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
234235
def test_cudagraphs(self):
235236
fn_opt = torch.compile(mode="reduce-overhead")(inductor_schedule_fn)
236237
fn_opt(torch.ones(1000, 1000, device="cuda"))

test/functorch/test_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -950,7 +950,7 @@ def fn(inp, *args, **kwargs):
950950
# (3) encountering this error in PyTorch internals.
951951
xfail("index_reduce", "prod"),
952952
decorate(
953-
"linalg.householder_product", decorator=runOnRocm
953+
"linalg.householder_product", decorator=skipIfRocm
954954
),  # skipped on ROCm (see #1420)
955955
xfail(
956956
# nans

test/inductor/test_cuda_repro.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
freeze_rng_state,
3434
IS_FBCODE,
3535
skipIfRocm,
36+
skipIfRocmArch,
3637
TEST_WITH_ASAN,
3738
)
3839

@@ -59,7 +60,7 @@
5960
sys.exit(0)
6061
raise
6162

62-
63+
NAVI_ARCH = ("gfx1100", "gfx1101") # Used for navi exclusive skips on ROCm
6364
TestCase = test_torchinductor.TestCase
6465
ToTuple = test_torchinductor.ToTuple
6566
check_model_cuda = test_torchinductor.check_model_cuda

test/inductor/test_inductor_freezing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
check_model_gpu,
2929
copy_tests,
3030
)
31-
from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM
31+
from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM, skipIfRocm
3232

3333

3434
importlib.import_module("functorch")

test/inductor/test_kernel_benchmark.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from torch.testing import FileCheck
1616
from torch.testing._internal.common_device_type import expectedFailureXPU
1717
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
18-
18+
from torch.testing._internal.common_utils import skipIfRocm
1919

2020
class TestKernelBenchmark(TestCase):
2121
device_type = GPU_TYPE
@@ -151,6 +151,7 @@ def f(a, b):
151151
@expectedFailureXPU
152152
@config.patch(max_autotune=True, max_autotune_gemm_backends="TRITON")
153153
@fresh_inductor_cache()
154+
@skipIfRocm  # This seems to be disabled upstream: https://github.com/pytorch/pytorch/issues/118346
154155
def test_mm_triton_kernel_benchmark(self):
155156
M = 2048
156157
N = 2432

test/inductor/test_torchinductor.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
from torch.testing._internal.common_device_type import (
7272
_has_sufficient_memory,
7373
expectedFailureXPU,
74+
get_desired_device_type_test_bases,
7475
)
7576
from torch.testing._internal.common_dtype import all_types, get_all_dtypes
7677
from torch.testing._internal.common_quantization import (
@@ -89,6 +90,8 @@
8990
skipIfWindows,
9091
skipIfXpu,
9192
subtest,
93+
skipIfRocmArch,
94+
subtest,
9295
TEST_WITH_ASAN,
9396
TEST_WITH_ROCM,
9497
xfailIfS390X,
@@ -124,6 +127,10 @@
124127

125128

126129
HAS_AVX2 = "fbgemm" in torch.backends.quantized.supported_engines
130+
_desired_test_bases = get_desired_device_type_test_bases()
131+
RUN_CPU = any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
132+
RUN_GPU = any(getattr(x, "device_type", "") == GPU_TYPE for x in _desired_test_bases)
133+
NAVI_ARCH = ("gfx1100", "gfx1101") # Used for navi exclusive skips on ROCm
127134

128135
aten = torch.ops.aten
129136

@@ -7057,6 +7064,8 @@ def fn(in_ptr0, in_ptr1, in_ptr2):
70577064
),
70587065
)
70597066

7067+
@skipIfWindows
7068+
@skipIfRocm
70607069
def test_roi_align(self):
70617070
if not has_torchvision_roi_align():
70627071
raise unittest.SkipTest("requires torchvision")
@@ -7909,6 +7918,7 @@ def fn(a, dim, index, b, reduce):
79097918
)
79107919

79117920
@skip_if_gpu_halide
7921+
# issue #1150
79127922
def test_dense_mask_index(self):
79137923
r"""
79147924
There will be a little difference for reduce order between aten and inductor

test/inductor/test_torchinductor_dynamic_shapes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
TEST_CUDA_MEM_LEAK_CHECK,
3333
TEST_WITH_ASAN,
3434
TEST_WITH_ROCM,
35+
skipIfRocm,
3536
)
3637
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU
3738

@@ -241,6 +242,7 @@ def fn(x, y):
241242
self.assertEqual(r, opt_r)
242243

243244
@torch._dynamo.config.patch(capture_scalar_outputs=True)
245+
@skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
244246
def test_unwrap_storage_didnt_work_repro(self, device):
245247
def f():
246248
full = torch.full((), 11)

test/nn/test_convolution.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
parametrize as parametrize_test,
5858
run_tests,
5959
set_default_dtype,
60+
skipIfRocm,
6061
skipIfNotMiopenSuggestNHWC,
6162
skipIfRocmVersionLessThan,
6263
subtest,
@@ -4077,8 +4078,10 @@ def test_conv_double_backward_strided_with_3D_input_and_weight(self, device):
40774078
self.assertEqual(grad_weight.shape, weight.shape)
40784079

40794080
@onlyCUDA
4080-
@largeTensorTest("40GB")
4081-
@largeTensorTest("24GB", "cpu")
4081+
@largeTensorTest('40GB')
4082+
@largeTensorTest('24GB', 'cpu')
4083+
# Temporarily skipped on ROCm - https://ontrack-internal.amd.com/browse/SWDEV-383635
4084+
@skipIfRocm
40824085
def test_conv3d_64bit_indexing(self, device):
40834086
x = torch.rand(1, 32, 512, 512, 256)
40844087
m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False)

test/run_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,9 @@ def __contains__(self, item):
185185
"distributed/_tensor/test_attention",
186186
]
187187

188+
# Keep test_typing out of the ROCm run on Python 3.9 and older.
# NOTE(review): the reason for the 3.9 cut-off is not visible here — confirm.
if sys.version_info < (3, 10):
    ROCM_BLOCKLIST.append("test_typing")
190+
188191
# whitelist of tests for s390x
189192
S390X_TESTLIST = [
190193
"backends/xeon/test_launch.py",

test/test_cuda.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1926,9 +1926,8 @@ def test_graph_capture_oom(self):
19261926
with torch.cuda.graph(torch.cuda.CUDAGraph()):
19271927
torch.zeros(2**40, device="cuda")
19281928

1929-
@unittest.skipIf(
1930-
not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
1931-
)
1929+
@unittest.skipIf(not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs")
1930+
@skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
19321931
@serialTest()
19331932
@setBlasBackendsToDefaultFinally
19341933
def test_repeat_graph_capture_cublas_workspace_memory(self):
@@ -2886,6 +2885,7 @@ def forward(self, input_dict: dict):
28862885
@unittest.skipIf(
28872886
not TEST_CUDA_GRAPH, "CUDA >= 11.0 or ROCM >= 5.3 required for graphs"
28882887
)
2888+
@skipIfRocm(msg="TODO: temp skip on ROCm 6.2")
28892889
def test_graph_make_graphed_callables_same_pool(self):
28902890
torch.manual_seed(5)
28912891
torch.cuda.manual_seed(5)

test/test_fx.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
IS_WINDOWS,
5858
find_library_location,
5959
run_tests,
60+
skipIfRocm,
6061
skipIfTorchDynamo,
6162
)
6263
from torch.testing._internal.jit_utils import JitTestCase

torch/testing/_internal/common_utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1360,6 +1360,14 @@ def printErrors(self) -> None:
13601360
IS_ARM64 = platform.machine() in ('arm64', 'aarch64')
13611361
IS_S390X = platform.machine() == "s390x"
13621362

1363+
def is_navi_arch():
    """Return True when GPU device 0 reports a Navi gfx architecture.

    Returns:
        bool: True if ``torch.cuda`` is available and the ``gcnArchName`` of
        device 0 (up to the first ``":"``) is ``gfx1100``, ``gfx1101`` or
        ``gfx1102``; False otherwise, including when no GPU is available or
        the device properties carry no ``gcnArchName``.
    """
    if not torch.cuda.is_available():
        return False
    prop = torch.cuda.get_device_properties(0)
    # gcnArchName may be absent from the device properties (presumably on
    # non-ROCm builds); fall back to "" so the helper answers False instead
    # of raising AttributeError.
    # Only the text before the first ":" is compared; anything after it is
    # ignored.
    gfx_arch = getattr(prop, "gcnArchName", "").split(":")[0]
    return gfx_arch in ("gfx1100", "gfx1101", "gfx1102")
1370+
13631371
def is_avx512_vnni_supported():
13641372
if sys.platform != 'linux':
13651373
return False
@@ -1840,6 +1848,19 @@ def wrapper(*args, **kwargs):
18401848
return dec_fn(func)
18411849
return dec_fn
18421850

1851+
def skipIfRocmArch(arch: Tuple[str, ...]):
    """Build a decorator that skips a test on the given ROCm GPU architectures.

    Args:
        arch: gfx architecture names (e.g. ``("gfx1100", "gfx1101")``) on
            which the decorated test must not run. A single name passed as a
            plain string is treated as a one-element tuple.

    Returns:
        A decorator for test methods. The wrapped test raises
        ``unittest.SkipTest`` when ``TEST_WITH_ROCM`` is truthy and the
        ``gcnArchName`` of device 0 (up to the first ``":"``) is in ``arch``;
        otherwise it calls the original test with the same arguments.
    """
    # `in` on a bare str would do substring matching ("gfx11" in "gfx1100"),
    # so normalize to a tuple to keep the check an exact name comparison.
    archs = (arch,) if isinstance(arch, str) else tuple(arch)

    def dec_fn(fn):
        @wraps(fn)
        def wrap_fn(self, *args, **kwargs):
            # The device is queried when the test runs, not at decoration time.
            if TEST_WITH_ROCM:
                prop = torch.cuda.get_device_properties(0)
                gfx_arch = prop.gcnArchName.split(":")[0]
                if gfx_arch in archs:
                    # Report this decorator and the architecture that matched.
                    raise unittest.SkipTest(
                        f"skipIfRocmArch: test skipped on {gfx_arch}"
                    )
            return fn(self, *args, **kwargs)
        return wrap_fn
    return dec_fn
1863+
18431864
def runOnRocm(fn):
18441865
@wraps(fn)
18451866
def wrapper(*args, **kwargs):

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4817,7 +4817,11 @@ def _test_ddp_apply_optim_in_backward(
48174817
# set_to_none for regular optimizer to match in backward
48184818
# case.
48194819
optim.zero_grad(set_to_none=True)
4820-
4820+
4821+
@skip_but_pass_in_sandcastle_if(
4822+
BACKEND == "gloo" and HAS_TORCHVISION,
4823+
"Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
4824+
)
48214825
@skip_if_lt_x_gpu(2)
48224826
def test_ddp_apply_optim_in_backward(self):
48234827
for optim_cls, init_before in itertools.product(
@@ -4830,6 +4834,10 @@ def test_ddp_apply_optim_in_backward(self):
48304834
init_before=init_before,
48314835
)
48324836

4837+
@skip_but_pass_in_sandcastle_if(
4838+
BACKEND == "gloo" and HAS_TORCHVISION,
4839+
"Failing with gloo backend + torchvision due to ongoing issue https://github.com/pytorch/pytorch/issues/111834",
4840+
)
48334841
@skip_if_lt_x_gpu(2)
48344842
def test_ddp_apply_optim_in_backward_grad_as_bucket_view_false(self):
48354843
for init_before in [True, False]:

0 commit comments

Comments
 (0)