diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp
index 58248acfe1795..0a4649d9c41ad 100644
--- a/aten/src/ATen/cuda/CUDAContext.cpp
+++ b/aten/src/ATen/cuda/CUDAContext.cpp
@@ -54,15 +54,13 @@ Allocator* getCUDADeviceAllocator() {
 }
 
 /* Handles */
-#ifndef __HIP_PLATFORM_HCC__
-  cusparseHandle_t getCurrentCUDASparseHandle() {
-    return THCState_getCurrentSparseHandle(at::globalContext().getTHCState());
-  }
+cusparseHandle_t getCurrentCUDASparseHandle() {
+  return THCState_getCurrentSparseHandle(at::globalContext().getTHCState());
+}
 
-  cublasHandle_t getCurrentCUDABlasHandle() {
-    return THCState_getCurrentBlasHandle(at::globalContext().getTHCState());
-  }
-#endif
+cublasHandle_t getCurrentCUDABlasHandle() {
+  return THCState_getCurrentBlasHandle(at::globalContext().getTHCState());
+}
 
 } // namespace cuda
 
diff --git a/aten/src/ATen/cuda/CUDAContext.h b/aten/src/ATen/cuda/CUDAContext.h
index 83a890da4d535..3a480d2ca4e4e 100644
--- a/aten/src/ATen/cuda/CUDAContext.h
+++ b/aten/src/ATen/cuda/CUDAContext.h
@@ -59,10 +59,8 @@ CAFFE2_API void uncheckedSetCurrentCUDAStream(CUDAStream stream);
 CAFFE2_API Allocator* getCUDADeviceAllocator();
 
 /* Handles */
-#ifndef __HIP_PLATFORM_HCC__
 CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle();
 CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle();
-#endif
 
 } // namespace cuda
 
diff --git a/test/test_autograd.py b/test/test_autograd.py
index f9ccfb6c958e9..0642e87399c67 100644
--- a/test/test_autograd.py
+++ b/test/test_autograd.py
@@ -1406,6 +1406,7 @@ def test_unused_output(self):
         expected_grad[:2] = grad_output
         self.assertEqual(x.grad.data, expected_grad)
 
+    @skipIfRocm
     def test_ctc_loss(self):
         batch_size = 64
         num_labels = 101
diff --git a/test/test_cuda.py b/test/test_cuda.py
index cdf8d46ce236c..2c647b08cbd60 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -268,11 +268,11 @@ def tmp(t):
     ('div', small_3d, lambda t: [number(3.14, 3, t)], '', types, False,
         "skipIfRocm:ByteTensor,CharTensor,FloatTensor,HalfTensor,ShortTensor"),
     ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
-    ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types, False, "skipIfRocm:HalfTensor"),
-    ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', types, False, "skipIfRocm:HalfTensor"),
-    ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', types, False, "skipIfRocm:HalfTensor"),
-    ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', types, False, "skipIfRocm:HalfTensor"),
-    ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types, False, "skipIfRocm:HalfTensor"),
+    ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types),
+    ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1'),
+    ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2'),
+    ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3'),
+    ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types),
     # HalfTensor gives bad result at pow-2 with data sampled from torch.randn
     ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half, False,
         "skipIfRocm:HalfTensor,FloatTensor"),
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 020486c1fbda3..3d9af20c85965 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -371,6 +371,7 @@ def test_segfault(self):
         finally:
             p.terminate()
 
+    @skipIfRocm
     def test_timeout(self):
         p = ErrorTrackingProcess(target=_test_timeout)
         p.start()
diff --git a/test/test_jit.py b/test/test_jit.py
index 22e7a5f69b467..b2a83e00bcc46 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -7243,6 +7243,7 @@ def test_dcgan_models(self):
         self._test_dcgan_models(self, device='cpu')
 
     @unittest.skipIf(not RUN_CUDA, "no CUDA")
+    @skipIfRocm
     def test_dcgan_models_cuda(self):
         # XXX: export_import on CUDA modules doesn't work (#11480)
         self._test_dcgan_models(self, device='cuda', check_export_import=False)
@@ -7365,11 +7366,13 @@ def test_mnist(self):
         self._test_mnist(self, device='cpu')
 
     @unittest.skipIf(not RUN_CUDA, "no CUDA")
+    @skipIfRocm
     def test_mnist_cuda(self):
         # XXX: export_import on CUDA modules doesn't work (#11480)
         self._test_mnist(self, device='cuda', check_export_import=False)
 
     @unittest.skipIf(not RUN_CUDA, "no CUDA")
+    @skipIfRocm
     def test_mnist_training_leaks_no_memory_cuda(self):
         net = MnistNet().cuda()
         # MnistNet uses dropout, don't check its trace
diff --git a/test/test_nn.py b/test/test_nn.py
index 0d61d72f3ceb6..eee4e3a7c7475 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -4202,6 +4202,7 @@ def get_inputs(input_shape, hidden_shape, mode):
             test(input_shape, hidden_shape, mode)
 
     @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
+    @skipIfRocm
     def test_rnn_check_device(self):
         input_size = 3
         hidden_size = 5
diff --git a/test/test_sparse.py b/test/test_sparse.py
index 0e91dca37d4c3..f95d7256c4042 100644
--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@@ -1033,6 +1033,7 @@ def _all_narrow_combs(self, shape):
                 for length in range(dim_sz - start):
                     yield [dim, start, length]
 
+    @skipIfRocm
     def test_narrow(self):
         shape = [3, 3, 4, 2]
         input, _, _ = self._gen_sparse(4, 19, shape)
@@ -1437,6 +1438,7 @@ def test_tensor(indices, values, indices_equal, values_equal):
         test_tensor(indices, values, False, True)  # An empty tensor's data_ptr is always equal to 0
 
     @cpu_only  # just run once, we test both cpu and cuda
+    @skipIfRocm
     def test_constructor_device_legacy(self):
         i = torch.tensor([[0, 1, 1], [2, 0, 2]])
         v = torch.tensor([3., 4., 5.])
@@ -1583,6 +1585,7 @@ def test_resize(self):
         self._test_resize_shape([1, 1], [1, 2, 3], [2, 2, 3],
                                 [1, 1], [1, 2, 0], [2, 2, 0])
 
+    @skipIfRocm
     def test_is_nonzero(self):
         self.assertTrue(torch.sparse_coo_tensor(([0],), 1., (1,)).is_nonzero())
         self.assertFalse(torch.sparse_coo_tensor(([0],), 0., (1,)).is_nonzero())
diff --git a/test/test_torch.py b/test/test_torch.py
index 84ef8a22e050b..3026548b99043 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -3999,6 +3999,7 @@ def test_is_signed_cuda(self):
         self.assertEqual(torch.cuda.HalfTensor(10).is_signed(), True)
 
     @skipIfNoLapack
+    @skipIfRocm
     def test_gesv(self):
         a = torch.Tensor(((6.80, -2.11, 5.66, 5.97, 8.23),
                           (-6.05, -3.30, 5.36, -4.44, 1.08),
@@ -4130,6 +4131,7 @@ def test_gesv_batched_dims(self):
         self._test_gesv_batched_dims(self, lambda t: t)
 
     @skipIfNoLapack
+    @skipIfRocm
     def test_qr(self):
 
         # Since the QR decomposition is unique only up to the signs of the rows of
@@ -4312,10 +4314,12 @@ def _test_trtrs(self, cast):
         self.assertEqual(res1, tb, 0)
 
     @skipIfNoLapack
+    @skipIfRocm
     def test_trtrs(self):
         self._test_trtrs(self, lambda t: t)
 
     @skipIfNoLapack
+    @skipIfRocm
     def test_gels(self):
         def _test_underdetermined(a, b, expectedNorm):
             m = a.size()[0]
@@ -4431,6 +4435,7 @@ def check_norm(a, b, expected_norm, gels_result):
         self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, 1e-8)
 
     @skipIfNoLapack
+    @skipIfRocm
     def test_eig(self):
         a = torch.Tensor(((1.96, 0.00, 0.00, 0.00, 0.00),
                           (-6.49, 3.80, 0.00, 0.00, 0.00),
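
Note on the Python hunks above: each one either stacks a @skipIfRocm decorator onto a test or drops a "skipIfRocm:..." exclusion string from the test_cuda.py test table. The decorator itself comes from the repo's shared test utilities (common_utils.py at the time of this diff). For readers unfamiliar with it, below is a minimal sketch of how such a skip decorator can be written; the PYTORCH_TEST_WITH_ROCM environment variable name and the skip message are assumptions for illustration, not a verbatim copy of the helper.

import os
import unittest
from functools import wraps

# Assumption: CI signals a ROCm run via an environment variable.
TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1'


def skipIfRocm(fn):
    # Skip the decorated test when running on the ROCm stack,
    # leaving CUDA and CPU runs untouched.
    @wraps(fn)
    def wrapper(*args, **kwargs):
        if TEST_WITH_ROCM:
            raise unittest.SkipTest("test doesn't currently work on ROCm")
        return fn(*args, **kwargs)
    return wrapper

Used as in the hunks above: placing @skipIfRocm beneath @unittest.skipIf(not RUN_CUDA, "no CUDA") skips the test on ROCm builds while keeping the existing CUDA gate intact.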