diff --git a/aten/src/THC/THCAsmUtils.cuh b/aten/src/THC/THCAsmUtils.cuh
index c419fface4c299..145c2515ea7895 100644
--- a/aten/src/THC/THCAsmUtils.cuh
+++ b/aten/src/THC/THCAsmUtils.cuh
@@ -11,12 +11,11 @@ struct Bitfield<unsigned int> {
   static __device__ __forceinline__
   unsigned int getBitfield(unsigned int val, int pos, int len) {
 #if defined(__HIP_PLATFORM_HCC__)
-    pos &= 0x1f;
-    len &= 0x1f;
+    pos &= 0xff;
+    len &= 0xff;
 
     unsigned int m = (1u << len) - 1u;
-    m <<= pos;
-    return val & m;
+    return (val >> pos) & m;
 #else
     unsigned int ret;
     asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
@@ -27,8 +26,8 @@ struct Bitfield<unsigned int> {
   static __device__ __forceinline__
   unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
 #if defined(__HIP_PLATFORM_HCC__)
-    pos &= 0x1f;
-    len &= 0x1f;
+    pos &= 0xff;
+    len &= 0xff;
 
     unsigned int m = (1u << len) - 1u;
     toInsert &= m;
@@ -50,12 +49,11 @@ struct Bitfield<uint64_t> {
   static __device__ __forceinline__
   uint64_t getBitfield(uint64_t val, int pos, int len) {
 #if defined(__HIP_PLATFORM_HCC__)
-    pos &= 0x1f;
-    len &= 0x1f;
+    pos &= 0xff;
+    len &= 0xff;
 
     uint64_t m = (1u << len) - 1u;
-    m <<= pos;
-    return val & m;
+    return (val >> pos) & m;
 #else
     uint64_t ret;
     asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
@@ -66,8 +64,8 @@ struct Bitfield<uint64_t> {
   static __device__ __forceinline__
   uint64_t setBitfield(uint64_t val, uint64_t toInsert, int pos, int len) {
 #if defined(__HIP_PLATFORM_HCC__)
-    pos &= 0x1f;
-    len &= 0x1f;
+    pos &= 0xff;
+    len &= 0xff;
 
     uint64_t m = (1u << len) - 1u;
     toInsert &= m;
@@ -105,16 +103,18 @@ __device__ __forceinline__ unsigned getLaneMaskLt() {
 #endif
 }
 
-__device__ __forceinline__ unsigned getLaneMaskLe() {
-#if defined(__HIP_PLATFORM_HCC__)
-  std::uint64_t m = (1ull << (getLaneId() + 1ull)) - 1ull;
+#if defined (__HIP_PLATFORM_HCC__)
+__device__ __forceinline__ unsigned long long int getLaneMaskLe() {
+  std::uint64_t m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1));
   return m;
+}
 #else
+__device__ __forceinline__ unsigned getLaneMaskLe() {
   unsigned mask;
   asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
   return mask;
-#endif
 }
+#endif
 
 __device__ __forceinline__ unsigned getLaneMaskGt() {
 #if defined(__HIP_PLATFORM_HCC__)
diff --git a/aten/src/THC/THCDeviceUtils.cuh b/aten/src/THC/THCDeviceUtils.cuh
index 7f16455ff21801..f9c4772ac8f82d 100644
--- a/aten/src/THC/THCDeviceUtils.cuh
+++ b/aten/src/THC/THCDeviceUtils.cuh
@@ -44,6 +44,12 @@ __device__ __forceinline__ unsigned int ACTIVE_MASK()
 #endif
 }
 
+#if defined(__HIP_PLATFORM_HCC__)
+__device__ __forceinline__ unsigned long long int WARP_BALLOT(int predicate)
+{
+  return __ballot(predicate);
+}
+#else
 __device__ __forceinline__ unsigned int WARP_BALLOT(int predicate, unsigned int mask = 0xffffffff)
 {
 #if CUDA_VERSION >= 9000
@@ -52,6 +58,7 @@ __device__ __forceinline__ unsigned int WARP_BALLOT(int predicate, unsigned int
   return __ballot(predicate);
 #endif
 }
+#endif
 
 #ifdef __HIP_PLATFORM_HCC__
 //To handle ambiguity, add a type double version.
diff --git a/aten/src/THC/THCScanUtils.cuh b/aten/src/THC/THCScanUtils.cuh
index d5542383560c32..c03ded0c8dd839 100644
--- a/aten/src/THC/THCScanUtils.cuh
+++ b/aten/src/THC/THCScanUtils.cuh
@@ -159,9 +159,15 @@ __device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunct
 template <typename T, bool KillWARDependency, class BinaryFunction>
 __device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
   // Within-warp, we use warp voting.
+#if defined (__HIP_PLATFORM_HCC__)
+  unsigned long long int vote = WARP_BALLOT(in);
+  T index = __popcll(getLaneMaskLe() & vote);
+  T carry = __popcll(vote);
+#else
   T vote = WARP_BALLOT(in);
   T index = __popc(getLaneMaskLe() & vote);
   T carry = __popc(vote);
+#endif
 
   int warp = threadIdx.x / SCAN_UTILS_WARP_SIZE;
 
@@ -207,7 +213,7 @@ __device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, Bi
   *out -= (T) in;
 
   // The outgoing carry for all threads is the last warp's sum
-  *carry = smem[(blockDim.x / SCAN_UTILS_WARP_SIZE) - 1];
+  *carry = smem[THCCeilDiv<int>(blockDim.x, SCAN_UTILS_WARP_SIZE) - 1];
 
   if (KillWARDependency) {
     __syncthreads();
diff --git a/aten/src/THC/THCTensorTopK.cuh b/aten/src/THC/THCTensorTopK.cuh
index 6d1ef1b6bbfe37..71d1bc98e8e286 100644
--- a/aten/src/THC/THCTensorTopK.cuh
+++ b/aten/src/THC/THCTensorTopK.cuh
@@ -176,7 +176,11 @@ __device__ void countRadixUsingMask(CountType counts[RadixSize],
 #pragma unroll
     for (unsigned int j = 0; j < RadixSize; ++j) {
       bool vote = hasVal && (digitInRadix == j);
+#if defined (__HIP_PLATFORM_HCC__)
+      counts[j] += __popcll(WARP_BALLOT(vote));
+#else
       counts[j] += __popc(WARP_BALLOT(vote, ACTIVE_MASK()));
+#endif
     }
   }
 
diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu
index d60fa0c4da96fb..7d9ade78b74179 100644
--- a/aten/src/THC/generic/THCTensorSort.cu
+++ b/aten/src/THC/generic/THCTensorSort.cu
@@ -309,7 +309,12 @@ void THCTensor_(sort)(THCState* state,
   int maxSliceSize = 2048;
 #endif
 
+#ifdef __HIP_PLATFORM_HCC__
+  // TODO bitonicSortKVInPlace hangs on ROCm currently.
+  if (0) {
+#else
   if (sliceSize <= maxSliceSize) {
+#endif
     // Fill `indices` (the values) with the
     // slice-relative index.
     THCudaLongTensor_fillSliceWithIndex(state, indices, dim);
diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu
index a195dfbe5ca7a8..f8e31a7c2e6371 100644
--- a/aten/src/THC/generic/THCTensorTopK.cu
+++ b/aten/src/THC/generic/THCTensorTopK.cu
@@ -140,7 +140,12 @@ void THCTensor_(topk)(THCState* state,
   if (sorted) {
     // FIXME: the k/v inplace sort along slice only works for size <=
     // 2048 at the moment
+#ifdef __HIP_PLATFORM_HCC__
+    // TODO bitonicSortKVInPlace hangs on ROCm currently.
+    if (0) {
+#else
     if (sliceSize <= 2048) {
+#endif
       // This avoids any memory allocations and performs all sorting
       // work inplace along the slice
       THCTensor_(sortKeyValueInplace)(state, topK, indices, dim, dir);
diff --git a/test/test_torch.py b/test/test_torch.py
index 61b94f5c054082..90561e36ccc4ad 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -6577,6 +6577,7 @@ def test_tensor_shape_empty(self):
         self.assertEqual([(0, 1, 3, 0)], [z.shape for z in torch.split(x, 0, dim=0)])
 
     # functions that operate over a dimension but don't reduce.
+    @skipIfRocm
     def test_dim_function_empty(self):
         devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda']
         for device in devices: