Add ROCm support

ardfork · BlankParenthesis · ardfork · commit b99271960048 · 2023-06-06T22:55:21.000Z
Co-authored-by: [  ] <blank.parenthesis@gmail.com>
diff --git a/cuda_ext.py b/cuda_ext.py
@@ -53,8 +53,10 @@ def find_msvc():
         os.path.join(library_dir, "exllama_ext/cuda_func/q4_mlp.cu"),
         os.path.join(library_dir, "exllama_ext/cpu_func/rep_penalty.cpp")
     ],
+    extra_include_paths = [os.path.join(library_dir, "exllama_ext")],
     verbose = verbose,
-    extra_ldflags = ["cublas.lib"] if windows else []
+    extra_ldflags = ["cublas.lib"] if windows else [],
+    extra_cuda_cflags = ["-U__HIP_NO_HALF_CONVERSIONS__"] if torch.version.hip else []
     # extra_cflags = ["-ftime-report", "-DTORCH_USE_CUDA_DSA"]
 )
 
diff --git a/exllama_ext/cuda_compat.cuh b/exllama_ext/cuda_compat.cuh
@@ -41,8 +41,8 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
 
 //
 
-#ifdef __CUDA_ARCH__
-#if __CUDA_ARCH__ < 700
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
 
 __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
diff --git a/exllama_ext/cuda_func/half_matmul.cu b/exllama_ext/cuda_func/half_matmul.cu
@@ -2,6 +2,9 @@
 #include "../util.cuh"
 #include "../matrix.cuh"
 #include "../cuda_compat.cuh"
+#if defined(USE_ROCM)
+#include "../hip_compat.cuh"
+#endif
 
 // Block size
 
diff --git a/exllama_ext/cuda_func/half_matmul.cuh b/exllama_ext/cuda_func/half_matmul.cuh
@@ -6,6 +6,12 @@
 #include <cstdint>
 #include <ATen/cuda/CUDAContext.h>
 
+// Workaround for hipify_python using rocblas instead of hipblas: on ROCm builds, include hipBLAS and alias the rocblas_handle name to hipblasHandle_t.
+#if defined(USE_ROCM)
+#include <hipblas/hipblas.h>
+#define rocblas_handle hipblasHandle_t
+#endif
+
 void half_matmul_cuda
 (
     const half* x,
diff --git a/exllama_ext/cuda_func/q4_matmul.cu b/exllama_ext/cuda_func/q4_matmul.cu
@@ -4,6 +4,9 @@
 #include "../matrix.cuh"
 #include "../cuda_compat.cuh"
 #include "../cuda_buffers.cuh"
+#if defined(USE_ROCM)
+#include "../hip_compat.cuh"
+#endif
 
 const int THREADS_X = 32;       // Block size and thread count along columns in w and out
 const int THREADS_Y = 1;        // Block size and thread count along rows in x and out
diff --git a/exllama_ext/cuda_func/q4_matmul.cuh b/exllama_ext/cuda_func/q4_matmul.cuh
@@ -10,6 +10,12 @@
 #include "q4_matrix.cuh"
 #include "../tuning.h"
 
+// Workaround for hipify_python using rocblas instead of hipblas: on ROCm builds, include hipBLAS and alias the rocblas_handle name to hipblasHandle_t.
+#if defined(USE_ROCM)
+#include <hipblas/hipblas.h>
+#define rocblas_handle hipblasHandle_t
+#endif
+
 void q4_matmul_cuda
 (
     ExLlamaTuning* tuningParams,
diff --git a/exllama_ext/cuda_func/q4_mlp.cu b/exllama_ext/cuda_func/q4_mlp.cu
@@ -4,6 +4,9 @@
 #include "../cuda_buffers.cuh"
 #include "../util.cuh"
 #include "../matrix.cuh"
+#if defined(USE_ROCM)
+#include "../hip_compat.cuh"
+#endif
 
 const int THREADS_X = 32;
 const int THREADS_Y = 4;
diff --git a/exllama_ext/hip_compat.cuh b/exllama_ext/hip_compat.cuh
@@ -0,0 +1,40 @@
+#ifndef _hip_compat_cuh
+#define _hip_compat_cuh
+
+// Workaround for a bug in hipamd, backported from upstream: device-side stand-in for hrcp() that passes the `data` member of x's __half_raw view to __builtin_amdgcn_rcph.
+__device__ __forceinline__ __half __compat_hrcp(__half x) {
+    return __half_raw{  // result is built as __half_raw and converted to the __half return type
+        static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
+}
+
+__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {  // two-component counterpart of __compat_hrcp; stands in for h2rcp()
+    return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),  // first component (x.x)
+        static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};  // second component (x.y)
+}
+
+#define hrcp __compat_hrcp
+#define h2rcp __compat_h2rcp
+
+// Workaround for hipify_python using rocblas instead of hipblas: host-side wrapper that forwards all arguments unchanged to hipblasHgemm, reinterpreting each `half` pointer as a `hipblasHalf` pointer, and returns its status.
+__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t    handle,
+                                                               hipblasOperation_t transA,
+                                                               hipblasOperation_t transB,
+                                                               int                m,
+                                                               int                n,
+                                                               int                k,
+                                                               const half*        alpha,
+                                                               const half*        AP,
+                                                               int                lda,
+                                                               const half*        BP,
+                                                               int                ldb,
+                                                               const half*        beta,
+                                                               half*              CP,
+                                                               int                ldc) {
+    return hipblasHgemm(handle, transA, transB, m, n, k, reinterpret_cast<const hipblasHalf*>(alpha), reinterpret_cast<const hipblasHalf*>(AP), lda, reinterpret_cast<const hipblasHalf*>(BP), ldb, reinterpret_cast<const hipblasHalf*>(beta), reinterpret_cast<hipblasHalf*>(CP), ldc);
+}
+
+#define rocblas_handle hipblasHandle_t  // rocBLAS handle name -> hipBLAS handle type
+#define rocblas_operation_none HIPBLAS_OP_N  // rocBLAS "none" operation name -> hipBLAS HIPBLAS_OP_N
+#define rocblas_hgemm __compat_hipblasHgemm  // rocBLAS hgemm name -> the hipblasHgemm wrapper defined in this header
+
+#endif
diff --git a/exllama_ext/util.cuh b/exllama_ext/util.cuh
@@ -6,7 +6,11 @@
 #include <cstdint>
 #include <cstdio>
 
+#if defined(USE_ROCM)
+#define cudaUnspecified hipErrorUnknown  // ROCm builds use hipErrorUnknown; presumably cudaErrorApiFailureBase has no HIP counterpart - TODO confirm
+#else
 #define cudaUnspecified cudaErrorApiFailureBase
+#endif
 
 // React to failure on return code != cudaSuccess