
Commit 38e72f8: Add ROCm support
Parent: 4585e2c
22 files changed: +105 -21 lines

cuda_ext.py

Lines changed: 17 additions & 8 deletions
@@ -11,21 +11,30 @@
 library_dir = os.path.dirname(os.path.abspath(__file__))
 extension_name = "exllama_ext"

+if torch.version.hip:
+    # FIXME: To build, I had to comment "flags += ['-fno-gpu-rdc']" in torch/utils/cpp_extension.py.
+    # I am not sure if it's possible to find a way to build without editing that file.
+    # If building without gpu-rdc, build will error with "lld: error: undefined hidden symbol: __llvm_amdgcn_rcp_f16".
+    extra_cuda_cflags = ["-U__HIP_NO_HALF_CONVERSIONS__", "-fgpu-rdc"]
+else:
+    extra_cuda_cflags = []
+
 exllama_ext = load(
     name = extension_name,
     sources = [
         os.path.join(library_dir, "exllama_ext/cuda_buffers.cu"),
         os.path.join(library_dir, "exllama_ext/cpu_func/rep_penalty.cpp"),
-        os.path.join(library_dir, "exllama_ext/cuda_func/column_remap.cu"),
-        os.path.join(library_dir, "exllama_ext/cuda_func/half_matmul.cu"),
-        os.path.join(library_dir, "exllama_ext/cuda_func/q4v2_matmul.cu"),
-        os.path.join(library_dir, "exllama_ext/cuda_func/q4v2_mlp.cu"),
-        os.path.join(library_dir, "exllama_ext/cuda_func/q4v2_recons.cu"),
-        os.path.join(library_dir, "exllama_ext/cuda_func/q4v2_sequential.cu"),
-        os.path.join(library_dir, "exllama_ext/cuda_func/rms_norm.cu"),
-        os.path.join(library_dir, "exllama_ext/cuda_func/rope.cu"),
+        os.path.join(library_dir, "exllama_ext/cu_func/column_remap.cu"),
+        os.path.join(library_dir, "exllama_ext/cu_func/half_matmul.cu"),
+        os.path.join(library_dir, "exllama_ext/cu_func/q4v2_matmul.cu"),
+        os.path.join(library_dir, "exllama_ext/cu_func/q4v2_mlp.cu"),
+        os.path.join(library_dir, "exllama_ext/cu_func/q4v2_recons.cu"),
+        os.path.join(library_dir, "exllama_ext/cu_func/q4v2_sequential.cu"),
+        os.path.join(library_dir, "exllama_ext/cu_func/rms_norm.cu"),
+        os.path.join(library_dir, "exllama_ext/cu_func/rope.cu"),
         os.path.join(library_dir, "exllama_ext/exllama_ext.cpp")
     ],
+    extra_cuda_cflags = extra_cuda_cflags
     # verbose = True,
     # extra_cflags = ["-ftime-report", "-DTORCH_USE_CUDA_DSA"]
 )
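
For readers unfamiliar with the JIT build path, here is a minimal standalone sketch of the same pattern; it is not taken from this commit and the extension/source names are placeholders. torch.version.hip is a version string on ROCm builds of PyTorch and None on CUDA builds, which is what makes it usable as a backend check, and load() forwards extra_cuda_cflags to whichever device compiler it invokes.

import os
import torch
from torch.utils.cpp_extension import load

# On ROCm builds of PyTorch, torch.version.hip is a version string; on CUDA builds it is None.
if torch.version.hip:
    # Mirror the flags added in this commit (see the FIXME above about -fgpu-rdc).
    extra_cuda_cflags = ["-U__HIP_NO_HALF_CONVERSIONS__", "-fgpu-rdc"]
else:
    extra_cuda_cflags = []

this_dir = os.path.dirname(os.path.abspath(__file__))

# load() JIT-compiles and links the extension; extra_cuda_cflags is passed to the
# device compiler (nvcc on CUDA, hipcc on ROCm).
my_ext = load(
    name = "my_ext",                            # placeholder name
    sources = [
        os.path.join(this_dir, "my_ext.cpp"),   # placeholder sources
        os.path.join(this_dir, "my_kernel.cu"),
    ],
    extra_cuda_cflags = extra_cuda_cflags,
)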

exllama_ext/cuda_func/column_remap.cuh renamed to exllama_ext/cu_func/column_remap.cuh

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 #ifndef _column_remap_cuh
 #define _column_remap_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaError_t hipError_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>

 cudaError_t column_remap_cuda
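
The same USE_ROCM block recurs in each of the headers below. As a design note, here is a hedged sketch of how it could be hoisted into one shared header that every .cuh file then includes; the file name is hypothetical and this is not something the commit adds.

// hip_compat.cuh (hypothetical): single home for the ROCm/CUDA include switch.
#ifndef _hip_compat_cuh
#define _hip_compat_cuh

#if USE_ROCM
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
// Alias the CUDA spellings used throughout the sources to their HIP counterparts.
#define cudaError_t hipError_t
#else
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#endif

#endif // _hip_compat_cuh

Including one such header would avoid keeping eight near-identical copies of the switch in sync.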

exllama_ext/cuda_func/half_matmul.cu renamed to exllama_ext/cu_func/half_matmul.cu

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ cudaError_t half_matmul_cublas_cuda
     const half alpha = __float2half(1.0f);
     const half beta = __float2half(0.0f);

-    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, w, width, x, dim, &beta, out, width);
+    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, reinterpret_cast<const rocblas_half*>(&alpha), reinterpret_cast<const rocblas_half*>(w), width, reinterpret_cast<const rocblas_half*>(x), dim, reinterpret_cast<const rocblas_half*>(&beta), reinterpret_cast<rocblas_half*>(out), width);

     // cudaDeviceSynchronize();
     // _cuda_check(cudaGetLastError());
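
Some context on this one-line change, hedged: as the header change below suggests (cublasHandle_t is defined to rocblas_handle and <rocblas/rocblas.h> is included), PyTorch's hipify step maps this cuBLAS call to its rocBLAS counterpart on ROCm, which takes rocblas_half pointers rather than __half pointers. The two types share the same 16-bit layout, so reinterpret_cast is enough to bridge them; the inline casts assume rocblas_half is in scope, which the USE_ROCM branch of half_matmul.cuh provides. A hypothetical helper that keeps the call site portable could look like this (not part of this commit):

// Hypothetical cast helper, assuming USE_ROCM marks HIP builds.
#if defined(USE_ROCM)
#include <hip/hip_fp16.h>
#include <rocblas/rocblas.h>
typedef rocblas_half blas_half_t;   // rocBLAS's 16-bit float type
#else
#include <cuda_fp16.h>
typedef half blas_half_t;           // cuBLAS consumes __half directly
#endif

// Same bit layout on both backends, so a reinterpret_cast suffices.
static inline const blas_half_t* as_blas_half(const half* p)
{
    return reinterpret_cast<const blas_half_t*>(p);
}

With a helper of this kind, the gemm call could pass as_blas_half(&alpha), as_blas_half(w), and so on, instead of spelling out each cast inline.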

exllama_ext/cuda_func/half_matmul.cuh renamed to exllama_ext/cu_func/half_matmul.cuh

Lines changed: 10 additions & 1 deletion
@@ -1,10 +1,19 @@
 #ifndef _half_matmul_cuh
 #define _half_matmul_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <rocblas/rocblas.h>
+#include <ATen/hip/HIPContext.h>
+#define cudaError_t hipError_t
+#define cublasHandle_t rocblas_handle
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
-#include <cstdint>
 #include <ATen/cuda/CUDAContext.h>
+#endif
+#include <cstdint>

 cudaError_t half_matmul_cuda
 (

exllama_ext/cuda_func/q4v2_matmul.cuh renamed to exllama_ext/cu_func/q4v2_matmul.cuh

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 #ifndef _q4v2_matmul_cuh
 #define _q4v2_matmul_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaError_t hipError_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>
 #include <cstdio>

File renamed without changes.

exllama_ext/cuda_func/q4v2_mlp.cuh renamed to exllama_ext/cu_func/q4v2_mlp.cuh

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 #ifndef _q4v2_mlp_cuh
 #define _q4v2_mlp_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaError_t hipError_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>

 cudaError_t q4v2_mlp_cuda

exllama_ext/cuda_func/q4v2_recons.cuh renamed to exllama_ext/cu_func/q4v2_recons.cuh

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 #ifndef _q4v2_recons_cuh
 #define _q4v2_recons_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaError_t hipError_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>

 cudaError_t q4v2_recons_cuda

exllama_ext/cuda_func/q4v2_sequential.cuh renamed to exllama_ext/cu_func/q4v2_sequential.cuh

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 #ifndef _q4v2_sequential_cuh
 #define _q4v2_sequential_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaError_t hipError_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>
 #include <cstdio>

File renamed without changes.

exllama_ext/cuda_func/rms_norm.cuh renamed to exllama_ext/cu_func/rms_norm.cuh

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 #ifndef _rms_norm_cuh
 #define _rms_norm_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaError_t hipError_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>

 cudaError_t rms_norm_cuda

File renamed without changes.

exllama_ext/cuda_func/rope.cuh renamed to exllama_ext/cu_func/rope.cuh

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 #ifndef _rope_cuh
 #define _rope_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaError_t hipError_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>

 cudaError_t rope_cuda

exllama_ext/cuda_buffers.cuh

Lines changed: 6 additions & 0 deletions
@@ -1,8 +1,14 @@
 #ifndef _cuda_buffers_cuh
 #define _cuda_buffers_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaError_t hipError_t
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>
 #include <cstdio>

exllama_ext/cuda_compat.cuh

Lines changed: 2 additions & 2 deletions
@@ -41,8 +41,8 @@ __device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)

 //

-#ifdef __CUDA_ARCH__
-#if __CUDA_ARCH__ < 700
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)

 __device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
 __device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
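
For context: the guard now also covers ROCm, presumably because the HIP toolchain targeted here does not provide the half overloads of atomicAdd, so the software fallback defined just above this hunk (atomicAdd_half / atomicAdd_half2) is reused. A hedged, minimal sketch of how such a fallback is typically written, which may differ in detail from the definitions in cuda_compat.cuh:

// Sketch only: emulate atomicAdd on a 16-bit half by CAS-ing the containing 32-bit word.
__device__ __forceinline__ void atomicAdd_half(half* address, half val)
{
    unsigned int* base = (unsigned int*)((size_t)address & ~(size_t)2);  // aligned 32-bit word
    bool high = ((size_t)address & 2) != 0;                              // upper or lower half?
    unsigned int old = *base, assumed;
    do
    {
        assumed = old;
        unsigned short cur = high ? (assumed >> 16) : (assumed & 0xffff);
        float sum = __half2float(__ushort_as_half(cur)) + __half2float(val);
        unsigned short upd = __half_as_ushort(__float2half(sum));
        unsigned int word = high ? ((assumed & 0x0000ffffu) | ((unsigned int)upd << 16))
                                 : ((assumed & 0xffff0000u) | upd);
        old = atomicCAS(base, assumed, word);
    }
    while (assumed != old);   // retry if another thread changed the word meanwhile
}

Native hardware atomicAdd on half exists only on sm_70 and newer, which is why the overloads are compiled in only below that architecture or, after this change, on ROCm.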

exllama_ext/exllama_ext.cpp

Lines changed: 8 additions & 8 deletions
@@ -9,14 +9,14 @@
 #include "cpu_func/rep_penalty.h"

 #include "cuda_buffers.cuh"
-#include "cuda_func/column_remap.cuh"
-#include "cuda_func/half_matmul.cuh"
-#include "cuda_func/q4v2_matmul.cuh"
-#include "cuda_func/q4v2_mlp.cuh"
-#include "cuda_func/q4v2_recons.cuh"
-#include "cuda_func/q4v2_sequential.cuh"
-#include "cuda_func/rms_norm.cuh"
-#include "cuda_func/rope.cuh"
+#include "cu_func/column_remap.cuh"
+#include "cu_func/half_matmul.cuh"
+#include "cu_func/q4v2_matmul.cuh"
+#include "cu_func/q4v2_mlp.cuh"
+#include "cu_func/q4v2_recons.cuh"
+#include "cu_func/q4v2_sequential.cuh"
+#include "cu_func/rms_norm.cuh"
+#include "cu_func/rope.cuh"
 #include "util.cuh"

 // Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a

exllama_ext/matrix.cuh

Lines changed: 5 additions & 0 deletions
@@ -1,8 +1,13 @@
 #ifndef _matrix_cuh
 #define _matrix_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#endif

 class MatrixView_half
 {

exllama_ext/util.cuh

Lines changed: 14 additions & 1 deletion
@@ -1,12 +1,25 @@
 #ifndef _util_cuh
 #define _util_cuh

+#if USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaMalloc hipMalloc
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaSuccess hipSuccess
+#define cudaUnspecified hipErrorUnknown
+#else
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
+#define cudaUnspecified cudaErrorApiFailureBase
+#endif
 #include <cstdint>
 #include <cstdio>

-#define cudaUnspecified cudaErrorApiFailureBase

 // React to failure on return code != cudaSuccess
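
The comment above refers to util.cuh's return-code handling, whose body falls outside this hunk. Purely as an illustration of what the hip defines enable, here is a hedged sketch of such a check; the name echoes the _cuda_check(...) call commented out in half_matmul.cu, but the real macro in util.cuh may differ.

#include <cstdio>

// Hypothetical error-check macro. With the defines above, cudaError_t and cudaSuccess
// resolve to hipError_t and hipSuccess on ROCm, so the body compiles unchanged on both backends.
#define _cuda_check(expr)                                                       \
do                                                                              \
{                                                                               \
    cudaError_t _err_ = (expr);                                                 \
    if (_err_ != cudaSuccess)                                                   \
        printf("CUDA error %d at %s:%d\n", (int)_err_, __FILE__, __LINE__);     \
}                                                                               \
while (false)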
