atomicAdd for GPUs with CC < 7

turboderp · turboderp · commit 7d8ca43532a2 · 2023-05-22T22:32:30.000+02:00
diff --git a/exllama_ext/cuda_compat.h b/exllama_ext/cuda_compat.h
@@ -0,0 +1,33 @@
+#ifndef _cuda_compat_h
+#define _cuda_compat_h
+
+// atomicAdd for half types, to support CC < 7.x
+
+__device__ __forceinline__ void atomicAdd_half(half* address, half val)
+{
+    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do
+    {
+        assumed = old;
+        __half_raw hsum;
+        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        half tmpres = __hadd(hsum, val);
+        hsum = __half_raw(tmpres);
+        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+        old = atomicCAS(address_as_ui, assumed, old);
+    }
+    while (assumed != old);
+}
+
+#ifdef __CUDA_ARCH__
+#if __CUDA_ARCH__ < 700
+
+__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
+
+#endif
+#endif
+
+#endif
diff --git a/exllama_ext/q4v2_matmul.cu b/exllama_ext/q4v2_matmul.cu
@@ -2,6 +2,7 @@
 #include "column_remap.h"
 #include "util.h"
 #include "matrix.h"
+#include "cuda_compat.h"
 
 // Block size