Skip to content

Commit 7d8ca43

Browse files
committed
atomicAdd for GPUs with CC < 7
1 parent 64f32bc commit 7d8ca43

File tree

2 files changed

+34
-0
lines changed

2 files changed

+34
-0
lines changed

exllama_ext/cuda_compat.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#ifndef _cuda_compat_h
2+
#define _cuda_compat_h
3+
4+
// atomicAdd for half types, to support CC < 7.x
5+
6+
__device__ __forceinline__ void atomicAdd_half(half* address, half val)
7+
{
8+
unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
9+
unsigned int old = *address_as_ui;
10+
unsigned int assumed;
11+
12+
do
13+
{
14+
assumed = old;
15+
__half_raw hsum;
16+
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
17+
half tmpres = __hadd(hsum, val);
18+
hsum = __half_raw(tmpres);
19+
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
20+
old = atomicCAS(address_as_ui, assumed, old);
21+
}
22+
while (assumed != old);
23+
}
24+
25+
#ifdef __CUDA_ARCH__
26+
#if __CUDA_ARCH__ < 700
27+
28+
__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
29+
30+
#endif
31+
#endif
32+
33+
#endif

exllama_ext/q4v2_matmul.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "column_remap.h"
33
#include "util.h"
44
#include "matrix.h"
5+
#include "cuda_compat.h"
56

67
// Block size
78

0 commit comments

Comments
 (0)