Skip to content

Commit dd0e0d9

Browse files
Fix LLAMA_CUDA_F16 check
1 parent efc4672 commit dd0e0d9

File tree

2 files changed

+3
-3
lines changed

2 files changed

+3
-3
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -280,8 +280,8 @@ if (LLAMA_CUBLAS)
280280
# 52 == lowest CUDA 12 standard
281281
# 60 == f16 CUDA intrinsics
282282
# 61 == integer CUDA intrinsics
283-
# 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
284-
if (LLAMA_CUDA_DMMV_F16)
283+
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
284+
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
285285
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
286286
else()
287287
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics

ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2877,7 +2877,7 @@ static __global__ void mul_mat_q(
28772877

28782878
__syncthreads();
28792879

2880-
#if __CUDA_ARCH__ >= 700 // TODO: actually test this with compute capability 7.X cards
2880+
#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
28812881
#pragma unroll
28822882
#endif // __CUDA_ARCH__ >= 700
28832883
for (int k = 0; k < WARP_SIZE/vdr; ++k) {

0 commit comments

Comments
 (0)