From 48f025543668a33d892e8cffe88d871fdbd0c2fd Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Mon, 29 Jul 2024 20:04:22 +0000
Subject: [PATCH 1/4] Squash a few more warnings

---
 csrc/attention/attention_kernels.cu          | 4 ++--
 csrc/quantization/aqlm/gemm_kernels.cu       | 2 --
 csrc/quantization/fp8/amd/quant_utils.cuh    | 2 ++
 csrc/quantization/fp8/nvidia/quant_utils.cuh | 2 ++
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index 875570a1e894..bcd170411e7c 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -706,7 +706,7 @@ void paged_attention_v1_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);
 
-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);
 
   // NOTE: alibi_slopes is optional.
@@ -865,7 +865,7 @@ void paged_attention_v2_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);
 
-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);
 
   // NOTE: alibi_slopes is optional.
diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu
index 8fb985680086..22da5e4f08a1 100644
--- a/csrc/quantization/aqlm/gemm_kernels.cu
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@@ -273,8 +273,6 @@ __global__ void Code2x8Dequant(
   }
   __syncthreads();
 
-  float res = 0;
-
   int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
   while (iters--) {
     if (pred && a_gl_rd < a_gl_end) {
diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh
index 35123d7fc65d..08c16e8bcdea 100644
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@@ -526,6 +526,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
   }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }
 
 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -536,6 +537,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
   }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }
 
 // The following macro is used to dispatch the conversion function based on
diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/fp8/nvidia/quant_utils.cuh
index cde26dbda18c..b07cb5e89e28 100644
--- a/csrc/quantization/fp8/nvidia/quant_utils.cuh
+++ b/csrc/quantization/fp8/nvidia/quant_utils.cuh
@@ -508,6 +508,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
   }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }
 
 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -520,6 +521,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
   }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }
 
 // The following macro is used to dispatch the conversion function based on

From 072f6d440d80c3c455d4de1c5822145ad27e6789 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Mon, 29 Jul 2024 20:05:15 +0000
Subject: [PATCH 2/4] format

---
 csrc/quantization/fp8/amd/quant_utils.cuh    | 4 ++--
 csrc/quantization/fp8/nvidia/quant_utils.cuh | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/csrc/quantization/fp8/amd/quant_utils.cuh b/csrc/quantization/fp8/amd/quant_utils.cuh
index 08c16e8bcdea..eb66834222f3 100644
--- a/csrc/quantization/fp8/amd/quant_utils.cuh
+++ b/csrc/quantization/fp8/amd/quant_utils.cuh
@@ -526,7 +526,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
   }
 #endif
   assert(false);
-  return {}; // Squash missing return statement warning
+  return {};  // Squash missing return statement warning
 }
 
 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -537,7 +537,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
   }
 #endif
   assert(false);
-  return {}; // Squash missing return statement warning
+  return {};  // Squash missing return statement warning
 }
 
 // The following macro is used to dispatch the conversion function based on
diff --git a/csrc/quantization/fp8/nvidia/quant_utils.cuh b/csrc/quantization/fp8/nvidia/quant_utils.cuh
index b07cb5e89e28..e32684eaed24 100644
--- a/csrc/quantization/fp8/nvidia/quant_utils.cuh
+++ b/csrc/quantization/fp8/nvidia/quant_utils.cuh
@@ -508,7 +508,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
   }
 #endif
   assert(false);
-  return {}; // Squash missing return statement warning
+  return {};  // Squash missing return statement warning
 }
 
 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -521,7 +521,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
   }
 #endif
   assert(false);
-  return {}; // Squash missing return statement warning
+  return {};  // Squash missing return statement warning
 }
 
 // The following macro is used to dispatch the conversion function based on

From 1b9162d70fed610a7642fd636dddb8c7a2c480cb Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Tue, 30 Jul 2024 14:15:15 +0000
Subject: [PATCH 3/4] missed one

---
 csrc/quantization/squeezellm/quant_cuda_kernel.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
index 714907428a1a..91501b5a15e3 100644
--- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu
+++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
@@ -203,7 +203,7 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
 #endif
       mat.data_ptr(),
 #ifndef USE_ROCM
-      (half2*)mul.data(), (__half*)lookup_table.data_ptr(),
+      (half2*)mul.data_ptr(), (__half*)lookup_table.data_ptr(),
 #else
       (float2*)mul.data_ptr(),
       (__half*)lookup_table.data_ptr(),

From 2dfe94700d6440bbe558e5fa6ef1147207967498 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Tue, 30 Jul 2024 14:20:38 +0000
Subject: [PATCH 4/4] format

---
 csrc/quantization/squeezellm/quant_cuda_kernel.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
index 91501b5a15e3..8ed918b3d7c2 100644
--- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu
+++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
@@ -203,7 +203,8 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
 #endif
       mat.data_ptr(),
 #ifndef USE_ROCM
-      (half2*)mul.data_ptr(), (__half*)lookup_table.data_ptr(),
+      (half2*)mul.data_ptr(),
+      (__half*)lookup_table.data_ptr(),
 #else
       (float2*)mul.data_ptr(),
       (__half*)lookup_table.data_ptr(),
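For readers skimming the series: the two warning-suppression idioms it applies are easy to see in isolation. Below is a minimal standalone C++ sketch, illustrative only — `convert` and `check_head_size` are hypothetical names made up for this note, not vLLM functions, and the bodies are simplified stand-ins for the real kernels.

    #include <cassert>

    // Idiom 1: after an unreachable assert(false), add a dummy return.
    // With NDEBUG the assert compiles away, leaving a code path with no
    // return value, which triggers a "missing return statement" warning.
    template <typename Tout, typename Tin>
    Tout convert(const Tin& x) {
      if constexpr (sizeof(Tin) == sizeof(Tout)) {
        return static_cast<Tout>(x);  // the supported conversion path
      }
      assert(false);
      return {};  // Squash missing return statement warning
    }

    // Idiom 2: [[maybe_unused]] on a variable that only feeds an assert,
    // so release (NDEBUG) builds do not emit -Wunused-variable for it.
    void check_head_size(int warp_size, int block_size, int head_size) {
      [[maybe_unused]] int thread_group_size =
          warp_size / block_size > 1 ? warp_size / block_size : 1;
      assert(head_size % thread_group_size == 0);
    }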