Commit b66df9d
CUDA: fix build error from ambiguous __half conversions in conv2d (#15690)
* CUDA: fix build error from ambiguous __half conversions in conv2d

  Building conv2d with half precision failed because `__half` defines multiple implicit conversion operators (to float, int, short, etc.), causing ambiguous overload resolution when multiplying with float. Introduce a templated `to_float` helper that explicitly converts `__half` via `__half2float` while passing float through unchanged. Use this helper in the conv2d accumulation to ensure unambiguous and correct promotion to float. Fixes some build errors with half-precision kernels on CUDA.

  ggml-ci

* CUDA: Replace custom to_float helper with unified ggml_cuda_cast and add half->float conversion

* CUDA: Add missing convert.cuh header

* CUDA: remove unnecessary extension in ggml_cuda_cast

* CUDA: Address review comment, remove second type template argument
1 parent b9382c3 commit b66df9d

File tree

1 file changed, +3 -2 lines changed
ggml/src/ggml-cuda/conv2d.cu

Lines changed: 3 additions & 2 deletions
@@ -1,4 +1,5 @@
 #include "conv2d.cuh"
+#include "convert.cuh"
 
 struct conv_params {
     const int64_t IW, IH;
@@ -94,8 +95,8 @@ static __global__ void conv2d_kernel(const float * __restrict__ input,
                 const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X);
 
                 const float input_val = input[Layout::input_index(n, c_in, in_y, in_x, P)];
-                const float kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)];
-                acc += (input_val * kernel_val);
+                const T kernel_val = kernel[Layout::kernel_index(c_out, c_in, ky, kx, P)];
+                acc += (input_val * ggml_cuda_cast<float>(kernel_val));
             }
         }
     }
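
For context, `__half` (from CUDA's `cuda_fp16.h`) defines implicit conversion operators to several arithmetic types, so an expression like `input_val * kernel_val` with a `__half` kernel value can have more than one viable overload and fail to compile. Below is a minimal sketch of the conversion idea the commit message describes; the plain `to_float` overloads and the `mul_accumulate` kernel are hypothetical stand-ins for illustration, not the actual templated helper or the `ggml_cuda_cast` from `convert.cuh`.

```cuda
// Illustrative sketch only (assumed names, not the real ggml code):
// convert __half to float explicitly so float math never relies on
// __half's ambiguous implicit conversions.
#include <cuda_fp16.h>

// float passes through unchanged.
static __device__ __forceinline__ float to_float(float x) { return x; }

// __half is converted explicitly via __half2float; __half also converts
// implicitly to int, short, etc., which is what makes plain multiplication ambiguous.
static __device__ __forceinline__ float to_float(__half x) { return __half2float(x); }

// Hypothetical kernel templated on the weight type, mirroring how the conv2d
// kernel accumulates input * weight products in float.
template <typename T>
static __global__ void mul_accumulate(const float * input, const T * weights, float * acc, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // With T == __half, writing input[i] * weights[i] directly can hit
        // ambiguous overload resolution; converting explicitly keeps the math in float.
        acc[i] += input[i] * to_float(weights[i]);
    }
}
```

The diff above applies the same idea via `ggml_cuda_cast<float>(kernel_val)`, which centralizes the half-to-float conversion in `convert.cuh` instead of a local helper.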
