-
Notifications
You must be signed in to change notification settings - Fork 13.3k
Closed
Labels
Description
GPU info:
QUALCOMM build : 7b26bdd942, Iab69c31769
Build Date : 08/28/23
Shader Compiler Version : E031.41.03.44
Local Branch :
Remote Branch : refs/tags/AU_LINUX_ANDROID_LA.VENDOR.13.2.0.11.00.00.855.659
Remote Branch : NONE
Reconstruct Branch : NOTHING
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0 com.layla I Build Config : S P 14.1.4 AArch64
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0 com.layla I Driver Path : /vendor/lib64/hw/vulkan.adreno.so
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0 com.layla I Driver Version : 0676.42
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0 com.layla I PFP : 0x01740158
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0 com.layla I ME : 0x00000000
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0 com.layla I Application Name : ggml-vulkan
Application Version : 0x00000001
Engine Name : (null)
Engine Version : 0x00000000
Api Version : 0x00402000
In the file: ggml_vk_generate_shaders.py:640
: dequant_q4_K_body
The following DOES NOT WORK:
const int y_idx = i * QUANT_K + 64 * il + n * ir;
const int qs_idx = 32*il + n * ir;
uint8_t sc;
uint8_t m;
if (is < 4) {
sc = uint8_t(data_a[i].scales[is] & 63);
m = uint8_t(data_a[i].scales[is + 4] & 63);
} else {
sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4));
}
const FLOAT_TYPE d1 = dall * sc;
const FLOAT_TYPE m1 = dmin * m;
if (is < 4) {
sc = uint8_t(data_a[i].scales[is + 1] & 63);
m = uint8_t(data_a[i].scales[is + 5] & 63);
} else {
sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
}
const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m;
[[unroll]] for (int l = 0; l < n; ++l) {
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
This crashes with the issue: Shader compilation failed for shaderType: 5
.
The workaround appears to be "tail-ing" the if-branches (i.e. duplicating the code so the branches do not converge inside the loop).
const int y_idx = i * QUANT_K + 64 * il + n * ir;
const int qs_idx = 32*il + n * ir;
uint8_t sc;
uint8_t m;
if (is < 4) {
sc = uint8_t(data_a[i].scales[is] & 63);
m = uint8_t(data_a[i].scales[is + 4] & 63);
const FLOAT_TYPE d1 = dall * sc;
const FLOAT_TYPE m1 = dmin * m;
if (is < 4) {
sc = uint8_t(data_a[i].scales[is + 1] & 63);
m = uint8_t(data_a[i].scales[is + 5] & 63);
const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m;
[[unroll]] for (int l = 0; l < n; ++l) {
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
}
} else {
sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m;
[[unroll]] for (int l = 0; l < n; ++l) {
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
}
}
} else {
sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4));
const FLOAT_TYPE d1 = dall * sc;
const FLOAT_TYPE m1 = dmin * m;
if (is < 4) {
sc = uint8_t(data_a[i].scales[is + 1] & 63);
m = uint8_t(data_a[i].scales[is + 5] & 63);
const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m;
[[unroll]] for (int l = 0; l < n; ++l) {
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
}
} else {
sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m;
[[unroll]] for (int l = 0; l < n; ++l) {
data_b[y_idx + l ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
}
}
}
This workaround is tested to succeed when compiling for the Adreno GPU in Samsung Galaxy S23 Ultra.
This seems to indicate a subtle bug in the adreno shader compiler? Does anyone know what's going on?
zwilch