Skip to content

Subtle Vulkan shader compilation bug when running on Adreno GPUs (Samsung Galaxy S23 Ultra) #5186

@l3utterfly

Description

@l3utterfly

GPU info:

QUALCOMM build          : 7b26bdd942, Iab69c31769
                                                                                                    Build Date              : 08/28/23
                                                                                                    Shader Compiler Version : E031.41.03.44
                                                                                                    Local Branch            : 
                                                                                                    Remote Branch           : refs/tags/AU_LINUX_ANDROID_LA.VENDOR.13.2.0.11.00.00.855.659
                                                                                                    Remote Branch           : NONE
                                                                                                    Reconstruct Branch      : NOTHING
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0              com.layla                            I  Build Config            : S P 14.1.4 AArch64
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0              com.layla                            I  Driver Path             : /vendor/lib64/hw/vulkan.adreno.so
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0              com.layla                            I  Driver Version          : 0676.42
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0              com.layla                            I  PFP                     : 0x01740158
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0              com.layla                            I  ME                      : 0x00000000
2024-01-29 16:01:57.278 11189-11514 AdrenoVK-0              com.layla                            I  Application Name    : ggml-vulkan
                                                                                                    Application Version : 0x00000001
                                                                                                    Engine Name         : (null)
                                                                                                    Engine Version      : 0x00000000
                                                                                                    Api Version         : 0x00402000

The problem is in ggml_vk_generate_shaders.py, around line 640, in the dequant_q4_K_body shader template.

The following DOES NOT WORK:

const int y_idx = i * QUANT_K + 64 * il + n * ir;
        const int qs_idx = 32*il + n * ir;

        uint8_t sc;
        uint8_t m;
        if (is < 4) {
            sc = uint8_t(data_a[i].scales[is] & 63);
            m  = uint8_t(data_a[i].scales[is + 4] & 63);
        } else {
            sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
            m  = uint8_t((data_a[i].scales[is + 4] >>  4) | ((data_a[i].scales[is    ] >> 6) << 4));
        }
        const FLOAT_TYPE d1 = dall * sc;
        const FLOAT_TYPE m1 = dmin * m;

        if (is < 4) {
            sc = uint8_t(data_a[i].scales[is + 1] & 63);
            m  = uint8_t(data_a[i].scales[is + 5] & 63);
        } else {
            sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
            m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));
        }
        const FLOAT_TYPE d2 = dall * sc;
        const FLOAT_TYPE m2 = dmin * m;

        [[unroll]] for (int l = 0; l < n; ++l) {
            data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
            data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >>  4) - m2);

This fails at shader compilation with the error: Shader compilation failed for shaderType: 5.

The workaround appears to be tail-duplicating the if-branches (i.e. copying the code that follows each if/else, including the loop, into every branch, so that the branches never reconverge before the loop):

const int y_idx = i * QUANT_K + 64 * il + n * ir;
        const int qs_idx = 32*il + n * ir;

        uint8_t sc;
        uint8_t m;
        if (is < 4) {
            sc = uint8_t(data_a[i].scales[is] & 63);
            m  = uint8_t(data_a[i].scales[is + 4] & 63);

            const FLOAT_TYPE d1 = dall * sc;
            const FLOAT_TYPE m1 = dmin * m;

            if (is < 4) {
                sc = uint8_t(data_a[i].scales[is + 1] & 63);
                m  = uint8_t(data_a[i].scales[is + 5] & 63);

                const FLOAT_TYPE d2 = dall * sc;
                const FLOAT_TYPE m2 = dmin * m;

                [[unroll]] for (int l = 0; l < n; ++l) {
                    data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
                    data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >>  4) - m2);
                }
            } else {
                sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
                m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));

                const FLOAT_TYPE d2 = dall * sc;
                const FLOAT_TYPE m2 = dmin * m;

                [[unroll]] for (int l = 0; l < n; ++l) {
                    data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
                    data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >>  4) - m2);
                }
            }
        } else {
            sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
            m  = uint8_t((data_a[i].scales[is + 4] >>  4) | ((data_a[i].scales[is    ] >> 6) << 4));

            const FLOAT_TYPE d1 = dall * sc;
            const FLOAT_TYPE m1 = dmin * m;

            if (is < 4) {
                sc = uint8_t(data_a[i].scales[is + 1] & 63);
                m  = uint8_t(data_a[i].scales[is + 5] & 63);

                const FLOAT_TYPE d2 = dall * sc;
                const FLOAT_TYPE m2 = dmin * m;

                [[unroll]] for (int l = 0; l < n; ++l) {
                    data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
                    data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >>  4) - m2);
                }
            } else {
                sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
                m  = uint8_t((data_a[i].scales[is + 5] >>  4) | ((data_a[i].scales[is + 1] >> 6) << 4));

                const FLOAT_TYPE d2 = dall * sc;
                const FLOAT_TYPE m2 = dmin * m;

                [[unroll]] for (int l = 0; l < n; ++l) {
                    data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
                    data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >>  4) - m2);
                }
            }
        }

This workaround has been tested and compiles successfully for the Adreno GPU in the Samsung Galaxy S23 Ultra.

This seems to indicate a subtle bug in the Adreno shader compiler. Does anyone know what's going on?

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions