From 433c4f505004afb0e60416f3c1514b2438f7c587 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Tue, 26 Dec 2023 17:37:46 +0200
Subject: [PATCH 01/13] Least squares quantization

---
 ggml-quants.c | 600 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 357 insertions(+), 243 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index a15a240487084..aa793ffc5624c 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <assert.h>
 #include <float.h>
+#include <stdio.h>
 
 #ifdef __ARM_NEON
 
@@ -424,6 +425,138 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
 #endif
 
+static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int qk, float *min, float *d) {
+    // Least squares fits `d * q + m = x` for d and m.
+    float qs = 0.0f;
+    float qs2 = 0.0f;
+    float xs = 0.0f;
+    float xq = 0.0f;
+    float minx = x[0];
+    float maxx = x[0];
+    for (int i = 0; i < qk; i++) {
+        float qf = q[i];
+        qs += qf;
+        qs2 += qf*qf;
+        xs += x[i];
+        xq += x[i]*qf;
+        if (x[i] < minx) minx = x[i];
+        if (x[i] > maxx) maxx = x[i];
+    }
+    float denom = qs*qs - qs2*qk;
+    if (minx == maxx) {
+        *min = x[0];
+        *d = 0.0f;
+    } else if (denom == 0.0f) {
+        *min = 0.0f;
+        *d = 0.0f;
+    } else {
+        *min = (qs*xq - qs2*xs) / denom;
+        *d = (qs*xs - qk*xq) / denom;
+    }
+}
+
+static float lstsq_q_0(const float * restrict q, const float * restrict x, int qk) {
+    // Least squares fits `d * q = x` for d.
+    float qs2 = 0.0f;
+    float xq = 0.0f;
+    for (int i = 0; i < qk; i++) {
+        qs2 += q[i]*q[i];
+        xq += x[i]*q[i];
+    }
+    if (qs2 == 0.0f) {
+        return 0.0f;
+    }
+    return xq / qs2;
+}
+
+static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) {
+    // Least squares fits `d * q = x` for d.
+    float qs2 = 0.0f;
+    float xq = 0.0f;
+    for (int i = 0; i < qk; i++) {
+        qs2 += q[i]*q[i];
+        xq += x[i]*q[i];
+    }
+    if (qs2 == 0.0f) {
+        return 0.0f;
+    }
+    return xq / qs2;
+}
+
+static void lstsq_q_k(const float * restrict q, const float * restrict x, const float * restrict s, int bs, float *min, float *d) {
+    // Least squares fits `d * q - s * m = x` for d and m.
+    float s2 = 0.0f;
+    float qs = 0.0f;
+    float q2 = 0.0f;
+    float sx = 0.0f;
+    float qx = 0.0f;
+    for (int i = 0; i < QK_K; i++) {
+        s2 += s[i/bs]*s[i/bs];
+        qs += q[i]*s[i/bs];
+        q2 += q[i]*q[i];
+        sx += s[i/bs]*x[i];
+        qx += q[i]*x[i];
+    }
+    float denom = qs*qs - q2*s2;
+    if (s2 == 0.0f) {
+        // All s are zero.
+        *min = 0.0f;
+        *d = qx / q2;
+    } else if (denom == 0.0f) {
+        *min = 0.0f;
+        *d = 0.0f;
+    } else {
+        *min = (q2*sx - qs*qx) / denom;
+        *d = (qs*sx - qx*s2) / denom;
+    }
+}
+
+static void quantize_1_0min(const float * restrict x, int n, int bits, uint8_t * restrict q, float * scale) {
+    // Least squares fits `d * q = x` for d with unsigned q.
+    float max = -FLT_MAX;
+    for (int l = 0; l < n; l++) {
+        const float v = fabsf(x[l]);
+        if (v > max) max = v;
+    }
+
+    float d  = max / ((1 << bits) - 1);
+    *scale = d;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    for (int l = 0; l < n; l++) {
+        const float x0 = x[l]*id;
+
+        const uint8_t xi0 = MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f));
+
+        q[l] = xi0;
+    }
+    *scale = lstsq_q_0_u8(q, x, n);
+}
+
+static void quantize_1(const float * restrict x, int n, int bits, uint8_t * restrict q, float * m, float * scale) {
+    float min = FLT_MAX;
+    float max = -FLT_MAX;
+    for (int l = 0; l < n; l++) {
+        const float v = x[l];
+
+        if (v < min) min = v;
+        if (v > max) max = v;
+    }
+
+    float d  = (max - min) / ((1 << bits) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    for (int l = 0; l < n; l++) {
+        const float x0 = (x[l] - min)*id;
+
+        const uint8_t xi0 = MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f));
+
+        q[l] = xi0;
+    }
+
+    lstsq_q_1(q, x, n, m, scale);
+}
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     static const int qk = QK4_0;
@@ -474,32 +607,18 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
+        uint8_t q[qk];
+        float min;
+        float scale;
+        quantize_1(&x[i*qk], qk, 4, q, &min, &scale);
 
-            if (v < min) min = v;
-            if (v > max) max = v;
+        for (int j = 0; j < qk/2; ++j) {
+            y[i].qs[j]  = q[j];
+            y[i].qs[j] |= q[j + qk/2] << 4;
         }
 
-        const float d  = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = GGML_FP32_TO_FP16(scale);
         y[i].m = GGML_FP32_TO_FP16(min);
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
-
-            y[i].qs[j]  = xi0;
-            y[i].qs[j] |= xi1 << 4;
-        }
     }
 }
 
@@ -563,30 +682,16 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
+        uint8_t q[qk];
+        float min;
+        float scale;
+        quantize_1(&x[i*qk], qk, 5, q, &min, &scale);
 
         uint32_t qh = 0;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
-            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+            const uint8_t xi0 = q[j];
+            const uint8_t xi1 = q[j + qk/2];
 
             y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
 
@@ -595,6 +700,9 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
             qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
         }
 
+        y[i].d = GGML_FP32_TO_FP16(scale);
+        y[i].m = GGML_FP32_TO_FP16(min);
+
         memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
     }
 }
@@ -1318,130 +1426,6 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
-        int ntry, float alpha) {
-    float min = x[0];
-    float max = x[0];
-    for (int i = 1; i < n; ++i) {
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-    }
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = 0;
-        return 0.f;
-    }
-    if (min > 0) min = 0;
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    for (int itry = 0; itry < ntry; ++itry) {
-        float sumlx = 0; int suml2 = 0;
-        bool did_change = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            if (l != L[i]) {
-                L[i] = l;
-                did_change = true;
-            }
-            sumlx += (x[i] - min)*l;
-            suml2 += l*l;
-        }
-        scale = sumlx/suml2;
-        float sum = 0;
-        for (int i = 0; i < n; ++i) {
-            sum += x[i] - scale*L[i];
-        }
-        min = alpha*min + (1 - alpha)*sum/n;
-        if (min > 0) min = 0;
-        iscale = 1/scale;
-        if (!did_change) break;
-    }
-    *the_min = -min;
-    return scale;
-}
-
-static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
-        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
-        float rmin, float rdelta, int nstep, bool use_mad) {
-    float min = x[0];
-    float max = x[0];
-    float sum_w = weights[0];
-    float sum_x = sum_w * x[0];
-#ifdef HAVE_BUGGY_APPLE_LINKER
-    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
-    for (volatile int i = 1; i < n; ++i) {
-#else
-    for (int i = 1; i < n; ++i) {
-#endif
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-        float w = weights[i];
-        sum_w += w;
-        sum_x += w * x[i];
-    }
-    if (min > 0) min = 0;
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = -min;
-        return 0.f;
-    }
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    float best_mad = 0;
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale*(x[i] - min));
-        L[i] = MAX(0, MIN(nmax, l));
-        float diff = scale * L[i] + min - x[i];
-        diff = use_mad ? fabsf(diff) : diff * diff;
-        float w = weights[i];
-        best_mad += w * diff;
-    }
-    if (nstep < 1) {
-        *the_min = -min;
-        return scale;
-    }
-    for (int is = 0; is <= nstep; ++is) {
-        iscale = (rmin + rdelta*is + nmax)/(max - min);
-        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            Laux[i] = l;
-            float w = weights[i];
-            sum_l += w*l;
-            sum_l2 += w*l*l;
-            sum_xl += w*l*x[i];
-        }
-        float D = sum_w * sum_l2 - sum_l * sum_l;
-        if (D > 0) {
-            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
-            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
-            if (this_min > 0) {
-                this_min = 0;
-                this_scale = sum_xl / sum_l2;
-            }
-            float mad = 0;
-            for (int i = 0; i < n; ++i) {
-                float diff = this_scale * Laux[i] + this_min - x[i];
-                diff = use_mad ? fabsf(diff) : diff * diff;
-                float w = weights[i];
-                mad += w * diff;
-            }
-            if (mad < best_mad) {
-                for (int i = 0; i < n; ++i) {
-                    L[i] = Laux[i];
-                }
-                best_mad = mad;
-                scale = this_scale;
-                min = this_min;
-            }
-        }
-    }
-    *the_min = -min;
-    return scale;
-}
-
 #if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
     if (j < 4) {
@@ -1460,8 +1444,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
-    uint8_t Laux[16];
-    float   weights[16];
     float mins[QK_K/16];
     float scales[QK_K/16];
 
@@ -1469,24 +1451,43 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
 
     for (int i = 0; i < nb; i++) {
         float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
-            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
+        float max_min = FLT_MAX;
+
+        int all_positive = 1;
+        for (int j = 0; j < QK_K; j++) {
+            if (x[j] < 0.0f) {
+                all_positive = 0;
+                max_min = -FLT_MAX;
+                break;
             }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
+        }
+
+redo_pos2:
+
+        for (int j = 0; j < QK_K/16; j++) {
+            uint8_t q[QK_K/16];
+            quantize_1(&x[16*j], 16, 2, q, &mins[j], &scales[j]);
+            mins[j] = -mins[j];
+            if ((!all_positive) && (mins[j] < 0)) {
+                mins[j] = 0.0f;
+                quantize_1_0min(&x[16*j], 16, 2, q, &scales[j]);
+            }
+            if (scales[j] > max_scale) {
+                max_scale = scales[j];
+            }
+            if (!all_positive && mins[j] > max_min) {
+                max_min = mins[j];
+            } else if (all_positive && mins[j] < max_min) {
+                max_min = mins[j];
             }
         }
 
-        if (max_scale > 0) {
+        int all_zero_lm = 1;
+        if (max_scale != 0) {
             float iscale = q4scale/max_scale;
             for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(iscale*scales[j]);
+                int l = MAX(0, MIN(63, nearest_int(iscale*scales[j])));
+                if (l != 0) all_zero_lm = 0;
                 y[i].scales[j] = l;
             }
             y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale);
@@ -1494,27 +1495,52 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
             for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
             y[i].d = GGML_FP32_TO_FP16(0.f);
         }
-        if (max_min > 0) {
+        if (max_min != 0) {
             float iscale = q4scale/max_min;
             for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(iscale*mins[j]);
+                int l = MAX(0, MIN(63, nearest_int(iscale*mins[j])));
                 y[i].scales[j] |= (l << 4);
             }
             y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale);
         } else {
             y[i].dmin = GGML_FP32_TO_FP16(0.f);
         }
+
+        if (all_zero_lm && !all_positive) {
+            all_positive = 1;
+            //printf("**********red_pos2\n");
+            goto redo_pos2;
+        } else if (all_zero_lm) {
+            //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
+        }
+
+
+        float q_fit[QK_K];
+        float q_m[QK_K/16];
+
         for (int j = 0; j < QK_K/16; ++j) {
-            const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF);
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4);
+            uint8_t sc = y[i].scales[j] & 0xF;
+            uint8_t m = y[i].scales[j] >> 4;
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            q_m[j] = (y[i].scales[j] >> 4);
             for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int((x[16*j + ii] + dm)/d);
-                l = MAX(0, MIN(3, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int((x[16*j + ii] + dm)/d);
+                    l = MAX(0, MIN(3, l));
+                }
                 L[16*j + ii] = l;
+                q_fit[16*j + ii] = sc * l;
             }
         }
 
+        float min;
+        float d;
+        lstsq_q_k(q_fit, x, q_m, 16, &min, &d);
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].dmin = GGML_FP32_TO_FP16(min);
+
 #if QK_K == 256
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; ++l) {
@@ -1634,17 +1660,19 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
         }
 
         int8_t sc;
+        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
             sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
             float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) {
-                continue;
-            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int(x[16*j + ii]/d);
+                    l = MAX(-4, MIN(3, l));
+                }
                 L[16*j + ii] = l + 4;
+                q_fit[16*j + ii] = l * sc;
             }
         }
 #else
@@ -1664,20 +1692,24 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
             }
             y[i].d = GGML_FP32_TO_FP16(0.f);
         }
+        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4;
             float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8);
-            if (!d) {
-                continue;
-            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int(x[16*j + ii]/d);
+                    l = MAX(-4, MIN(3, l));
+                }
                 L[16*j + ii] = l + 4;
+                q_fit[16*j + ii] = l * (s - 8);
             }
         }
 #endif
 
+        y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K));
+
         memset(y[i].hmask, 0, QK_K/8);
         // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
         int m = 0;
@@ -1812,40 +1844,55 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
-    uint8_t Laux[32];
-    float   weights[32];
     float mins[QK_K/32];
     float scales[QK_K/32];
 
     for (int i = 0; i < nb; i++) {
-
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
+
+        int all_positive = 1;
+        for (int j = 0; j < QK_K; j++) {
+            if (x[j] < 0.0f) {
+                all_positive = 0;
+                break;
+            }
+        }
+
+redo_pos4:
+
+        for (int j = 0; j < QK_K/32; j++) {
+            uint8_t q[QK_K/32];
+            quantize_1(&x[32*j], 32, 4, q, &mins[j], &scales[j]);
+            mins[j] = -mins[j];
+            if ((!all_positive) && (mins[j] < 0)) {
+                mins[j] = 0.0f;
+                quantize_1_0min(&x[32*j], 32, 4, q, &scales[j]);
             }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
+
+            if (j == 0 || scales[j] > max_scale) {
+                max_scale = scales[j];
+            }
+            if (j == 0) {
+                max_min = mins[j];
+            }
+            if (!all_positive && mins[j] > max_min) {
+                max_min = mins[j];
+            } else if (all_positive && mins[j] < max_min) {
+                max_min = mins[j];
             }
         }
 
 #if QK_K == 256
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+        float inv_scale = max_scale == 0.0f ? 0.0f : 63.f/max_scale;
+        float inv_min   = max_min  == 0.0f ? 0.0f : 63.f/max_min;
+        int all_zero_lm = 1;
         for (int j = 0; j < QK_K/32; ++j) {
             uint8_t ls = nearest_int(inv_scale*scales[j]);
             uint8_t lm = nearest_int(inv_min*mins[j]);
             ls = MIN(63, ls);
             lm = MIN(63, lm);
+            if (lm != 0) all_zero_lm = 0;
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -1855,21 +1902,43 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
+
+        if (all_zero_lm && !all_positive) {
+            all_positive = 1;
+            //printf("**********red_pos4\n");
+            goto redo_pos4;
+        } else if (all_zero_lm) {
+            //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
+        }
+
         y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
         y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+        float q_fit[QK_K];
+        float q_m[QK_K/32];
 
         uint8_t sc, m;
         for (int j = 0; j < QK_K/32; ++j) {
             get_scale_min_k4(j, y[i].scales, &sc, &m);
             const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
             const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            q_m[j] = m;
             for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(15, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int((x[32*j + ii] + dm)/d);
+                    l = MAX(0, MIN(15, l));
+                }
                 L[32*j + ii] = l;
+                q_fit[32*j + ii] = sc * l;
             }
         }
+
+        float min;
+        float d;
+        lstsq_q_k(q_fit, x, q_m, 32, &min, &d);
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].dmin = GGML_FP32_TO_FP16(min);
+
 #else
         const float s_factor = 15.f;
         float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;
@@ -1980,8 +2049,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     uint8_t L[QK_K];
     float mins[QK_K/32];
     float scales[QK_K/32];
-    float weights[32];
-    uint8_t Laux[32];
 #else
     int8_t L[QK_K];
     float scales[QK_K/16];
@@ -1993,30 +2060,50 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
+
+        int all_positive = 1;
+        for (int j = 0; j < QK_K; j++) {
+            if (x[j] < 0.0f) {
+                all_positive = 0;
+                break;
+            }
+        }
+
+redo_pos:
+
+        for (int j = 0; j < QK_K/32; j++) {
+            uint8_t q[QK_K/32];
+            quantize_1(&x[32*j], 32, 5, q, &mins[j], &scales[j]);
+            mins[j] = -mins[j];
+            if ((!all_positive) && (mins[j] < 0)) {
+                mins[j] = 0.0f;
+                quantize_1_0min(&x[32*j], 32, 5, q, &scales[j]);
+            }
+
+            if (j == 0 || scales[j] > max_scale) {
+                max_scale = scales[j];
             }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
+
+            if (j == 0) {
+                max_min = mins[j];
+            }
+            if (!all_positive && mins[j] > max_min) {
+                max_min = mins[j];
+            } else if (all_positive && mins[j] < max_min) {
+                max_min = mins[j];
             }
         }
 
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+
+        float inv_scale = max_scale == 0.0f ? 0.f : 63.f/max_scale;
+        float inv_min   = max_min == 0.0f ? 0.f : 63.f/max_min;
+        int all_zero_lm = 1;
         for (int j = 0; j < QK_K/32; ++j) {
             uint8_t ls = nearest_int(inv_scale*scales[j]);
             uint8_t lm = nearest_int(inv_min*mins[j]);
             ls = MIN(63, ls);
             lm = MIN(63, lm);
+            if (lm != 0) all_zero_lm = 0;
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -2026,22 +2113,45 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
+
+        if (all_positive) {
+            //printf("all_pos: %f %f %d\n", max_scale, max_min, all_zero_lm);
+        }
+
+        if (all_zero_lm && !all_positive) {
+            all_positive = 1;
+            //printf("**********red_pos\n");
+            goto redo_pos;
+        }
+
         y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
         y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+        float q_fit[QK_K];
+        float q_m[QK_K/32];
 
         uint8_t sc, m;
         for (int j = 0; j < QK_K/32; ++j) {
             get_scale_min_k4(j, y[i].scales, &sc, &m);
             const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
             const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            q_m[j] = m;
             for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(31, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int((x[32*j + ii] + dm)/d);
+                    l = MAX(0, MIN(31, l));
+                }
                 L[32*j + ii] = l;
+                q_fit[32*j + ii] = sc * l;
             }
         }
 
+        float min;
+        float d;
+        lstsq_q_k(q_fit, x, q_m, 32, &min, &d);
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].dmin = GGML_FP32_TO_FP16(min);
+
         uint8_t * restrict qh = y[i].qh;
         uint8_t * restrict ql = y[i].qs;
         memset(qh, 0, QK_K/8);
@@ -2216,18 +2326,22 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
             y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
         }
 
+        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
-            if (!d) {
-                continue;
-            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-32, MIN(31, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int(x[16*j + ii]/d);
+                    l = MAX(-32, MIN(31, l));
+                }
                 L[16*j + ii] = l + 32;
+                q_fit[16*j + ii] = l * y[i].scales[j];
             }
         }
 
+        y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K));
+
         uint8_t * restrict ql = y[i].ql;
         uint8_t * restrict qh = y[i].qh;
 #if QK_K == 256

From 0b6207ef61f529b3e5e0340c6e07b33b370c4044 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Wed, 27 Dec 2023 07:11:02 +0200
Subject: [PATCH 02/13] Better k quantization

---
 ggml-quants.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/ggml-quants.c b/ggml-quants.c
index aa793ffc5624c..06481048f5ce5 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1892,6 +1892,35 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
             uint8_t lm = nearest_int(inv_min*mins[j]);
             ls = MIN(63, ls);
             lm = MIN(63, lm);
+            float rms = 0.0f;
+            float rms2 = 0.0f;
+            float rms3 = 0.0f;
+            uint8_t lm2 = MIN(63, MAX(0, lm - 1));
+            uint8_t lm3 = MIN(63, MAX(0, lm + 1));
+            const float d1 = max_scale / 63.0f;
+            const float dmin1 = max_min / 63.0f;
+            for (int ii = 0; ii < 32; ii++) {
+                const float d = d1 * ls;
+                const float dm1 = dmin1 * lm;
+                const float dm2 = dmin1 * lm2;
+                const float dm3 = dmin1 * lm3;
+                if (!d) continue;
+                int l1 = nearest_int((x[32*j + ii] + dm1)/d);
+                l1 = MAX(0, MIN(15, l1));
+                int l2 = nearest_int((x[32*j + ii] + dm2)/d);
+                l2 = MAX(0, MIN(15, l2));
+                int l3 = nearest_int((x[32*j + ii] + dm3)/d);
+                l3 = MAX(0, MIN(15, l3));
+                rms += ((d*l1 - dm1) - x[32*j + ii]) * ((d*l1 - dm1) - x[32*j + ii]);
+                rms2 += ((d*l2 - dm2) - x[32*j + ii]) * ((d*l2 - dm2) - x[32*j + ii]);
+                rms3 += ((d*l3 - dm3) - x[32*j + ii]) * ((d*l3 - dm3) - x[32*j + ii]);
+            }
+            if (rms2 < rms && rms2 < rms3) {
+                lm = lm2;
+            }
+            if (rms3 < rms && rms3 < rms2) {
+                lm = lm3;
+            }
             if (lm != 0) all_zero_lm = 0;
             if (j < 4) {
                 y[i].scales[j] = ls;

From d1af0a3a945fd8300718ddacea786508376831e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Wed, 27 Dec 2023 09:44:38 +0200
Subject: [PATCH 03/13] Quantization loop

---
 ggml-quants.c | 109 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 67 insertions(+), 42 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 06481048f5ce5..0c3b0d42df6d6 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -498,7 +498,7 @@ static void lstsq_q_k(const float * restrict q, const float * restrict x, const
         qx += q[i]*x[i];
     }
     float denom = qs*qs - q2*s2;
-    if (s2 == 0.0f) {
+    if (s2 == 0.0f && q2 != 0.0f) {
         // All s are zero.
         *min = 0.0f;
         *d = qx / q2;
@@ -1859,8 +1859,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
             }
         }
 
-redo_pos4:
-
         for (int j = 0; j < QK_K/32; j++) {
             uint8_t q[QK_K/32];
             quantize_1(&x[32*j], 32, 4, q, &mins[j], &scales[j]);
@@ -1883,45 +1881,57 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
             }
         }
 
+        int ql_loop = 0;
+quant_loop: ;
 #if QK_K == 256
         float inv_scale = max_scale == 0.0f ? 0.0f : 63.f/max_scale;
         float inv_min   = max_min  == 0.0f ? 0.0f : 63.f/max_min;
-        int all_zero_lm = 1;
         for (int j = 0; j < QK_K/32; ++j) {
             uint8_t ls = nearest_int(inv_scale*scales[j]);
             uint8_t lm = nearest_int(inv_min*mins[j]);
+            uint8_t best_lm = lm;
+            uint8_t best_ls = ls;
             ls = MIN(63, ls);
             lm = MIN(63, lm);
-            float rms = 0.0f;
-            float rms2 = 0.0f;
-            float rms3 = 0.0f;
-            uint8_t lm2 = MIN(63, MAX(0, lm - 1));
-            uint8_t lm3 = MIN(63, MAX(0, lm + 1));
+            float best_rms = FLT_MAX;
             const float d1 = max_scale / 63.0f;
             const float dmin1 = max_min / 63.0f;
-            for (int ii = 0; ii < 32; ii++) {
-                const float d = d1 * ls;
-                const float dm1 = dmin1 * lm;
-                const float dm2 = dmin1 * lm2;
-                const float dm3 = dmin1 * lm3;
-                if (!d) continue;
-                int l1 = nearest_int((x[32*j + ii] + dm1)/d);
-                l1 = MAX(0, MIN(15, l1));
-                int l2 = nearest_int((x[32*j + ii] + dm2)/d);
-                l2 = MAX(0, MIN(15, l2));
-                int l3 = nearest_int((x[32*j + ii] + dm3)/d);
-                l3 = MAX(0, MIN(15, l3));
-                rms += ((d*l1 - dm1) - x[32*j + ii]) * ((d*l1 - dm1) - x[32*j + ii]);
-                rms2 += ((d*l2 - dm2) - x[32*j + ii]) * ((d*l2 - dm2) - x[32*j + ii]);
-                rms3 += ((d*l3 - dm3) - x[32*j + ii]) * ((d*l3 - dm3) - x[32*j + ii]);
-            }
-            if (rms2 < rms && rms2 < rms3) {
-                lm = lm2;
-            }
-            if (rms3 < rms && rms3 < rms2) {
-                lm = lm3;
+            int limit = 1;
+            if (ql_loop) limit = 4;
+            for (int lst = MAX(0, ls-limit); lst <= MIN(63, ls+limit); lst++) {
+                for (int lmt = MAX(0, lm-limit); lmt <= MIN(63, lm+limit); lmt++) {
+                    float rms = 0.0f;
+                    for (int ii = 0; ii < 32; ii++) {
+                        const float d = d1 * lst;
+                        const float dm1 = dmin1 * lmt;
+                        int l1 = 0;
+                        if (d) {
+                            l1 = nearest_int((x[32*j + ii] + dm1)/d);
+                            l1 = MAX(0, MIN(15, l1));
+                        }
+                        float e = ((d*l1 - dm1) - x[32*j + ii]);
+                        rms += e*e;
+                    }
+                    if (rms < best_rms) {
+                        best_lm = lmt;
+                        best_ls = lst;
+                        best_rms = rms;
+                    }
+                }
             }
-            if (lm != 0) all_zero_lm = 0;
+            //if (lm != best_lm) {
+            //    printf("best %d, orig %d\n", best_lm, lm);
+            //}
+            lm = best_lm;
+            ls = best_ls;
+            //if (rms2 < rms && rms2 < rms3) {
+            //    printf("rms2 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3);
+            //    lm = lm2;
+            //}
+            //if (rms3 < rms && rms3 < rms2) {
+            //    printf("rms3 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3);
+            //    lm = lm3;
+            //}
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -1932,13 +1942,14 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
             }
         }
 
-        if (all_zero_lm && !all_positive) {
-            all_positive = 1;
-            //printf("**********red_pos4\n");
-            goto redo_pos4;
-        } else if (all_zero_lm) {
-            //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
-        }
+        //if (all_zero_lm && !all_positive && !ql_loop) {
+        //    all_positive = 1;
+        //    //printf("**********red_pos4\n");
+        //    goto redo_pos4;
+        //}
+        //} else if (all_zero_lm) {
+        //    //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
+        //}
 
         y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
         y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
@@ -1962,12 +1973,26 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
             }
         }
 
-        float min;
-        float d;
-        lstsq_q_k(q_fit, x, q_m, 32, &min, &d);
-        y[i].d = GGML_FP32_TO_FP16(d);
+        //printf("%d orig: %f %f, ", ql_loop, max_min, max_scale);
+        float min, scale;
+        lstsq_q_k(q_fit, x, q_m, 32, &min, &scale);
+        if (min != min) {
+            printf("min nan\n");
+        }
+        if (scale != scale) {
+            printf("scale nan\n");
+        }
+        //printf("fit: %f %f\n", max_min, max_scale);
+        y[i].d = GGML_FP32_TO_FP16(scale);
         y[i].dmin = GGML_FP32_TO_FP16(min);
+        //printf("%f %f, %f %f\n", max_min, min * 63.0f, max_scale, scale * 63.0f);
+        max_scale = GGML_FP16_TO_FP32(y[i].d) * 63.0f;
+        max_min = GGML_FP16_TO_FP32(y[i].dmin) * 63.0f;
 
+        ql_loop++;
+        if (ql_loop == 1) {
+            goto quant_loop;
+        }
 #else
         const float s_factor = 15.f;
         float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;

From d6313d83857c3e481a70c5bd7b371c284d19ce34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Wed, 27 Dec 2023 12:57:43 +0200
Subject: [PATCH 04/13] Refactor

---
 ggml-quants.c | 271 +++++++++++++++++++++++++-------------------------
 1 file changed, 135 insertions(+), 136 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 0c3b0d42df6d6..a6b6c6a7bbd8b 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1303,6 +1303,135 @@ static inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
+static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_size, int * block_scales, int * block_mins, uint8_t * L, ggml_fp16_t * block_scale, ggml_fp16_t * block_min) {
+    float max_scale = 0.0f;
+    float max_min = 0.0f;
+
+    // If all the weight are positive we can invert the sign of min.
+    // Otherwise blocks with all positive weights need to be quantized with zero
+    // min, because min scale is unsigned.
+    int all_positive = 1;
+    for (int j = 0; j < QK_K; j++) {
+        if (x[j] < 0.0f) {
+            all_positive = 0;
+            break;
+        }
+    }
+
+    float scales[QK_K];
+    float mins[QK_K];
+
+    for (int j = 0; j < QK_K/block_size; j++) {
+        uint8_t q[QK_K/block_size];
+        // First find least squares solution for min and scale for each block.
+        quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]);
+        // Flip the sign because quantize_1 assumes that min is added, but min
+        // is subtracted in k-quants.
+        mins[j] = -mins[j];
+        if (!all_positive && mins[j] < 0) {
+            // All weights are positive in this block, but some blocks have
+            // negative weights. Find new least squares scale with zero min.
+            mins[j] = 0.0f;
+            quantize_1_0min(&x[block_size*j], block_size, bits, q, &scales[j]);
+        }
+        if (scales[j] > max_scale) {
+            max_scale = scales[j];
+        }
+        if (j == 0) {
+            max_min = mins[j];
+        } else if (!all_positive && mins[j] > max_min) {
+            max_min = mins[j];
+        } else if (all_positive && mins[j] < max_min) {
+            max_min = mins[j];
+        }
+    }
+
+    int max_group = (1 << scale_bits) - 1;
+
+    // Increasing passes would decrease RMS error by miniscule amount with
+    // drawback of taking a lot more time.
+    for(int pass = 0; pass < 2; pass++) {
+        float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale;
+        float inv_min   = max_min == 0.0f ? 0.0f : max_group/max_min;
+        for (int j = 0; j < QK_K/block_size; ++j) {
+            uint8_t ls = nearest_int(inv_scale*scales[j]);
+            uint8_t lm = nearest_int(inv_min*mins[j]);
+            uint8_t best_lm = lm;
+            uint8_t best_ls = ls;
+            ls = MIN(max_group, ls);
+            lm = MIN(max_group, lm);
+            float best_rms = FLT_MAX;
+            const float d1 = max_scale / max_group;
+            const float dmin1 = max_min / max_group;
+            int limit = 1;
+            // Increase limit for minor RMS error decrease while increasing the
+            // quantization run time.
+            if (pass > 0) limit = 8;
+            // Due to quantization the best ls and lm might not be the nearest
+            // to the ones obtained by the round to nearest.
+            // Loop through few nearby choices and choose lm and ls that
+            // minimize RMS error.
+            for (int lst = MAX(0, ls-limit); lst <= MIN(max_group, ls+limit); lst++) {
+                for (int lmt = MAX(0, lm-limit); lmt <= MIN(max_group, lm+limit); lmt++) {
+                    float rms = 0.0f;
+                    for (int ii = 0; ii < block_size; ii++) {
+                        const float d = d1 * lst;
+                        const float dm1 = dmin1 * lmt;
+                        int l1 = 0;
+                        if (d) {
+                            l1 = nearest_int((x[block_size*j + ii] + dm1)/d);
+                            l1 = MAX(0, MIN((1 << bits) - 1, l1));
+                        }
+                        const float e = (d*l1 - dm1) - x[block_size*j + ii];
+                        rms += e*e;
+                    }
+                    if (rms < best_rms) {
+                        best_lm = lmt;
+                        best_ls = lst;
+                        best_rms = rms;
+                    }
+                }
+            }
+            block_scales[j] = best_ls;
+            block_mins[j] = best_lm;
+        }
+
+        float block_d = max_scale/max_group;
+        float block_dmin = max_min/max_group;
+        float q_fit[QK_K];
+        float q_m[QK_K];
+
+        // Quantize elements and populate arrays needed for least squares fit.
+        for (int j = 0; j < QK_K/block_size; ++j) {
+            const float d = block_d * block_scales[j];
+            const float dm = block_dmin * block_mins[j];
+            q_m[j] = block_mins[j];
+            for (int ii = 0; ii < block_size; ++ii) {
+                int l = 0;
+                if (d) {
+                    l = nearest_int((x[block_size*j + ii] + dm)/d);
+                    l = MAX(0, MIN((1 << bits) - 1, l));
+                }
+                L[block_size*j + ii] = l;
+                q_fit[block_size*j + ii] = block_scales[j] * l;
+            }
+        }
+
+        // Least squares fit min and scale.
+        float min, scale;
+        lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale);
+        // Check for nans.
+        assert(min == min);
+        assert(scale == scale);
+        // Quantize to fp16 for the next pass.
+        max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group;
+        max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group;
+
+        *block_scale = GGML_FP32_TO_FP16(scale);
+        *block_min = GGML_FP32_TO_FP16(min);
+    }
+}
+
 static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) {
     float max = 0;
     float amax = 0;
@@ -1844,94 +1973,16 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
 
     for (int i = 0; i < nb; i++) {
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-
-        int all_positive = 1;
-        for (int j = 0; j < QK_K; j++) {
-            if (x[j] < 0.0f) {
-                all_positive = 0;
-                break;
-            }
-        }
-
-        for (int j = 0; j < QK_K/32; j++) {
-            uint8_t q[QK_K/32];
-            quantize_1(&x[32*j], 32, 4, q, &mins[j], &scales[j]);
-            mins[j] = -mins[j];
-            if ((!all_positive) && (mins[j] < 0)) {
-                mins[j] = 0.0f;
-                quantize_1_0min(&x[32*j], 32, 4, q, &scales[j]);
-            }
-
-            if (j == 0 || scales[j] > max_scale) {
-                max_scale = scales[j];
-            }
-            if (j == 0) {
-                max_min = mins[j];
-            }
-            if (!all_positive && mins[j] > max_min) {
-                max_min = mins[j];
-            } else if (all_positive && mins[j] < max_min) {
-                max_min = mins[j];
-            }
-        }
-
-        int ql_loop = 0;
-quant_loop: ;
 #if QK_K == 256
-        float inv_scale = max_scale == 0.0f ? 0.0f : 63.f/max_scale;
-        float inv_min   = max_min  == 0.0f ? 0.0f : 63.f/max_min;
+        int block_scales[QK_K/32];
+        int block_mins[QK_K/32];
+        quantize_q_k_1(x, 4, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
+
         for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            uint8_t best_lm = lm;
-            uint8_t best_ls = ls;
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            float best_rms = FLT_MAX;
-            const float d1 = max_scale / 63.0f;
-            const float dmin1 = max_min / 63.0f;
-            int limit = 1;
-            if (ql_loop) limit = 4;
-            for (int lst = MAX(0, ls-limit); lst <= MIN(63, ls+limit); lst++) {
-                for (int lmt = MAX(0, lm-limit); lmt <= MIN(63, lm+limit); lmt++) {
-                    float rms = 0.0f;
-                    for (int ii = 0; ii < 32; ii++) {
-                        const float d = d1 * lst;
-                        const float dm1 = dmin1 * lmt;
-                        int l1 = 0;
-                        if (d) {
-                            l1 = nearest_int((x[32*j + ii] + dm1)/d);
-                            l1 = MAX(0, MIN(15, l1));
-                        }
-                        float e = ((d*l1 - dm1) - x[32*j + ii]);
-                        rms += e*e;
-                    }
-                    if (rms < best_rms) {
-                        best_lm = lmt;
-                        best_ls = lst;
-                        best_rms = rms;
-                    }
-                }
-            }
-            //if (lm != best_lm) {
-            //    printf("best %d, orig %d\n", best_lm, lm);
-            //}
-            lm = best_lm;
-            ls = best_ls;
-            //if (rms2 < rms && rms2 < rms3) {
-            //    printf("rms2 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3);
-            //    lm = lm2;
-            //}
-            //if (rms3 < rms && rms3 < rms2) {
-            //    printf("rms3 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3);
-            //    lm = lm3;
-            //}
+            int ls = block_scales[j];
+            int lm = block_mins[j];
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -1941,58 +1992,6 @@ quant_loop: ;
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
-
-        //if (all_zero_lm && !all_positive && !ql_loop) {
-        //    all_positive = 1;
-        //    //printf("**********red_pos4\n");
-        //    goto redo_pos4;
-        //}
-        //} else if (all_zero_lm) {
-        //    //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
-        //}
-
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-        float q_fit[QK_K];
-        float q_m[QK_K/32];
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            q_m[j] = m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = 0;
-                if (d) {
-                    l = nearest_int((x[32*j + ii] + dm)/d);
-                    l = MAX(0, MIN(15, l));
-                }
-                L[32*j + ii] = l;
-                q_fit[32*j + ii] = sc * l;
-            }
-        }
-
-        //printf("%d orig: %f %f, ", ql_loop, max_min, max_scale);
-        float min, scale;
-        lstsq_q_k(q_fit, x, q_m, 32, &min, &scale);
-        if (min != min) {
-            printf("min nan\n");
-        }
-        if (scale != scale) {
-            printf("scale nan\n");
-        }
-        //printf("fit: %f %f\n", max_min, max_scale);
-        y[i].d = GGML_FP32_TO_FP16(scale);
-        y[i].dmin = GGML_FP32_TO_FP16(min);
-        //printf("%f %f, %f %f\n", max_min, min * 63.0f, max_scale, scale * 63.0f);
-        max_scale = GGML_FP16_TO_FP32(y[i].d) * 63.0f;
-        max_min = GGML_FP16_TO_FP32(y[i].dmin) * 63.0f;
-
-        ql_loop++;
-        if (ql_loop == 1) {
-            goto quant_loop;
-        }
 #else
         const float s_factor = 15.f;
         float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;

From f478136773f69a9a229eb8c8685f7c6e6cfe2732 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Wed, 27 Dec 2023 15:13:23 +0200
Subject: [PATCH 05/13] Fix other than 4 quants

---
 ggml-quants.c | 239 ++++++++------------------------------------------
 1 file changed, 37 insertions(+), 202 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index a6b6c6a7bbd8b..9c1e3e0e424b4 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1349,7 +1349,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
     int max_group = (1 << scale_bits) - 1;
 
     // Increasing passes would decrease RMS error by miniscule amount with
-    // drawback of taking a lot more time.
+    // drawback of taking more time.
     for(int pass = 0; pass < 2; pass++) {
         float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale;
         float inv_min   = max_min == 0.0f ? 0.0f : max_group/max_min;
@@ -1399,7 +1399,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
         float block_d = max_scale/max_group;
         float block_dmin = max_min/max_group;
         float q_fit[QK_K];
-        float q_m[QK_K];
+        float q_m[QK_K/block_size];
 
         // Quantize elements and populate arrays needed for least squares fit.
         for (int j = 0; j < QK_K/block_size; ++j) {
@@ -1573,103 +1573,16 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
-    float mins[QK_K/16];
-    float scales[QK_K/16];
-
-    const float q4scale = 15.f;
 
     for (int i = 0; i < nb; i++) {
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = FLT_MAX;
-
-        int all_positive = 1;
-        for (int j = 0; j < QK_K; j++) {
-            if (x[j] < 0.0f) {
-                all_positive = 0;
-                max_min = -FLT_MAX;
-                break;
-            }
-        }
-
-redo_pos2:
-
-        for (int j = 0; j < QK_K/16; j++) {
-            uint8_t q[QK_K/16];
-            quantize_1(&x[16*j], 16, 2, q, &mins[j], &scales[j]);
-            mins[j] = -mins[j];
-            if ((!all_positive) && (mins[j] < 0)) {
-                mins[j] = 0.0f;
-                quantize_1_0min(&x[16*j], 16, 2, q, &scales[j]);
-            }
-            if (scales[j] > max_scale) {
-                max_scale = scales[j];
-            }
-            if (!all_positive && mins[j] > max_min) {
-                max_min = mins[j];
-            } else if (all_positive && mins[j] < max_min) {
-                max_min = mins[j];
-            }
-        }
-
-        int all_zero_lm = 1;
-        if (max_scale != 0) {
-            float iscale = q4scale/max_scale;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int l = MAX(0, MIN(63, nearest_int(iscale*scales[j])));
-                if (l != 0) all_zero_lm = 0;
-                y[i].scales[j] = l;
-            }
-            y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale);
-        } else {
-            for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
-            y[i].d = GGML_FP32_TO_FP16(0.f);
-        }
-        if (max_min != 0) {
-            float iscale = q4scale/max_min;
-            for (int j = 0; j < QK_K/16; ++j) {
-                int l = MAX(0, MIN(63, nearest_int(iscale*mins[j])));
-                y[i].scales[j] |= (l << 4);
-            }
-            y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale);
-        } else {
-            y[i].dmin = GGML_FP32_TO_FP16(0.f);
-        }
-
-        if (all_zero_lm && !all_positive) {
-            all_positive = 1;
-            //printf("**********red_pos2\n");
-            goto redo_pos2;
-        } else if (all_zero_lm) {
-            //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
-        }
-
-
-        float q_fit[QK_K];
-        float q_m[QK_K/16];
+        int block_scales[QK_K/16];
+        int block_mins[QK_K/16];
+        quantize_q_k_1(x, 2, 4, 16, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
 
         for (int j = 0; j < QK_K/16; ++j) {
-            uint8_t sc = y[i].scales[j] & 0xF;
-            uint8_t m = y[i].scales[j] >> 4;
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            q_m[j] = (y[i].scales[j] >> 4);
-            for (int ii = 0; ii < 16; ++ii) {
-                int l = 0;
-                if (d) {
-                    l = nearest_int((x[16*j + ii] + dm)/d);
-                    l = MAX(0, MIN(3, l));
-                }
-                L[16*j + ii] = l;
-                q_fit[16*j + ii] = sc * l;
-            }
+            y[i].scales[j] = (block_mins[j] << 4) | (block_scales[j]);
         }
 
-        float min;
-        float d;
-        lstsq_q_k(q_fit, x, q_m, 16, &min, &d);
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].dmin = GGML_FP32_TO_FP16(min);
-
 #if QK_K == 256
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; ++l) {
@@ -2110,121 +2023,43 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     for (int i = 0; i < nb; i++) {
 
 #if QK_K == 256
-
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-
-        int all_positive = 1;
-        for (int j = 0; j < QK_K; j++) {
-            if (x[j] < 0.0f) {
-                all_positive = 0;
-                break;
-            }
-        }
-
-redo_pos:
-
-        for (int j = 0; j < QK_K/32; j++) {
-            uint8_t q[QK_K/32];
-            quantize_1(&x[32*j], 32, 5, q, &mins[j], &scales[j]);
-            mins[j] = -mins[j];
-            if ((!all_positive) && (mins[j] < 0)) {
-                mins[j] = 0.0f;
-                quantize_1_0min(&x[32*j], 32, 5, q, &scales[j]);
-            }
-
-            if (j == 0 || scales[j] > max_scale) {
-                max_scale = scales[j];
-            }
-
-            if (j == 0) {
-                max_min = mins[j];
-            }
-            if (!all_positive && mins[j] > max_min) {
-                max_min = mins[j];
-            } else if (all_positive && mins[j] < max_min) {
-                max_min = mins[j];
-            }
-        }
-
-
-        float inv_scale = max_scale == 0.0f ? 0.f : 63.f/max_scale;
-        float inv_min   = max_min == 0.0f ? 0.f : 63.f/max_min;
-        int all_zero_lm = 1;
-        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            if (lm != 0) all_zero_lm = 0;
-            if (j < 4) {
-                y[i].scales[j] = ls;
-                y[i].scales[j+4] = lm;
-            } else {
-                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-                y[i].scales[j-4] |= ((ls >> 4) << 6);
-                y[i].scales[j-0] |= ((lm >> 4) << 6);
-            }
-        }
-
-        if (all_positive) {
-            //printf("all_pos: %f %f %d\n", max_scale, max_min, all_zero_lm);
-        }
-
-        if (all_zero_lm && !all_positive) {
-            all_positive = 1;
-            //printf("**********red_pos\n");
-            goto redo_pos;
+    int block_scales[QK_K/32];
+    int block_mins[QK_K/32];
+    quantize_q_k_1(x, 5, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
+
+    for (int j = 0; j < QK_K/32; ++j) {
+        int ls = block_scales[j];
+        int lm = block_mins[j];
+        if (j < 4) {
+            y[i].scales[j] = ls;
+            y[i].scales[j+4] = lm;
+        } else {
+            y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+            y[i].scales[j-4] |= ((ls >> 4) << 6);
+            y[i].scales[j-0] |= ((lm >> 4) << 6);
         }
+    }
 
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-        float q_fit[QK_K];
-        float q_m[QK_K/32];
+    uint8_t * restrict qh = y[i].qh;
+    uint8_t * restrict ql = y[i].qs;
+    memset(qh, 0, QK_K/8);
 
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            q_m[j] = m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = 0;
-                if (d) {
-                    l = nearest_int((x[32*j + ii] + dm)/d);
-                    l = MAX(0, MIN(31, l));
-                }
-                L[32*j + ii] = l;
-                q_fit[32*j + ii] = sc * l;
+    uint8_t m1 = 1, m2 = 2;
+    for (int n = 0; n < QK_K; n += 64) {
+        for (int j = 0; j < 32; ++j) {
+            int l1 = L[n + j];
+            if (l1 > 15) {
+                l1 -= 16; qh[j] |= m1;
             }
-        }
-
-        float min;
-        float d;
-        lstsq_q_k(q_fit, x, q_m, 32, &min, &d);
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].dmin = GGML_FP32_TO_FP16(min);
-
-        uint8_t * restrict qh = y[i].qh;
-        uint8_t * restrict ql = y[i].qs;
-        memset(qh, 0, QK_K/8);
-
-        uint8_t m1 = 1, m2 = 2;
-        for (int n = 0; n < QK_K; n += 64) {
-            for (int j = 0; j < 32; ++j) {
-                int l1 = L[n + j];
-                if (l1 > 15) {
-                    l1 -= 16; qh[j] |= m1;
-                }
-                int l2 = L[n + j + 32];
-                if (l2 > 15) {
-                    l2 -= 16; qh[j] |= m2;
-                }
-                ql[j] = l1 | (l2 << 4);
+            int l2 = L[n + j + 32];
+            if (l2 > 15) {
+                l2 -= 16; qh[j] |= m2;
             }
-            m1 <<= 2; m2 <<= 2;
-            ql += 32;
+            ql[j] = l1 | (l2 << 4);
         }
+        m1 <<= 2; m2 <<= 2;
+        ql += 32;
+    }
 #else
         float max_scale = 0, amax = 0;
         for (int j = 0; j < QK_K/16; ++j) {

From 3fafecae9eb642ab872ea95c63b53e0a240d9223 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Thu, 28 Dec 2023 15:55:12 +0200
Subject: [PATCH 06/13] Weighted least squares

---
 ggml-quants.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 9c1e3e0e424b4..a2054acd99733 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -433,16 +433,19 @@ static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int
     float xq = 0.0f;
     float minx = x[0];
     float maxx = x[0];
+    float q1 = 0.0f;
     for (int i = 0; i < qk; i++) {
         float qf = q[i];
-        qs += qf;
-        qs2 += qf*qf;
-        xs += x[i];
-        xq += x[i]*qf;
+        float w = fabsf(x[i]);
+        q1 += w;
+        qs += w*qf;
+        qs2 += w*qf*qf;
+        xs += w*x[i];
+        xq += w*x[i]*qf;
         if (x[i] < minx) minx = x[i];
         if (x[i] > maxx) maxx = x[i];
     }
-    float denom = qs*qs - qs2*qk;
+    float denom = qs*qs - qs2*q1;
     if (minx == maxx) {
         *min = x[0];
         *d = 0.0f;
@@ -451,7 +454,7 @@ static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int
         *d = 0.0f;
     } else {
         *min = (qs*xq - qs2*xs) / denom;
-        *d = (qs*xs - qk*xq) / denom;
+        *d = (qs*xs - q1*xq) / denom;
     }
 }
 
@@ -491,11 +494,12 @@ static void lstsq_q_k(const float * restrict q, const float * restrict x, const
     float sx = 0.0f;
     float qx = 0.0f;
     for (int i = 0; i < QK_K; i++) {
-        s2 += s[i/bs]*s[i/bs];
-        qs += q[i]*s[i/bs];
-        q2 += q[i]*q[i];
-        sx += s[i/bs]*x[i];
-        qx += q[i]*x[i];
+        float w = fabsf(x[i]);
+        s2 += w*s[i/bs]*s[i/bs];
+        qs += w*q[i]*s[i/bs];
+        q2 += w*q[i]*q[i];
+        sx += w*s[i/bs]*x[i];
+        qx += w*q[i]*x[i];
     }
     float denom = qs*qs - q2*s2;
     if (s2 == 0.0f && q2 != 0.0f) {

From 2a21bd8ab24ee563af4b4ecee3b47d064175a283 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Thu, 28 Dec 2023 17:13:39 +0200
Subject: [PATCH 07/13] Remove Q_K 0 type changes

---
 ggml-quants.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index a2054acd99733..61d9eeec19232 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1754,8 +1754,6 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
         }
 #endif
 
-        y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K));
-
         memset(y[i].hmask, 0, QK_K/8);
         // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
         int m = 0;
@@ -2232,8 +2230,6 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
             }
         }
 
-        y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K));
-
         uint8_t * restrict ql = y[i].ql;
         uint8_t * restrict qh = y[i].qh;
 #if QK_K == 256

From 8386034e08519c8817668b4651d64a353df0965d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Fri, 29 Dec 2023 18:26:29 +0200
Subject: [PATCH 08/13] Revert quants other than q4_k and q5_k

---
 ggml-quants.c | 234 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 188 insertions(+), 46 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 61d9eeec19232..eb7da29adf562 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -5,7 +5,6 @@
 #include <string.h>
 #include <assert.h>
 #include <float.h>
-#include <stdio.h>
 
 #ifdef __ARM_NEON
 
@@ -530,7 +529,7 @@ static void quantize_1_0min(const float * restrict x, int n, int bits, uint8_t *
     for (int l = 0; l < n; l++) {
         const float x0 = x[l]*id;
 
-        const uint8_t xi0 = MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f));
+        const uint8_t xi0 = MAX(0, MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f)));
 
         q[l] = xi0;
     }
@@ -611,18 +610,32 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        uint8_t q[qk];
-        float min;
-        float scale;
-        quantize_1(&x[i*qk], qk, 4, q, &min, &scale);
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
 
-        for (int j = 0; j < qk/2; ++j) {
-            y[i].qs[j]  = q[j];
-            y[i].qs[j] |= q[j + qk/2] << 4;
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+
+            if (v < min) min = v;
+            if (v > max) max = v;
         }
 
-        y[i].d = GGML_FP32_TO_FP16(scale);
+        const float d  = (max - min) / ((1 << 4) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
         y[i].m = GGML_FP32_TO_FP16(min);
+
+        for (int j = 0; j < qk/2; ++j) {
+            const float x0 = (x[i*qk + 0    + j] - min)*id;
+            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
+
+            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
+            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
+
+            y[i].qs[j]  = xi0;
+            y[i].qs[j] |= xi1 << 4;
+        }
     }
 }
 
@@ -686,16 +699,30 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        uint8_t q[qk];
-        float min;
-        float scale;
-        quantize_1(&x[i*qk], qk, 5, q, &min, &scale);
+        float min = FLT_MAX;
+        float max = -FLT_MAX;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i*qk + j];
+
+            if (v < min) min = v;
+            if (v > max) max = v;
+        }
+
+        const float d  = (max - min) / ((1 << 5) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].m = GGML_FP32_TO_FP16(min);
 
         uint32_t qh = 0;
 
         for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xi0 = q[j];
-            const uint8_t xi1 = q[j + qk/2];
+            const float x0 = (x[i*qk + 0    + j] - min)*id;
+            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
+
+            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
+            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
 
             y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
 
@@ -704,9 +731,6 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
             qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
         }
 
-        y[i].d = GGML_FP32_TO_FP16(scale);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
         memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
     }
 }
@@ -1559,6 +1583,87 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
+    for (int i = 1; i < n; ++i) {
+#endif
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
 #if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
     if (j < 4) {
@@ -1577,14 +1682,59 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float   weights[16];
+    float mins[QK_K/16];
+    float scales[QK_K/16];
+
+    const float q4scale = 15.f;
 
     for (int i = 0; i < nb; i++) {
-        int block_scales[QK_K/16];
-        int block_mins[QK_K/16];
-        quantize_q_k_1(x, 2, 4, 16, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
+        float max_scale = 0; // as we are deducting the min, scales are always positive
+        float max_min = 0;
+        for (int j = 0; j < QK_K/16; ++j) {
+            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
+            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
+            float scale = scales[j];
+            if (scale > max_scale) {
+                max_scale = scale;
+            }
+            float min = mins[j];
+            if (min > max_min) {
+                max_min = min;
+            }
+        }
 
+        if (max_scale > 0) {
+            float iscale = q4scale/max_scale;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(iscale*scales[j]);
+                y[i].scales[j] = l;
+            }
+            y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale);
+        } else {
+            for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
+            y[i].d = GGML_FP32_TO_FP16(0.f);
+        }
+        if (max_min > 0) {
+            float iscale = q4scale/max_min;
+            for (int j = 0; j < QK_K/16; ++j) {
+                int l = nearest_int(iscale*mins[j]);
+                y[i].scales[j] |= (l << 4);
+            }
+            y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale);
+        } else {
+            y[i].dmin = GGML_FP32_TO_FP16(0.f);
+        }
         for (int j = 0; j < QK_K/16; ++j) {
-            y[i].scales[j] = (block_mins[j] << 4) | (block_scales[j]);
+            const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF);
+            if (!d) continue;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4);
+            for (int ii = 0; ii < 16; ++ii) {
+                int l = nearest_int((x[16*j + ii] + dm)/d);
+                l = MAX(0, MIN(3, l));
+                L[16*j + ii] = l;
+            }
         }
 
 #if QK_K == 256
@@ -1706,19 +1856,17 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
         }
 
         int8_t sc;
-        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
             sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
             float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            if (!d) {
+                continue;
+            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = 0;
-                if (d) {
-                    l = nearest_int(x[16*j + ii]/d);
-                    l = MAX(-4, MIN(3, l));
-                }
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-4, MIN(3, l));
                 L[16*j + ii] = l + 4;
-                q_fit[16*j + ii] = l * sc;
             }
         }
 #else
@@ -1738,18 +1886,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
             }
             y[i].d = GGML_FP32_TO_FP16(0.f);
         }
-        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4;
             float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8);
+            if (!d) {
+                continue;
+            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = 0;
-                if (d) {
-                    l = nearest_int(x[16*j + ii]/d);
-                    l = MAX(-4, MIN(3, l));
-                }
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-4, MIN(3, l));
                 L[16*j + ii] = l + 4;
-                q_fit[16*j + ii] = l * (s - 8);
             }
         }
 #endif
@@ -2015,8 +2161,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 
 #if QK_K == 256
     uint8_t L[QK_K];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
 #else
     int8_t L[QK_K];
     float scales[QK_K/16];
@@ -2216,17 +2360,15 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
             y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
         }
 
-        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
+            if (!d) {
+                continue;
+            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = 0;
-                if (d) {
-                    l = nearest_int(x[16*j + ii]/d);
-                    l = MAX(-32, MIN(31, l));
-                }
+                int l = nearest_int(x[16*j + ii]/d);
+                l = MAX(-32, MIN(31, l));
                 L[16*j + ii] = l + 32;
-                q_fit[16*j + ii] = l * y[i].scales[j];
             }
         }
 

From 5a02328d1f872e2c73bd5e5cbb69a9dc792dd547 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Sat, 30 Dec 2023 18:31:46 +0200
Subject: [PATCH 09/13] No second least squares pass

---
 ggml-quants.c | 39 ++++++++++++++-------------------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index eb7da29adf562..c2628badfdeb9 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <assert.h>
 #include <float.h>
+#include <stdio.h>
 
 #ifdef __ARM_NEON
 
@@ -457,20 +458,6 @@ static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int
     }
 }
 
-static float lstsq_q_0(const float * restrict q, const float * restrict x, int qk) {
-    // Least squares fits `d * q = x` for d.
-    float qs2 = 0.0f;
-    float xq = 0.0f;
-    for (int i = 0; i < qk; i++) {
-        qs2 += q[i]*q[i];
-        xq += x[i]*q[i];
-    }
-    if (qs2 == 0.0f) {
-        return 0.0f;
-    }
-    return xq / qs2;
-}
-
 static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) {
     // Least squares fits `d * q = x` for d.
     float qs2 = 0.0f;
@@ -1445,18 +1432,20 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
             }
         }
 
-        // Least squares fit min and scale.
-        float min, scale;
-        lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale);
-        // Check for nans.
-        assert(min == min);
-        assert(scale == scale);
-        // Quantize to fp16 for the next pass.
-        max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group;
-        max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group;
+        if (pass == 0) {
+            // Least squares fit min and scale.
+            float min, scale;
+            lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale);
+            // Check for nans.
+            assert(min == min);
+            assert(scale == scale);
+            // Quantize to fp16 for the next pass.
+            max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group;
+            max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group;
 
-        *block_scale = GGML_FP32_TO_FP16(scale);
-        *block_min = GGML_FP32_TO_FP16(min);
+            *block_scale = GGML_FP32_TO_FP16(scale);
+            *block_min = GGML_FP32_TO_FP16(min);
+        }
     }
 }
 

From 4b06507172c9fb912d7d9f778dbe3aa116f5fb41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Mon, 1 Jan 2024 13:59:42 +0200
Subject: [PATCH 10/13] Cleanup

---
 ggml-quants.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index c2628badfdeb9..c0724e94767e2 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1325,16 +1325,16 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
     // If all the weight are positive we can invert the sign of min.
     // Otherwise blocks with all positive weights need to be quantized with zero
     // min, because min scale is unsigned.
-    int all_positive = 1;
+    bool all_positive = true;
     for (int j = 0; j < QK_K; j++) {
         if (x[j] < 0.0f) {
-            all_positive = 0;
+            all_positive = false;
             break;
         }
     }
 
-    float scales[QK_K];
-    float mins[QK_K];
+    float scales[QK_K/block_size];
+    float mins[QK_K/block_size];
 
     for (int j = 0; j < QK_K/block_size; j++) {
         uint8_t q[QK_K/block_size];
@@ -1343,7 +1343,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
         // Flip the sign because quantize_1 assumes that min is added, but min
         // is subtracted in k-quants.
         mins[j] = -mins[j];
-        if (!all_positive && mins[j] < 0) {
+        if ((!all_positive && mins[j] < 0) || (all_positive && mins[j] > 0)) {
             // All weights are positive in this block, but some blocks have
             // negative weights. Find new least squares scale with zero min.
             mins[j] = 0.0f;
@@ -1366,18 +1366,18 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
     // Increasing passes would decrease RMS error by miniscule amount with
     // drawback of taking more time.
     for(int pass = 0; pass < 2; pass++) {
-        float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale;
-        float inv_min   = max_min == 0.0f ? 0.0f : max_group/max_min;
+        float inv_scale  = max_scale == 0.0f ? 0.0f : max_group/max_scale;
+        float inv_min    = max_min == 0.0f ? 0.0f : max_group/max_min;
+        float block_d    = max_scale/max_group;
+        float block_dmin = max_min/max_group;
         for (int j = 0; j < QK_K/block_size; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            uint8_t best_lm = lm;
-            uint8_t best_ls = ls;
+            uint8_t ls = MAX(0, nearest_int(inv_scale*scales[j]));
+            uint8_t lm = MAX(0, nearest_int(inv_min*mins[j]));
             ls = MIN(max_group, ls);
             lm = MIN(max_group, lm);
+            uint8_t best_lm = lm;
+            uint8_t best_ls = ls;
             float best_rms = FLT_MAX;
-            const float d1 = max_scale / max_group;
-            const float dmin1 = max_min / max_group;
             int limit = 1;
             // Increase limit for minor RMS error decrease while increasing the
             // quantization run time.
@@ -1390,14 +1390,14 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
                 for (int lmt = MAX(0, lm-limit); lmt <= MIN(max_group, lm+limit); lmt++) {
                     float rms = 0.0f;
                     for (int ii = 0; ii < block_size; ii++) {
-                        const float d = d1 * lst;
-                        const float dm1 = dmin1 * lmt;
+                        const float d = block_d * lst;
+                        const float dm = block_dmin * lmt;
                         int l1 = 0;
                         if (d) {
-                            l1 = nearest_int((x[block_size*j + ii] + dm1)/d);
+                            l1 = nearest_int((x[block_size*j + ii] + dm)/d);
                             l1 = MAX(0, MIN((1 << bits) - 1, l1));
                         }
-                        const float e = (d*l1 - dm1) - x[block_size*j + ii];
+                        const float e = (d*l1 - dm) - x[block_size*j + ii];
                         rms += e*e;
                     }
                     if (rms < best_rms) {
@@ -1411,8 +1411,6 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
             block_mins[j] = best_lm;
         }
 
-        float block_d = max_scale/max_group;
-        float block_dmin = max_min/max_group;
         float q_fit[QK_K];
         float q_m[QK_K/block_size];
 

From 7e03872f1eba9fece7f740625048df49aa71f23e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Tue, 2 Jan 2024 15:44:45 +0200
Subject: [PATCH 11/13] Fix QKK_64

---
 ggml-quants.c | 88 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 55 insertions(+), 33 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index c0724e94767e2..91cd2f0d917ed 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -5,7 +5,6 @@
 #include <string.h>
 #include <assert.h>
 #include <float.h>
-#include <stdio.h>
 
 #ifdef __ARM_NEON
 
@@ -2041,6 +2040,29 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
             }
         }
 #else
+        float weights[32];
+        float mins[QK_K/32];
+        float scales[QK_K/32];
+        uint8_t Laux[32];
+        float max_scale = 0; // as we are deducting the min, scales are always positive
+        float max_min = 0;
+        for (int j = 0; j < QK_K/32; ++j) {
+            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
+            float scale = scales[j];
+            if (scale > max_scale) {
+                max_scale = scale;
+            }
+            float min = mins[j];
+            if (min > max_min) {
+                max_min = min;
+            }
+        }
+
         const float s_factor = 15.f;
         float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;
         float inv_min   = max_min   > 0 ? s_factor/max_min   : 0.f;
@@ -2156,43 +2178,43 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     for (int i = 0; i < nb; i++) {
 
 #if QK_K == 256
-    int block_scales[QK_K/32];
-    int block_mins[QK_K/32];
-    quantize_q_k_1(x, 5, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
-
-    for (int j = 0; j < QK_K/32; ++j) {
-        int ls = block_scales[j];
-        int lm = block_mins[j];
-        if (j < 4) {
-            y[i].scales[j] = ls;
-            y[i].scales[j+4] = lm;
-        } else {
-            y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
-            y[i].scales[j-4] |= ((ls >> 4) << 6);
-            y[i].scales[j-0] |= ((lm >> 4) << 6);
+        int block_scales[QK_K/32];
+        int block_mins[QK_K/32];
+        quantize_q_k_1(x, 5, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
+
+        for (int j = 0; j < QK_K/32; ++j) {
+            int ls = block_scales[j];
+            int lm = block_mins[j];
+            if (j < 4) {
+                y[i].scales[j] = ls;
+                y[i].scales[j+4] = lm;
+            } else {
+                y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4);
+                y[i].scales[j-4] |= ((ls >> 4) << 6);
+                y[i].scales[j-0] |= ((lm >> 4) << 6);
+            }
         }
-    }
 
-    uint8_t * restrict qh = y[i].qh;
-    uint8_t * restrict ql = y[i].qs;
-    memset(qh, 0, QK_K/8);
+        uint8_t * restrict qh = y[i].qh;
+        uint8_t * restrict ql = y[i].qs;
+        memset(qh, 0, QK_K/8);
 
-    uint8_t m1 = 1, m2 = 2;
-    for (int n = 0; n < QK_K; n += 64) {
-        for (int j = 0; j < 32; ++j) {
-            int l1 = L[n + j];
-            if (l1 > 15) {
-                l1 -= 16; qh[j] |= m1;
-            }
-            int l2 = L[n + j + 32];
-            if (l2 > 15) {
-                l2 -= 16; qh[j] |= m2;
+        uint8_t m1 = 1, m2 = 2;
+        for (int n = 0; n < QK_K; n += 64) {
+            for (int j = 0; j < 32; ++j) {
+                int l1 = L[n + j];
+                if (l1 > 15) {
+                    l1 -= 16; qh[j] |= m1;
+                }
+                int l2 = L[n + j + 32];
+                if (l2 > 15) {
+                    l2 -= 16; qh[j] |= m2;
+                }
+                ql[j] = l1 | (l2 << 4);
             }
-            ql[j] = l1 | (l2 << 4);
+            m1 <<= 2; m2 <<= 2;
+            ql += 32;
         }
-        m1 <<= 2; m2 <<= 2;
-        ql += 32;
-    }
 #else
         float max_scale = 0, amax = 0;
         for (int j = 0; j < QK_K/16; ++j) {

From 6683eb496061c4d7b9055c55f94ae6f5f944272f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Tue, 2 Jan 2024 20:07:34 +0200
Subject: [PATCH 12/13] Fix buffer size

---
 ggml-quants.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 91cd2f0d917ed..aea9bae0407ac 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1336,7 +1336,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
     float mins[QK_K/block_size];
 
     for (int j = 0; j < QK_K/block_size; j++) {
-        uint8_t q[QK_K/block_size];
+        uint8_t q[block_size];
         // First find least squares solution for min and scale for each block.
         quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]);
         // Flip the sign because quantize_1 assumes that min is added, but min

From 9c11f61ff1d2fe1ab1007fcbc315ffdf966a6e27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Tue, 2 Jan 2024 20:18:48 +0200
Subject: [PATCH 13/13] Simpler array sizes for msvc

---
 ggml-quants.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index aea9bae0407ac..a21815dad7d18 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1332,11 +1332,11 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
         }
     }
 
-    float scales[QK_K/block_size];
-    float mins[QK_K/block_size];
+    float scales[QK_K];
+    float mins[QK_K];
 
     for (int j = 0; j < QK_K/block_size; j++) {
-        uint8_t q[block_size];
+        uint8_t q[QK_K];
         // First find least squares solution for min and scale for each block.
         quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]);
         // Flip the sign because quantize_1 assumes that min is added, but min
@@ -1411,7 +1411,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_
         }
 
         float q_fit[QK_K];
-        float q_m[QK_K/block_size];
+        float q_m[QK_K];
 
         // Quantize elements and populate arrays needed for least squares fit.
         for (int j = 0; j < QK_K/block_size; ++j) {