From 433c4f505004afb0e60416f3c1514b2438f7c587 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Tue, 26 Dec 2023 17:37:46 +0200 Subject: [PATCH 01/13] Least squares quantization --- ggml-quants.c | 600 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 357 insertions(+), 243 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index a15a240487084..aa793ffc5624c 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef __ARM_NEON @@ -424,6 +425,138 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif +static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int qk, float *min, float *d) { + // Least squares fits `d * q + m = x` for d and m. + float qs = 0.0f; + float qs2 = 0.0f; + float xs = 0.0f; + float xq = 0.0f; + float minx = x[0]; + float maxx = x[0]; + for (int i = 0; i < qk; i++) { + float qf = q[i]; + qs += qf; + qs2 += qf*qf; + xs += x[i]; + xq += x[i]*qf; + if (x[i] < minx) minx = x[i]; + if (x[i] > maxx) maxx = x[i]; + } + float denom = qs*qs - qs2*qk; + if (minx == maxx) { + *min = x[0]; + *d = 0.0f; + } else if (denom == 0.0f) { + *min = 0.0f; + *d = 0.0f; + } else { + *min = (qs*xq - qs2*xs) / denom; + *d = (qs*xs - qk*xq) / denom; + } +} + +static float lstsq_q_0(const float * restrict q, const float * restrict x, int qk) { + // Least squares fits `d * q = x` for d. + float qs2 = 0.0f; + float xq = 0.0f; + for (int i = 0; i < qk; i++) { + qs2 += q[i]*q[i]; + xq += x[i]*q[i]; + } + if (qs2 == 0.0f) { + return 0.0f; + } + return xq / qs2; +} + +static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) { + // Least squares fits `d * q = x` for d. + float qs2 = 0.0f; + float xq = 0.0f; + for (int i = 0; i < qk; i++) { + qs2 += q[i]*q[i]; + xq += x[i]*q[i]; + } + if (qs2 == 0.0f) { + return 0.0f; + } + return xq / qs2; +} + +static void lstsq_q_k(const float * restrict q, const float * restrict x, const float * restrict s, int bs, float *min, float *d) { + // Least squares fits `d * q - s * m = x` for d and m. + float s2 = 0.0f; + float qs = 0.0f; + float q2 = 0.0f; + float sx = 0.0f; + float qx = 0.0f; + for (int i = 0; i < QK_K; i++) { + s2 += s[i/bs]*s[i/bs]; + qs += q[i]*s[i/bs]; + q2 += q[i]*q[i]; + sx += s[i/bs]*x[i]; + qx += q[i]*x[i]; + } + float denom = qs*qs - q2*s2; + if (s2 == 0.0f) { + // All s are zero. + *min = 0.0f; + *d = qx / q2; + } else if (denom == 0.0f) { + *min = 0.0f; + *d = 0.0f; + } else { + *min = (q2*sx - qs*qx) / denom; + *d = (qs*sx - qx*s2) / denom; + } +} + +static void quantize_1_0min(const float * restrict x, int n, int bits, uint8_t * restrict q, float * scale) { + // Least squares fits `d * q = x` for d with unsigned q. + float max = -FLT_MAX; + for (int l = 0; l < n; l++) { + const float v = fabsf(x[l]); + if (v > max) max = v; + } + + float d = max / ((1 << bits) - 1); + *scale = d; + const float id = d ? 1.0f/d : 0.0f; + + for (int l = 0; l < n; l++) { + const float x0 = x[l]*id; + + const uint8_t xi0 = MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f)); + + q[l] = xi0; + } + *scale = lstsq_q_0_u8(q, x, n); +} + +static void quantize_1(const float * restrict x, int n, int bits, uint8_t * restrict q, float * m, float * scale) { + float min = FLT_MAX; + float max = -FLT_MAX; + for (int l = 0; l < n; l++) { + const float v = x[l]; + + if (v < min) min = v; + if (v > max) max = v; + } + + float d = (max - min) / ((1 << bits) - 1); + const float id = d ? 1.0f/d : 0.0f; + + for (int l = 0; l < n; l++) { + const float x0 = (x[l] - min)*id; + + const uint8_t xi0 = MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f)); + + q[l] = xi0; + } + + lstsq_q_1(q, x, n, m, scale); +} + // reference implementation for deterministic creation of model files void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { static const int qk = QK4_0; @@ -474,32 +607,18 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; + uint8_t q[qk]; + float min; + float scale; + quantize_1(&x[i*qk], qk, 4, q, &min, &scale); - if (v < min) min = v; - if (v > max) max = v; + for (int j = 0; j < qk/2; ++j) { + y[i].qs[j] = q[j]; + y[i].qs[j] |= q[j + qk/2] << 4; } - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_FP32_TO_FP16(scale); y[i].m = GGML_FP32_TO_FP16(min); - - for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - - y[i].qs[j] = xi0; - y[i].qs[j] |= xi1 << 4; - } } } @@ -563,30 +682,16 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < qk; j++) { - const float v = x[i*qk + j]; - - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 5) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); + uint8_t q[qk]; + float min; + float scale; + quantize_1(&x[i*qk], qk, 5, q, &min, &scale); uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const float x0 = (x[i*qk + 0 + j] - min)*id; - const float x1 = (x[i*qk + qk/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + const uint8_t xi0 = q[j]; + const uint8_t xi1 = q[j + qk/2]; y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); @@ -595,6 +700,9 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); } + y[i].d = GGML_FP32_TO_FP16(scale); + y[i].m = GGML_FP32_TO_FP16(min); + memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); } } @@ -1318,130 +1426,6 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, - int ntry, float alpha) { - float min = x[0]; - float max = x[0]; - for (int i = 1; i < n; ++i) { - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - } - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = 0; - return 0.f; - } - if (min > 0) min = 0; - float iscale = nmax/(max - min); - float scale = 1/iscale; - for (int itry = 0; itry < ntry; ++itry) { - float sumlx = 0; int suml2 = 0; - bool did_change = false; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - if (l != L[i]) { - L[i] = l; - did_change = true; - } - sumlx += (x[i] - min)*l; - suml2 += l*l; - } - scale = sumlx/suml2; - float sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] - scale*L[i]; - } - min = alpha*min + (1 - alpha)*sum/n; - if (min > 0) min = 0; - iscale = 1/scale; - if (!did_change) break; - } - *the_min = -min; - return scale; -} - -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, - float rmin, float rdelta, int nstep, bool use_mad) { - float min = x[0]; - float max = x[0]; - float sum_w = weights[0]; - float sum_x = sum_w * x[0]; -#ifdef HAVE_BUGGY_APPLE_LINKER - // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 - for (volatile int i = 1; i < n; ++i) { -#else - for (int i = 1; i < n; ++i) { -#endif - if (x[i] < min) min = x[i]; - if (x[i] > max) max = x[i]; - float w = weights[i]; - sum_w += w; - sum_x += w * x[i]; - } - if (min > 0) min = 0; - if (max == min) { - for (int i = 0; i < n; ++i) L[i] = 0; - *the_min = -min; - return 0.f; - } - float iscale = nmax/(max - min); - float scale = 1/iscale; - float best_mad = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - L[i] = MAX(0, MIN(nmax, l)); - float diff = scale * L[i] + min - x[i]; - diff = use_mad ? fabsf(diff) : diff * diff; - float w = weights[i]; - best_mad += w * diff; - } - if (nstep < 1) { - *the_min = -min; - return scale; - } - for (int is = 0; is <= nstep; ++is) { - iscale = (rmin + rdelta*is + nmax)/(max - min); - float sum_l = 0, sum_l2 = 0, sum_xl = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale*(x[i] - min)); - l = MAX(0, MIN(nmax, l)); - Laux[i] = l; - float w = weights[i]; - sum_l += w*l; - sum_l2 += w*l*l; - sum_xl += w*l*x[i]; - } - float D = sum_w * sum_l2 - sum_l * sum_l; - if (D > 0) { - float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; - float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; - if (this_min > 0) { - this_min = 0; - this_scale = sum_xl / sum_l2; - } - float mad = 0; - for (int i = 0; i < n; ++i) { - float diff = this_scale * Laux[i] + this_min - x[i]; - diff = use_mad ? fabsf(diff) : diff * diff; - float w = weights[i]; - mad += w * diff; - } - if (mad < best_mad) { - for (int i = 0; i < n; ++i) { - L[i] = Laux[i]; - } - best_mad = mad; - scale = this_scale; - min = this_min; - } - } - } - *the_min = -min; - return scale; -} - #if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { @@ -1460,8 +1444,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; - uint8_t Laux[16]; - float weights[16]; float mins[QK_K/16]; float scales[QK_K/16]; @@ -1469,24 +1451,43 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict for (int i = 0; i < nb; i++) { float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; - for (int j = 0; j < QK_K/16; ++j) { - for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); - scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; + float max_min = FLT_MAX; + + int all_positive = 1; + for (int j = 0; j < QK_K; j++) { + if (x[j] < 0.0f) { + all_positive = 0; + max_min = -FLT_MAX; + break; } - float min = mins[j]; - if (min > max_min) { - max_min = min; + } + +redo_pos2: + + for (int j = 0; j < QK_K/16; j++) { + uint8_t q[QK_K/16]; + quantize_1(&x[16*j], 16, 2, q, &mins[j], &scales[j]); + mins[j] = -mins[j]; + if ((!all_positive) && (mins[j] < 0)) { + mins[j] = 0.0f; + quantize_1_0min(&x[16*j], 16, 2, q, &scales[j]); + } + if (scales[j] > max_scale) { + max_scale = scales[j]; + } + if (!all_positive && mins[j] > max_min) { + max_min = mins[j]; + } else if (all_positive && mins[j] < max_min) { + max_min = mins[j]; } } - if (max_scale > 0) { + int all_zero_lm = 1; + if (max_scale != 0) { float iscale = q4scale/max_scale; for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(iscale*scales[j]); + int l = MAX(0, MIN(63, nearest_int(iscale*scales[j]))); + if (l != 0) all_zero_lm = 0; y[i].scales[j] = l; } y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale); @@ -1494,27 +1495,52 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0; y[i].d = GGML_FP32_TO_FP16(0.f); } - if (max_min > 0) { + if (max_min != 0) { float iscale = q4scale/max_min; for (int j = 0; j < QK_K/16; ++j) { - int l = nearest_int(iscale*mins[j]); + int l = MAX(0, MIN(63, nearest_int(iscale*mins[j]))); y[i].scales[j] |= (l << 4); } y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale); } else { y[i].dmin = GGML_FP32_TO_FP16(0.f); } + + if (all_zero_lm && !all_positive) { + all_positive = 1; + //printf("**********red_pos2\n"); + goto redo_pos2; + } else if (all_zero_lm) { + //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min); + } + + + float q_fit[QK_K]; + float q_m[QK_K/16]; + for (int j = 0; j < QK_K/16; ++j) { - const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF); - if (!d) continue; - const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4); + uint8_t sc = y[i].scales[j] & 0xF; + uint8_t m = y[i].scales[j] >> 4; + const float d = GGML_FP16_TO_FP32(y[i].d) * sc; + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; + q_m[j] = (y[i].scales[j] >> 4); for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int((x[16*j + ii] + dm)/d); - l = MAX(0, MIN(3, l)); + int l = 0; + if (d) { + l = nearest_int((x[16*j + ii] + dm)/d); + l = MAX(0, MIN(3, l)); + } L[16*j + ii] = l; + q_fit[16*j + ii] = sc * l; } } + float min; + float d; + lstsq_q_k(q_fit, x, q_m, 16, &min, &d); + y[i].d = GGML_FP32_TO_FP16(d); + y[i].dmin = GGML_FP32_TO_FP16(min); + #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -1634,17 +1660,19 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } int8_t sc; + float q_fit[QK_K]; for (int j = 0; j < QK_K/16; ++j) { sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; float d = GGML_FP16_TO_FP32(y[i].d) * sc; - if (!d) { - continue; - } for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); + int l = 0; + if (d) { + l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + } L[16*j + ii] = l + 4; + q_fit[16*j + ii] = l * sc; } } #else @@ -1664,20 +1692,24 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } y[i].d = GGML_FP32_TO_FP16(0.f); } + float q_fit[QK_K]; for (int j = 0; j < QK_K/16; ++j) { int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8); - if (!d) { - continue; - } for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); + int l = 0; + if (d) { + l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); + } L[16*j + ii] = l + 4; + q_fit[16*j + ii] = l * (s - 8); } } #endif + y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K)); + memset(y[i].hmask, 0, QK_K/8); // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. int m = 0; @@ -1812,40 +1844,55 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; - uint8_t Laux[32]; - float weights[32]; float mins[QK_K/32]; float scales[QK_K/32]; for (int i = 0; i < nb; i++) { - float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; - for (int j = 0; j < QK_K/32; ++j) { - //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - float sum_x2 = 0; - for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; - float av_x = sqrtf(sum_x2/32); - for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); - scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; + + int all_positive = 1; + for (int j = 0; j < QK_K; j++) { + if (x[j] < 0.0f) { + all_positive = 0; + break; + } + } + +redo_pos4: + + for (int j = 0; j < QK_K/32; j++) { + uint8_t q[QK_K/32]; + quantize_1(&x[32*j], 32, 4, q, &mins[j], &scales[j]); + mins[j] = -mins[j]; + if ((!all_positive) && (mins[j] < 0)) { + mins[j] = 0.0f; + quantize_1_0min(&x[32*j], 32, 4, q, &scales[j]); } - float min = mins[j]; - if (min > max_min) { - max_min = min; + + if (j == 0 || scales[j] > max_scale) { + max_scale = scales[j]; + } + if (j == 0) { + max_min = mins[j]; + } + if (!all_positive && mins[j] > max_min) { + max_min = mins[j]; + } else if (all_positive && mins[j] < max_min) { + max_min = mins[j]; } } #if QK_K == 256 - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + float inv_scale = max_scale == 0.0f ? 0.0f : 63.f/max_scale; + float inv_min = max_min == 0.0f ? 0.0f : 63.f/max_min; + int all_zero_lm = 1; for (int j = 0; j < QK_K/32; ++j) { uint8_t ls = nearest_int(inv_scale*scales[j]); uint8_t lm = nearest_int(inv_min*mins[j]); ls = MIN(63, ls); lm = MIN(63, lm); + if (lm != 0) all_zero_lm = 0; if (j < 4) { y[i].scales[j] = ls; y[i].scales[j+4] = lm; @@ -1855,21 +1902,43 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y[i].scales[j-0] |= ((lm >> 4) << 6); } } + + if (all_zero_lm && !all_positive) { + all_positive = 1; + //printf("**********red_pos4\n"); + goto redo_pos4; + } else if (all_zero_lm) { + //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min); + } + y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); + float q_fit[QK_K]; + float q_m[QK_K/32]; uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { get_scale_min_k4(j, y[i].scales, &sc, &m); const float d = GGML_FP16_TO_FP32(y[i].d) * sc; - if (!d) continue; const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; + q_m[j] = m; for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(15, l)); + int l = 0; + if (d) { + l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(15, l)); + } L[32*j + ii] = l; + q_fit[32*j + ii] = sc * l; } } + + float min; + float d; + lstsq_q_k(q_fit, x, q_m, 32, &min, &d); + y[i].d = GGML_FP32_TO_FP16(d); + y[i].dmin = GGML_FP32_TO_FP16(min); + #else const float s_factor = 15.f; float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; @@ -1980,8 +2049,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict uint8_t L[QK_K]; float mins[QK_K/32]; float scales[QK_K/32]; - float weights[32]; - uint8_t Laux[32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -1993,30 +2060,50 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict float max_scale = 0; // as we are deducting the min, scales are always positive float max_min = 0; - for (int j = 0; j < QK_K/32; ++j) { - //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); - float sum_x2 = 0; - for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; - float av_x = sqrtf(sum_x2/32); - for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); - scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false); - float scale = scales[j]; - if (scale > max_scale) { - max_scale = scale; + + int all_positive = 1; + for (int j = 0; j < QK_K; j++) { + if (x[j] < 0.0f) { + all_positive = 0; + break; + } + } + +redo_pos: + + for (int j = 0; j < QK_K/32; j++) { + uint8_t q[QK_K/32]; + quantize_1(&x[32*j], 32, 5, q, &mins[j], &scales[j]); + mins[j] = -mins[j]; + if ((!all_positive) && (mins[j] < 0)) { + mins[j] = 0.0f; + quantize_1_0min(&x[32*j], 32, 5, q, &scales[j]); + } + + if (j == 0 || scales[j] > max_scale) { + max_scale = scales[j]; } - float min = mins[j]; - if (min > max_min) { - max_min = min; + + if (j == 0) { + max_min = mins[j]; + } + if (!all_positive && mins[j] > max_min) { + max_min = mins[j]; + } else if (all_positive && mins[j] < max_min) { + max_min = mins[j]; } } - float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f; - float inv_min = max_min > 0 ? 63.f/max_min : 0.f; + + float inv_scale = max_scale == 0.0f ? 0.f : 63.f/max_scale; + float inv_min = max_min == 0.0f ? 0.f : 63.f/max_min; + int all_zero_lm = 1; for (int j = 0; j < QK_K/32; ++j) { uint8_t ls = nearest_int(inv_scale*scales[j]); uint8_t lm = nearest_int(inv_min*mins[j]); ls = MIN(63, ls); lm = MIN(63, lm); + if (lm != 0) all_zero_lm = 0; if (j < 4) { y[i].scales[j] = ls; y[i].scales[j+4] = lm; @@ -2026,22 +2113,45 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y[i].scales[j-0] |= ((lm >> 4) << 6); } } + + if (all_positive) { + //printf("all_pos: %f %f %d\n", max_scale, max_min, all_zero_lm); + } + + if (all_zero_lm && !all_positive) { + all_positive = 1; + //printf("**********red_pos\n"); + goto redo_pos; + } + y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); + float q_fit[QK_K]; + float q_m[QK_K/32]; uint8_t sc, m; for (int j = 0; j < QK_K/32; ++j) { get_scale_min_k4(j, y[i].scales, &sc, &m); const float d = GGML_FP16_TO_FP32(y[i].d) * sc; - if (!d) continue; const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; + q_m[j] = m; for (int ii = 0; ii < 32; ++ii) { - int l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(31, l)); + int l = 0; + if (d) { + l = nearest_int((x[32*j + ii] + dm)/d); + l = MAX(0, MIN(31, l)); + } L[32*j + ii] = l; + q_fit[32*j + ii] = sc * l; } } + float min; + float d; + lstsq_q_k(q_fit, x, q_m, 32, &min, &d); + y[i].d = GGML_FP32_TO_FP16(d); + y[i].dmin = GGML_FP32_TO_FP16(min); + uint8_t * restrict qh = y[i].qh; uint8_t * restrict ql = y[i].qs; memset(qh, 0, QK_K/8); @@ -2216,18 +2326,22 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); } + float q_fit[QK_K]; for (int j = 0; j < QK_K/16; ++j) { float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; - if (!d) { - continue; - } for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-32, MIN(31, l)); + int l = 0; + if (d) { + l = nearest_int(x[16*j + ii]/d); + l = MAX(-32, MIN(31, l)); + } L[16*j + ii] = l + 32; + q_fit[16*j + ii] = l * y[i].scales[j]; } } + y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K)); + uint8_t * restrict ql = y[i].ql; uint8_t * restrict qh = y[i].qh; #if QK_K == 256 From 0b6207ef61f529b3e5e0340c6e07b33b370c4044 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Wed, 27 Dec 2023 07:11:02 +0200 Subject: [PATCH 02/13] Better k quantization --- ggml-quants.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ggml-quants.c b/ggml-quants.c index aa793ffc5624c..06481048f5ce5 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1892,6 +1892,35 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict uint8_t lm = nearest_int(inv_min*mins[j]); ls = MIN(63, ls); lm = MIN(63, lm); + float rms = 0.0f; + float rms2 = 0.0f; + float rms3 = 0.0f; + uint8_t lm2 = MIN(63, MAX(0, lm - 1)); + uint8_t lm3 = MIN(63, MAX(0, lm + 1)); + const float d1 = max_scale / 63.0f; + const float dmin1 = max_min / 63.0f; + for (int ii = 0; ii < 32; ii++) { + const float d = d1 * ls; + const float dm1 = dmin1 * lm; + const float dm2 = dmin1 * lm2; + const float dm3 = dmin1 * lm3; + if (!d) continue; + int l1 = nearest_int((x[32*j + ii] + dm1)/d); + l1 = MAX(0, MIN(15, l1)); + int l2 = nearest_int((x[32*j + ii] + dm2)/d); + l2 = MAX(0, MIN(15, l2)); + int l3 = nearest_int((x[32*j + ii] + dm3)/d); + l3 = MAX(0, MIN(15, l3)); + rms += ((d*l1 - dm1) - x[32*j + ii]) * ((d*l1 - dm1) - x[32*j + ii]); + rms2 += ((d*l2 - dm2) - x[32*j + ii]) * ((d*l2 - dm2) - x[32*j + ii]); + rms3 += ((d*l3 - dm3) - x[32*j + ii]) * ((d*l3 - dm3) - x[32*j + ii]); + } + if (rms2 < rms && rms2 < rms3) { + lm = lm2; + } + if (rms3 < rms && rms3 < rms2) { + lm = lm3; + } if (lm != 0) all_zero_lm = 0; if (j < 4) { y[i].scales[j] = ls; From d1af0a3a945fd8300718ddacea786508376831e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Wed, 27 Dec 2023 09:44:38 +0200 Subject: [PATCH 03/13] Quantization loop --- ggml-quants.c | 109 +++++++++++++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 06481048f5ce5..0c3b0d42df6d6 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -498,7 +498,7 @@ static void lstsq_q_k(const float * restrict q, const float * restrict x, const qx += q[i]*x[i]; } float denom = qs*qs - q2*s2; - if (s2 == 0.0f) { + if (s2 == 0.0f && q2 != 0.0f) { // All s are zero. *min = 0.0f; *d = qx / q2; @@ -1859,8 +1859,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -redo_pos4: - for (int j = 0; j < QK_K/32; j++) { uint8_t q[QK_K/32]; quantize_1(&x[32*j], 32, 4, q, &mins[j], &scales[j]); @@ -1883,45 +1881,57 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } + int ql_loop = 0; +quant_loop: ; #if QK_K == 256 float inv_scale = max_scale == 0.0f ? 0.0f : 63.f/max_scale; float inv_min = max_min == 0.0f ? 0.0f : 63.f/max_min; - int all_zero_lm = 1; for (int j = 0; j < QK_K/32; ++j) { uint8_t ls = nearest_int(inv_scale*scales[j]); uint8_t lm = nearest_int(inv_min*mins[j]); + uint8_t best_lm = lm; + uint8_t best_ls = ls; ls = MIN(63, ls); lm = MIN(63, lm); - float rms = 0.0f; - float rms2 = 0.0f; - float rms3 = 0.0f; - uint8_t lm2 = MIN(63, MAX(0, lm - 1)); - uint8_t lm3 = MIN(63, MAX(0, lm + 1)); + float best_rms = FLT_MAX; const float d1 = max_scale / 63.0f; const float dmin1 = max_min / 63.0f; - for (int ii = 0; ii < 32; ii++) { - const float d = d1 * ls; - const float dm1 = dmin1 * lm; - const float dm2 = dmin1 * lm2; - const float dm3 = dmin1 * lm3; - if (!d) continue; - int l1 = nearest_int((x[32*j + ii] + dm1)/d); - l1 = MAX(0, MIN(15, l1)); - int l2 = nearest_int((x[32*j + ii] + dm2)/d); - l2 = MAX(0, MIN(15, l2)); - int l3 = nearest_int((x[32*j + ii] + dm3)/d); - l3 = MAX(0, MIN(15, l3)); - rms += ((d*l1 - dm1) - x[32*j + ii]) * ((d*l1 - dm1) - x[32*j + ii]); - rms2 += ((d*l2 - dm2) - x[32*j + ii]) * ((d*l2 - dm2) - x[32*j + ii]); - rms3 += ((d*l3 - dm3) - x[32*j + ii]) * ((d*l3 - dm3) - x[32*j + ii]); - } - if (rms2 < rms && rms2 < rms3) { - lm = lm2; - } - if (rms3 < rms && rms3 < rms2) { - lm = lm3; + int limit = 1; + if (ql_loop) limit = 4; + for (int lst = MAX(0, ls-limit); lst <= MIN(63, ls+limit); lst++) { + for (int lmt = MAX(0, lm-limit); lmt <= MIN(63, lm+limit); lmt++) { + float rms = 0.0f; + for (int ii = 0; ii < 32; ii++) { + const float d = d1 * lst; + const float dm1 = dmin1 * lmt; + int l1 = 0; + if (d) { + l1 = nearest_int((x[32*j + ii] + dm1)/d); + l1 = MAX(0, MIN(15, l1)); + } + float e = ((d*l1 - dm1) - x[32*j + ii]); + rms += e*e; + } + if (rms < best_rms) { + best_lm = lmt; + best_ls = lst; + best_rms = rms; + } + } } - if (lm != 0) all_zero_lm = 0; + //if (lm != best_lm) { + // printf("best %d, orig %d\n", best_lm, lm); + //} + lm = best_lm; + ls = best_ls; + //if (rms2 < rms && rms2 < rms3) { + // printf("rms2 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3); + // lm = lm2; + //} + //if (rms3 < rms && rms3 < rms2) { + // printf("rms3 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3); + // lm = lm3; + //} if (j < 4) { y[i].scales[j] = ls; y[i].scales[j+4] = lm; @@ -1932,13 +1942,14 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } - if (all_zero_lm && !all_positive) { - all_positive = 1; - //printf("**********red_pos4\n"); - goto redo_pos4; - } else if (all_zero_lm) { - //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min); - } + //if (all_zero_lm && !all_positive && !ql_loop) { + // all_positive = 1; + // //printf("**********red_pos4\n"); + // goto redo_pos4; + //} + //} else if (all_zero_lm) { + // //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min); + //} y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); @@ -1962,12 +1973,26 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } - float min; - float d; - lstsq_q_k(q_fit, x, q_m, 32, &min, &d); - y[i].d = GGML_FP32_TO_FP16(d); + //printf("%d orig: %f %f, ", ql_loop, max_min, max_scale); + float min, scale; + lstsq_q_k(q_fit, x, q_m, 32, &min, &scale); + if (min != min) { + printf("min nan\n"); + } + if (scale != scale) { + printf("scale nan\n"); + } + //printf("fit: %f %f\n", max_min, max_scale); + y[i].d = GGML_FP32_TO_FP16(scale); y[i].dmin = GGML_FP32_TO_FP16(min); + //printf("%f %f, %f %f\n", max_min, min * 63.0f, max_scale, scale * 63.0f); + max_scale = GGML_FP16_TO_FP32(y[i].d) * 63.0f; + max_min = GGML_FP16_TO_FP32(y[i].dmin) * 63.0f; + ql_loop++; + if (ql_loop == 1) { + goto quant_loop; + } #else const float s_factor = 15.f; float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; From d6313d83857c3e481a70c5bd7b371c284d19ce34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Wed, 27 Dec 2023 12:57:43 +0200 Subject: [PATCH 04/13] Refactor --- ggml-quants.c | 271 +++++++++++++++++++++++++------------------------- 1 file changed, 135 insertions(+), 136 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 0c3b0d42df6d6..a6b6c6a7bbd8b 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1303,6 +1303,135 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } +static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_size, int * block_scales, int * block_mins, uint8_t * L, ggml_fp16_t * block_scale, ggml_fp16_t * block_min) { + float max_scale = 0.0f; + float max_min = 0.0f; + + // If all the weight are positive we can invert the sign of min. + // Otherwise blocks with all positive weights need to be quantized with zero + // min, because min scale is unsigned. + int all_positive = 1; + for (int j = 0; j < QK_K; j++) { + if (x[j] < 0.0f) { + all_positive = 0; + break; + } + } + + float scales[QK_K]; + float mins[QK_K]; + + for (int j = 0; j < QK_K/block_size; j++) { + uint8_t q[QK_K/block_size]; + // First find least squares solution for min and scale for each block. + quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]); + // Flip the sign because quantize_1 assumes that min is added, but min + // is subtracted in k-quants. + mins[j] = -mins[j]; + if (!all_positive && mins[j] < 0) { + // All weights are positive in this block, but some blocks have + // negative weights. Find new least squares scale with zero min. + mins[j] = 0.0f; + quantize_1_0min(&x[block_size*j], block_size, bits, q, &scales[j]); + } + if (scales[j] > max_scale) { + max_scale = scales[j]; + } + if (j == 0) { + max_min = mins[j]; + } else if (!all_positive && mins[j] > max_min) { + max_min = mins[j]; + } else if (all_positive && mins[j] < max_min) { + max_min = mins[j]; + } + } + + int max_group = (1 << scale_bits) - 1; + + // Increasing passes would decrease RMS error by miniscule amount with + // drawback of taking a lot more time. + for(int pass = 0; pass < 2; pass++) { + float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale; + float inv_min = max_min == 0.0f ? 0.0f : max_group/max_min; + for (int j = 0; j < QK_K/block_size; ++j) { + uint8_t ls = nearest_int(inv_scale*scales[j]); + uint8_t lm = nearest_int(inv_min*mins[j]); + uint8_t best_lm = lm; + uint8_t best_ls = ls; + ls = MIN(max_group, ls); + lm = MIN(max_group, lm); + float best_rms = FLT_MAX; + const float d1 = max_scale / max_group; + const float dmin1 = max_min / max_group; + int limit = 1; + // Increase limit for minor RMS error decrease while increasing the + // quantization run time. + if (pass > 0) limit = 8; + // Due to quantization the best ls and lm might not be the nearest + // to the ones obtained by the round to nearest. + // Loop through few nearby choices and choose lm and ls that + // minimize RMS error. + for (int lst = MAX(0, ls-limit); lst <= MIN(max_group, ls+limit); lst++) { + for (int lmt = MAX(0, lm-limit); lmt <= MIN(max_group, lm+limit); lmt++) { + float rms = 0.0f; + for (int ii = 0; ii < block_size; ii++) { + const float d = d1 * lst; + const float dm1 = dmin1 * lmt; + int l1 = 0; + if (d) { + l1 = nearest_int((x[block_size*j + ii] + dm1)/d); + l1 = MAX(0, MIN((1 << bits) - 1, l1)); + } + const float e = (d*l1 - dm1) - x[block_size*j + ii]; + rms += e*e; + } + if (rms < best_rms) { + best_lm = lmt; + best_ls = lst; + best_rms = rms; + } + } + } + block_scales[j] = best_ls; + block_mins[j] = best_lm; + } + + float block_d = max_scale/max_group; + float block_dmin = max_min/max_group; + float q_fit[QK_K]; + float q_m[QK_K]; + + // Quantize elements and populate arrays needed for least squares fit. + for (int j = 0; j < QK_K/block_size; ++j) { + const float d = block_d * block_scales[j]; + const float dm = block_dmin * block_mins[j]; + q_m[j] = block_mins[j]; + for (int ii = 0; ii < block_size; ++ii) { + int l = 0; + if (d) { + l = nearest_int((x[block_size*j + ii] + dm)/d); + l = MAX(0, MIN((1 << bits) - 1, l)); + } + L[block_size*j + ii] = l; + q_fit[block_size*j + ii] = block_scales[j] * l; + } + } + + // Least squares fit min and scale. + float min, scale; + lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale); + // Check for nans. + assert(min == min); + assert(scale == scale); + // Quantize to fp16 for the next pass. + max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group; + max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group; + + *block_scale = GGML_FP32_TO_FP16(scale); + *block_min = GGML_FP32_TO_FP16(min); + } +} + static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { float max = 0; float amax = 0; @@ -1844,94 +1973,16 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; - float mins[QK_K/32]; - float scales[QK_K/32]; for (int i = 0; i < nb; i++) { - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; - - int all_positive = 1; - for (int j = 0; j < QK_K; j++) { - if (x[j] < 0.0f) { - all_positive = 0; - break; - } - } - - for (int j = 0; j < QK_K/32; j++) { - uint8_t q[QK_K/32]; - quantize_1(&x[32*j], 32, 4, q, &mins[j], &scales[j]); - mins[j] = -mins[j]; - if ((!all_positive) && (mins[j] < 0)) { - mins[j] = 0.0f; - quantize_1_0min(&x[32*j], 32, 4, q, &scales[j]); - } - - if (j == 0 || scales[j] > max_scale) { - max_scale = scales[j]; - } - if (j == 0) { - max_min = mins[j]; - } - if (!all_positive && mins[j] > max_min) { - max_min = mins[j]; - } else if (all_positive && mins[j] < max_min) { - max_min = mins[j]; - } - } - - int ql_loop = 0; -quant_loop: ; #if QK_K == 256 - float inv_scale = max_scale == 0.0f ? 0.0f : 63.f/max_scale; - float inv_min = max_min == 0.0f ? 0.0f : 63.f/max_min; + int block_scales[QK_K/32]; + int block_mins[QK_K/32]; + quantize_q_k_1(x, 4, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin); + for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - uint8_t best_lm = lm; - uint8_t best_ls = ls; - ls = MIN(63, ls); - lm = MIN(63, lm); - float best_rms = FLT_MAX; - const float d1 = max_scale / 63.0f; - const float dmin1 = max_min / 63.0f; - int limit = 1; - if (ql_loop) limit = 4; - for (int lst = MAX(0, ls-limit); lst <= MIN(63, ls+limit); lst++) { - for (int lmt = MAX(0, lm-limit); lmt <= MIN(63, lm+limit); lmt++) { - float rms = 0.0f; - for (int ii = 0; ii < 32; ii++) { - const float d = d1 * lst; - const float dm1 = dmin1 * lmt; - int l1 = 0; - if (d) { - l1 = nearest_int((x[32*j + ii] + dm1)/d); - l1 = MAX(0, MIN(15, l1)); - } - float e = ((d*l1 - dm1) - x[32*j + ii]); - rms += e*e; - } - if (rms < best_rms) { - best_lm = lmt; - best_ls = lst; - best_rms = rms; - } - } - } - //if (lm != best_lm) { - // printf("best %d, orig %d\n", best_lm, lm); - //} - lm = best_lm; - ls = best_ls; - //if (rms2 < rms && rms2 < rms3) { - // printf("rms2 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3); - // lm = lm2; - //} - //if (rms3 < rms && rms3 < rms2) { - // printf("rms3 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3); - // lm = lm3; - //} + int ls = block_scales[j]; + int lm = block_mins[j]; if (j < 4) { y[i].scales[j] = ls; y[i].scales[j+4] = lm; @@ -1941,58 +1992,6 @@ quant_loop: ; y[i].scales[j-0] |= ((lm >> 4) << 6); } } - - //if (all_zero_lm && !all_positive && !ql_loop) { - // all_positive = 1; - // //printf("**********red_pos4\n"); - // goto redo_pos4; - //} - //} else if (all_zero_lm) { - // //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min); - //} - - y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); - y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); - float q_fit[QK_K]; - float q_m[QK_K/32]; - - uint8_t sc, m; - for (int j = 0; j < QK_K/32; ++j) { - get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = GGML_FP16_TO_FP32(y[i].d) * sc; - const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; - q_m[j] = m; - for (int ii = 0; ii < 32; ++ii) { - int l = 0; - if (d) { - l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(15, l)); - } - L[32*j + ii] = l; - q_fit[32*j + ii] = sc * l; - } - } - - //printf("%d orig: %f %f, ", ql_loop, max_min, max_scale); - float min, scale; - lstsq_q_k(q_fit, x, q_m, 32, &min, &scale); - if (min != min) { - printf("min nan\n"); - } - if (scale != scale) { - printf("scale nan\n"); - } - //printf("fit: %f %f\n", max_min, max_scale); - y[i].d = GGML_FP32_TO_FP16(scale); - y[i].dmin = GGML_FP32_TO_FP16(min); - //printf("%f %f, %f %f\n", max_min, min * 63.0f, max_scale, scale * 63.0f); - max_scale = GGML_FP16_TO_FP32(y[i].d) * 63.0f; - max_min = GGML_FP16_TO_FP32(y[i].dmin) * 63.0f; - - ql_loop++; - if (ql_loop == 1) { - goto quant_loop; - } #else const float s_factor = 15.f; float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; From f478136773f69a9a229eb8c8685f7c6e6cfe2732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Wed, 27 Dec 2023 15:13:23 +0200 Subject: [PATCH 05/13] Fix other than 4 quants --- ggml-quants.c | 239 ++++++++------------------------------------------ 1 file changed, 37 insertions(+), 202 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index a6b6c6a7bbd8b..9c1e3e0e424b4 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1349,7 +1349,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ int max_group = (1 << scale_bits) - 1; // Increasing passes would decrease RMS error by miniscule amount with - // drawback of taking a lot more time. + // drawback of taking more time. for(int pass = 0; pass < 2; pass++) { float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale; float inv_min = max_min == 0.0f ? 0.0f : max_group/max_min; @@ -1399,7 +1399,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ float block_d = max_scale/max_group; float block_dmin = max_min/max_group; float q_fit[QK_K]; - float q_m[QK_K]; + float q_m[QK_K/block_size]; // Quantize elements and populate arrays needed for least squares fit. for (int j = 0; j < QK_K/block_size; ++j) { @@ -1573,103 +1573,16 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; - float mins[QK_K/16]; - float scales[QK_K/16]; - - const float q4scale = 15.f; for (int i = 0; i < nb; i++) { - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = FLT_MAX; - - int all_positive = 1; - for (int j = 0; j < QK_K; j++) { - if (x[j] < 0.0f) { - all_positive = 0; - max_min = -FLT_MAX; - break; - } - } - -redo_pos2: - - for (int j = 0; j < QK_K/16; j++) { - uint8_t q[QK_K/16]; - quantize_1(&x[16*j], 16, 2, q, &mins[j], &scales[j]); - mins[j] = -mins[j]; - if ((!all_positive) && (mins[j] < 0)) { - mins[j] = 0.0f; - quantize_1_0min(&x[16*j], 16, 2, q, &scales[j]); - } - if (scales[j] > max_scale) { - max_scale = scales[j]; - } - if (!all_positive && mins[j] > max_min) { - max_min = mins[j]; - } else if (all_positive && mins[j] < max_min) { - max_min = mins[j]; - } - } - - int all_zero_lm = 1; - if (max_scale != 0) { - float iscale = q4scale/max_scale; - for (int j = 0; j < QK_K/16; ++j) { - int l = MAX(0, MIN(63, nearest_int(iscale*scales[j]))); - if (l != 0) all_zero_lm = 0; - y[i].scales[j] = l; - } - y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale); - } else { - for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0; - y[i].d = GGML_FP32_TO_FP16(0.f); - } - if (max_min != 0) { - float iscale = q4scale/max_min; - for (int j = 0; j < QK_K/16; ++j) { - int l = MAX(0, MIN(63, nearest_int(iscale*mins[j]))); - y[i].scales[j] |= (l << 4); - } - y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale); - } else { - y[i].dmin = GGML_FP32_TO_FP16(0.f); - } - - if (all_zero_lm && !all_positive) { - all_positive = 1; - //printf("**********red_pos2\n"); - goto redo_pos2; - } else if (all_zero_lm) { - //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min); - } - - - float q_fit[QK_K]; - float q_m[QK_K/16]; + int block_scales[QK_K/16]; + int block_mins[QK_K/16]; + quantize_q_k_1(x, 2, 4, 16, block_scales, block_mins, L, &y[i].d, &y[i].dmin); for (int j = 0; j < QK_K/16; ++j) { - uint8_t sc = y[i].scales[j] & 0xF; - uint8_t m = y[i].scales[j] >> 4; - const float d = GGML_FP16_TO_FP32(y[i].d) * sc; - const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; - q_m[j] = (y[i].scales[j] >> 4); - for (int ii = 0; ii < 16; ++ii) { - int l = 0; - if (d) { - l = nearest_int((x[16*j + ii] + dm)/d); - l = MAX(0, MIN(3, l)); - } - L[16*j + ii] = l; - q_fit[16*j + ii] = sc * l; - } + y[i].scales[j] = (block_mins[j] << 4) | (block_scales[j]); } - float min; - float d; - lstsq_q_k(q_fit, x, q_m, 16, &min, &d); - y[i].d = GGML_FP32_TO_FP16(d); - y[i].dmin = GGML_FP32_TO_FP16(min); - #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -2110,121 +2023,43 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict for (int i = 0; i < nb; i++) { #if QK_K == 256 - - float max_scale = 0; // as we are deducting the min, scales are always positive - float max_min = 0; - - int all_positive = 1; - for (int j = 0; j < QK_K; j++) { - if (x[j] < 0.0f) { - all_positive = 0; - break; - } - } - -redo_pos: - - for (int j = 0; j < QK_K/32; j++) { - uint8_t q[QK_K/32]; - quantize_1(&x[32*j], 32, 5, q, &mins[j], &scales[j]); - mins[j] = -mins[j]; - if ((!all_positive) && (mins[j] < 0)) { - mins[j] = 0.0f; - quantize_1_0min(&x[32*j], 32, 5, q, &scales[j]); - } - - if (j == 0 || scales[j] > max_scale) { - max_scale = scales[j]; - } - - if (j == 0) { - max_min = mins[j]; - } - if (!all_positive && mins[j] > max_min) { - max_min = mins[j]; - } else if (all_positive && mins[j] < max_min) { - max_min = mins[j]; - } - } - - - float inv_scale = max_scale == 0.0f ? 0.f : 63.f/max_scale; - float inv_min = max_min == 0.0f ? 0.f : 63.f/max_min; - int all_zero_lm = 1; - for (int j = 0; j < QK_K/32; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - ls = MIN(63, ls); - lm = MIN(63, lm); - if (lm != 0) all_zero_lm = 0; - if (j < 4) { - y[i].scales[j] = ls; - y[i].scales[j+4] = lm; - } else { - y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); - y[i].scales[j-4] |= ((ls >> 4) << 6); - y[i].scales[j-0] |= ((lm >> 4) << 6); - } - } - - if (all_positive) { - //printf("all_pos: %f %f %d\n", max_scale, max_min, all_zero_lm); - } - - if (all_zero_lm && !all_positive) { - all_positive = 1; - //printf("**********red_pos\n"); - goto redo_pos; + int block_scales[QK_K/32]; + int block_mins[QK_K/32]; + quantize_q_k_1(x, 5, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin); + + for (int j = 0; j < QK_K/32; ++j) { + int ls = block_scales[j]; + int lm = block_mins[j]; + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); } + } - y[i].d = GGML_FP32_TO_FP16(max_scale/63.f); - y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f); - float q_fit[QK_K]; - float q_m[QK_K/32]; + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); - uint8_t sc, m; - for (int j = 0; j < QK_K/32; ++j) { - get_scale_min_k4(j, y[i].scales, &sc, &m); - const float d = GGML_FP16_TO_FP32(y[i].d) * sc; - const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m; - q_m[j] = m; - for (int ii = 0; ii < 32; ++ii) { - int l = 0; - if (d) { - l = nearest_int((x[32*j + ii] + dm)/d); - l = MAX(0, MIN(31, l)); - } - L[32*j + ii] = l; - q_fit[32*j + ii] = sc * l; + uint8_t m1 = 1, m2 = 2; + for (int n = 0; n < QK_K; n += 64) { + for (int j = 0; j < 32; ++j) { + int l1 = L[n + j]; + if (l1 > 15) { + l1 -= 16; qh[j] |= m1; } - } - - float min; - float d; - lstsq_q_k(q_fit, x, q_m, 32, &min, &d); - y[i].d = GGML_FP32_TO_FP16(d); - y[i].dmin = GGML_FP32_TO_FP16(min); - - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; - memset(qh, 0, QK_K/8); - - uint8_t m1 = 1, m2 = 2; - for (int n = 0; n < QK_K; n += 64) { - for (int j = 0; j < 32; ++j) { - int l1 = L[n + j]; - if (l1 > 15) { - l1 -= 16; qh[j] |= m1; - } - int l2 = L[n + j + 32]; - if (l2 > 15) { - l2 -= 16; qh[j] |= m2; - } - ql[j] = l1 | (l2 << 4); + int l2 = L[n + j + 32]; + if (l2 > 15) { + l2 -= 16; qh[j] |= m2; } - m1 <<= 2; m2 <<= 2; - ql += 32; + ql[j] = l1 | (l2 << 4); } + m1 <<= 2; m2 <<= 2; + ql += 32; + } #else float max_scale = 0, amax = 0; for (int j = 0; j < QK_K/16; ++j) { From 3fafecae9eb642ab872ea95c63b53e0a240d9223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Thu, 28 Dec 2023 15:55:12 +0200 Subject: [PATCH 06/13] Weighted least squares --- ggml-quants.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 9c1e3e0e424b4..a2054acd99733 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -433,16 +433,19 @@ static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int float xq = 0.0f; float minx = x[0]; float maxx = x[0]; + float q1 = 0.0f; for (int i = 0; i < qk; i++) { float qf = q[i]; - qs += qf; - qs2 += qf*qf; - xs += x[i]; - xq += x[i]*qf; + float w = fabsf(x[i]); + q1 += w; + qs += w*qf; + qs2 += w*qf*qf; + xs += w*x[i]; + xq += w*x[i]*qf; if (x[i] < minx) minx = x[i]; if (x[i] > maxx) maxx = x[i]; } - float denom = qs*qs - qs2*qk; + float denom = qs*qs - qs2*q1; if (minx == maxx) { *min = x[0]; *d = 0.0f; @@ -451,7 +454,7 @@ static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int *d = 0.0f; } else { *min = (qs*xq - qs2*xs) / denom; - *d = (qs*xs - qk*xq) / denom; + *d = (qs*xs - q1*xq) / denom; } } @@ -491,11 +494,12 @@ static void lstsq_q_k(const float * restrict q, const float * restrict x, const float sx = 0.0f; float qx = 0.0f; for (int i = 0; i < QK_K; i++) { - s2 += s[i/bs]*s[i/bs]; - qs += q[i]*s[i/bs]; - q2 += q[i]*q[i]; - sx += s[i/bs]*x[i]; - qx += q[i]*x[i]; + float w = fabsf(x[i]); + s2 += w*s[i/bs]*s[i/bs]; + qs += w*q[i]*s[i/bs]; + q2 += w*q[i]*q[i]; + sx += w*s[i/bs]*x[i]; + qx += w*q[i]*x[i]; } float denom = qs*qs - q2*s2; if (s2 == 0.0f && q2 != 0.0f) { From 2a21bd8ab24ee563af4b4ecee3b47d064175a283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Thu, 28 Dec 2023 17:13:39 +0200 Subject: [PATCH 07/13] Remove Q_K 0 type changes --- ggml-quants.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index a2054acd99733..61d9eeec19232 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1754,8 +1754,6 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #endif - y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K)); - memset(y[i].hmask, 0, QK_K/8); // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. int m = 0; @@ -2232,8 +2230,6 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } - y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K)); - uint8_t * restrict ql = y[i].ql; uint8_t * restrict qh = y[i].qh; #if QK_K == 256 From 8386034e08519c8817668b4651d64a353df0965d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Fri, 29 Dec 2023 18:26:29 +0200 Subject: [PATCH 08/13] Revert quants other than q4_k and q5_k --- ggml-quants.c | 234 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 188 insertions(+), 46 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 61d9eeec19232..eb7da29adf562 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5,7 +5,6 @@ #include #include #include -#include #ifdef __ARM_NEON @@ -530,7 +529,7 @@ static void quantize_1_0min(const float * restrict x, int n, int bits, uint8_t * for (int l = 0; l < n; l++) { const float x0 = x[l]*id; - const uint8_t xi0 = MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f)); + const uint8_t xi0 = MAX(0, MIN((1 << bits) - 1, (int8_t)(x0 + 0.5f))); q[l] = xi0; } @@ -611,18 +610,32 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - uint8_t q[qk]; - float min; - float scale; - quantize_1(&x[i*qk], qk, 4, q, &min, &scale); + float min = FLT_MAX; + float max = -FLT_MAX; - for (int j = 0; j < qk/2; ++j) { - y[i].qs[j] = q[j]; - y[i].qs[j] |= q[j + qk/2] << 4; + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; } - y[i].d = GGML_FP32_TO_FP16(scale); + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); y[i].m = GGML_FP32_TO_FP16(min); + + for (int j = 0; j < qk/2; ++j) { + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + y[i].qs[j] = xi0; + y[i].qs[j] |= xi1 << 4; + } } } @@ -686,16 +699,30 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - uint8_t q[qk]; - float min; - float scale; - quantize_1(&x[i*qk], qk, 5, q, &min, &scale); + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < qk; j++) { + const float v = x[i*qk + j]; + + if (v < min) min = v; + if (v > max) max = v; + } + + const float d = (max - min) / ((1 << 5) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); uint32_t qh = 0; for (int j = 0; j < qk/2; ++j) { - const uint8_t xi0 = q[j]; - const uint8_t xi1 = q[j + qk/2]; + const float x0 = (x[i*qk + 0 + j] - min)*id; + const float x1 = (x[i*qk + qk/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); @@ -704,9 +731,6 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2); } - y[i].d = GGML_FP32_TO_FP16(scale); - y[i].m = GGML_FP32_TO_FP16(min); - memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); } } @@ -1559,6 +1583,87 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } +static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, + uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, + float rmin, float rdelta, int nstep, bool use_mad) { + float min = x[0]; + float max = x[0]; + float sum_w = weights[0]; + float sum_x = sum_w * x[0]; +#ifdef HAVE_BUGGY_APPLE_LINKER + // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 + for (volatile int i = 1; i < n; ++i) { +#else + for (int i = 1; i < n; ++i) { +#endif + if (x[i] < min) min = x[i]; + if (x[i] > max) max = x[i]; + float w = weights[i]; + sum_w += w; + sum_x += w * x[i]; + } + if (min > 0) min = 0; + if (max == min) { + for (int i = 0; i < n; ++i) L[i] = 0; + *the_min = -min; + return 0.f; + } + float iscale = nmax/(max - min); + float scale = 1/iscale; + float best_mad = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + L[i] = MAX(0, MIN(nmax, l)); + float diff = scale * L[i] + min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + best_mad += w * diff; + } + if (nstep < 1) { + *the_min = -min; + return scale; + } + for (int is = 0; is <= nstep; ++is) { + iscale = (rmin + rdelta*is + nmax)/(max - min); + float sum_l = 0, sum_l2 = 0, sum_xl = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale*(x[i] - min)); + l = MAX(0, MIN(nmax, l)); + Laux[i] = l; + float w = weights[i]; + sum_l += w*l; + sum_l2 += w*l*l; + sum_xl += w*l*x[i]; + } + float D = sum_w * sum_l2 - sum_l * sum_l; + if (D > 0) { + float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D; + float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D; + if (this_min > 0) { + this_min = 0; + this_scale = sum_xl / sum_l2; + } + float mad = 0; + for (int i = 0; i < n; ++i) { + float diff = this_scale * Laux[i] + this_min - x[i]; + diff = use_mad ? fabsf(diff) : diff * diff; + float w = weights[i]; + mad += w * diff; + } + if (mad < best_mad) { + for (int i = 0; i < n; ++i) { + L[i] = Laux[i]; + } + best_mad = mad; + scale = this_scale; + min = this_min; + } + } + } + *the_min = -min; + return scale; +} + #if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { if (j < 4) { @@ -1577,14 +1682,59 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict const int nb = k / QK_K; uint8_t L[QK_K]; + uint8_t Laux[16]; + float weights[16]; + float mins[QK_K/16]; + float scales[QK_K/16]; + + const float q4scale = 15.f; for (int i = 0; i < nb; i++) { - int block_scales[QK_K/16]; - int block_mins[QK_K/16]; - quantize_q_k_1(x, 2, 4, 16, block_scales, block_mins, L, &y[i].d, &y[i].dmin); + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/16; ++j) { + for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); + scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + if (max_scale > 0) { + float iscale = q4scale/max_scale; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*scales[j]); + y[i].scales[j] = l; + } + y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale); + } else { + for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0; + y[i].d = GGML_FP32_TO_FP16(0.f); + } + if (max_min > 0) { + float iscale = q4scale/max_min; + for (int j = 0; j < QK_K/16; ++j) { + int l = nearest_int(iscale*mins[j]); + y[i].scales[j] |= (l << 4); + } + y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale); + } else { + y[i].dmin = GGML_FP32_TO_FP16(0.f); + } for (int j = 0; j < QK_K/16; ++j) { - y[i].scales[j] = (block_mins[j] << 4) | (block_scales[j]); + const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF); + if (!d) continue; + const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4); + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int((x[16*j + ii] + dm)/d); + l = MAX(0, MIN(3, l)); + L[16*j + ii] = l; + } } #if QK_K == 256 @@ -1706,19 +1856,17 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } int8_t sc; - float q_fit[QK_K]; for (int j = 0; j < QK_K/16; ++j) { sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; float d = GGML_FP16_TO_FP32(y[i].d) * sc; + if (!d) { + continue; + } for (int ii = 0; ii < 16; ++ii) { - int l = 0; - if (d) { - l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); - } + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); L[16*j + ii] = l + 4; - q_fit[16*j + ii] = l * sc; } } #else @@ -1738,18 +1886,16 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } y[i].d = GGML_FP32_TO_FP16(0.f); } - float q_fit[QK_K]; for (int j = 0; j < QK_K/16; ++j) { int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8); + if (!d) { + continue; + } for (int ii = 0; ii < 16; ++ii) { - int l = 0; - if (d) { - l = nearest_int(x[16*j + ii]/d); - l = MAX(-4, MIN(3, l)); - } + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-4, MIN(3, l)); L[16*j + ii] = l + 4; - q_fit[16*j + ii] = l * (s - 8); } } #endif @@ -2015,8 +2161,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict #if QK_K == 256 uint8_t L[QK_K]; - float mins[QK_K/32]; - float scales[QK_K/32]; #else int8_t L[QK_K]; float scales[QK_K/16]; @@ -2216,17 +2360,15 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); } - float q_fit[QK_K]; for (int j = 0; j < QK_K/16; ++j) { float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) { + continue; + } for (int ii = 0; ii < 16; ++ii) { - int l = 0; - if (d) { - l = nearest_int(x[16*j + ii]/d); - l = MAX(-32, MIN(31, l)); - } + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-32, MIN(31, l)); L[16*j + ii] = l + 32; - q_fit[16*j + ii] = l * y[i].scales[j]; } } From 5a02328d1f872e2c73bd5e5cbb69a9dc792dd547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Sat, 30 Dec 2023 18:31:46 +0200 Subject: [PATCH 09/13] No second least squares pass --- ggml-quants.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index eb7da29adf562..c2628badfdeb9 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef __ARM_NEON @@ -457,20 +458,6 @@ static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int } } -static float lstsq_q_0(const float * restrict q, const float * restrict x, int qk) { - // Least squares fits `d * q = x` for d. - float qs2 = 0.0f; - float xq = 0.0f; - for (int i = 0; i < qk; i++) { - qs2 += q[i]*q[i]; - xq += x[i]*q[i]; - } - if (qs2 == 0.0f) { - return 0.0f; - } - return xq / qs2; -} - static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) { // Least squares fits `d * q = x` for d. float qs2 = 0.0f; @@ -1445,18 +1432,20 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ } } - // Least squares fit min and scale. - float min, scale; - lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale); - // Check for nans. - assert(min == min); - assert(scale == scale); - // Quantize to fp16 for the next pass. - max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group; - max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group; + if (pass == 0) { + // Least squares fit min and scale. + float min, scale; + lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale); + // Check for nans. + assert(min == min); + assert(scale == scale); + // Quantize to fp16 for the next pass. + max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group; + max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group; - *block_scale = GGML_FP32_TO_FP16(scale); - *block_min = GGML_FP32_TO_FP16(min); + *block_scale = GGML_FP32_TO_FP16(scale); + *block_min = GGML_FP32_TO_FP16(min); + } } } From 4b06507172c9fb912d7d9f778dbe3aa116f5fb41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Mon, 1 Jan 2024 13:59:42 +0200 Subject: [PATCH 10/13] Cleanup --- ggml-quants.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index c2628badfdeb9..c0724e94767e2 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1325,16 +1325,16 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ // If all the weight are positive we can invert the sign of min. // Otherwise blocks with all positive weights need to be quantized with zero // min, because min scale is unsigned. - int all_positive = 1; + bool all_positive = true; for (int j = 0; j < QK_K; j++) { if (x[j] < 0.0f) { - all_positive = 0; + all_positive = false; break; } } - float scales[QK_K]; - float mins[QK_K]; + float scales[QK_K/block_size]; + float mins[QK_K/block_size]; for (int j = 0; j < QK_K/block_size; j++) { uint8_t q[QK_K/block_size]; @@ -1343,7 +1343,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ // Flip the sign because quantize_1 assumes that min is added, but min // is subtracted in k-quants. mins[j] = -mins[j]; - if (!all_positive && mins[j] < 0) { + if ((!all_positive && mins[j] < 0) || (all_positive && mins[j] > 0)) { // All weights are positive in this block, but some blocks have // negative weights. Find new least squares scale with zero min. mins[j] = 0.0f; @@ -1366,18 +1366,18 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ // Increasing passes would decrease RMS error by miniscule amount with // drawback of taking more time. for(int pass = 0; pass < 2; pass++) { - float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale; - float inv_min = max_min == 0.0f ? 0.0f : max_group/max_min; + float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale; + float inv_min = max_min == 0.0f ? 0.0f : max_group/max_min; + float block_d = max_scale/max_group; + float block_dmin = max_min/max_group; for (int j = 0; j < QK_K/block_size; ++j) { - uint8_t ls = nearest_int(inv_scale*scales[j]); - uint8_t lm = nearest_int(inv_min*mins[j]); - uint8_t best_lm = lm; - uint8_t best_ls = ls; + uint8_t ls = MAX(0, nearest_int(inv_scale*scales[j])); + uint8_t lm = MAX(0, nearest_int(inv_min*mins[j])); ls = MIN(max_group, ls); lm = MIN(max_group, lm); + uint8_t best_lm = lm; + uint8_t best_ls = ls; float best_rms = FLT_MAX; - const float d1 = max_scale / max_group; - const float dmin1 = max_min / max_group; int limit = 1; // Increase limit for minor RMS error decrease while increasing the // quantization run time. @@ -1390,14 +1390,14 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ for (int lmt = MAX(0, lm-limit); lmt <= MIN(max_group, lm+limit); lmt++) { float rms = 0.0f; for (int ii = 0; ii < block_size; ii++) { - const float d = d1 * lst; - const float dm1 = dmin1 * lmt; + const float d = block_d * lst; + const float dm = block_dmin * lmt; int l1 = 0; if (d) { - l1 = nearest_int((x[block_size*j + ii] + dm1)/d); + l1 = nearest_int((x[block_size*j + ii] + dm)/d); l1 = MAX(0, MIN((1 << bits) - 1, l1)); } - const float e = (d*l1 - dm1) - x[block_size*j + ii]; + const float e = (d*l1 - dm) - x[block_size*j + ii]; rms += e*e; } if (rms < best_rms) { @@ -1411,8 +1411,6 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ block_mins[j] = best_lm; } - float block_d = max_scale/max_group; - float block_dmin = max_min/max_group; float q_fit[QK_K]; float q_m[QK_K/block_size]; From 7e03872f1eba9fece7f740625048df49aa71f23e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Tue, 2 Jan 2024 15:44:45 +0200 Subject: [PATCH 11/13] Fix QKK_64 --- ggml-quants.c | 88 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 33 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index c0724e94767e2..91cd2f0d917ed 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -5,7 +5,6 @@ #include #include #include -#include #ifdef __ARM_NEON @@ -2041,6 +2040,29 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } #else + float weights[32]; + float mins[QK_K/32]; + float scales[QK_K/32]; + uint8_t Laux[32]; + float max_scale = 0; // as we are deducting the min, scales are always positive + float max_min = 0; + for (int j = 0; j < QK_K/32; ++j) { + //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f); + float sum_x2 = 0; + for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l]; + float av_x = sqrtf(sum_x2/32); + for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]); + scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false); + float scale = scales[j]; + if (scale > max_scale) { + max_scale = scale; + } + float min = mins[j]; + if (min > max_min) { + max_min = min; + } + } + const float s_factor = 15.f; float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f; float inv_min = max_min > 0 ? s_factor/max_min : 0.f; @@ -2156,43 +2178,43 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict for (int i = 0; i < nb; i++) { #if QK_K == 256 - int block_scales[QK_K/32]; - int block_mins[QK_K/32]; - quantize_q_k_1(x, 5, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin); - - for (int j = 0; j < QK_K/32; ++j) { - int ls = block_scales[j]; - int lm = block_mins[j]; - if (j < 4) { - y[i].scales[j] = ls; - y[i].scales[j+4] = lm; - } else { - y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); - y[i].scales[j-4] |= ((ls >> 4) << 6); - y[i].scales[j-0] |= ((lm >> 4) << 6); + int block_scales[QK_K/32]; + int block_mins[QK_K/32]; + quantize_q_k_1(x, 5, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin); + + for (int j = 0; j < QK_K/32; ++j) { + int ls = block_scales[j]; + int lm = block_mins[j]; + if (j < 4) { + y[i].scales[j] = ls; + y[i].scales[j+4] = lm; + } else { + y[i].scales[j+4] = (ls & 0xF) | ((lm & 0xF) << 4); + y[i].scales[j-4] |= ((ls >> 4) << 6); + y[i].scales[j-0] |= ((lm >> 4) << 6); + } } - } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; - memset(qh, 0, QK_K/8); + uint8_t * restrict qh = y[i].qh; + uint8_t * restrict ql = y[i].qs; + memset(qh, 0, QK_K/8); - uint8_t m1 = 1, m2 = 2; - for (int n = 0; n < QK_K; n += 64) { - for (int j = 0; j < 32; ++j) { - int l1 = L[n + j]; - if (l1 > 15) { - l1 -= 16; qh[j] |= m1; - } - int l2 = L[n + j + 32]; - if (l2 > 15) { - l2 -= 16; qh[j] |= m2; + uint8_t m1 = 1, m2 = 2; + for (int n = 0; n < QK_K; n += 64) { + for (int j = 0; j < 32; ++j) { + int l1 = L[n + j]; + if (l1 > 15) { + l1 -= 16; qh[j] |= m1; + } + int l2 = L[n + j + 32]; + if (l2 > 15) { + l2 -= 16; qh[j] |= m2; + } + ql[j] = l1 | (l2 << 4); } - ql[j] = l1 | (l2 << 4); + m1 <<= 2; m2 <<= 2; + ql += 32; } - m1 <<= 2; m2 <<= 2; - ql += 32; - } #else float max_scale = 0, amax = 0; for (int j = 0; j < QK_K/16; ++j) { From 6683eb496061c4d7b9055c55f94ae6f5f944272f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Tue, 2 Jan 2024 20:07:34 +0200 Subject: [PATCH 12/13] Fix buffer size --- ggml-quants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-quants.c b/ggml-quants.c index 91cd2f0d917ed..aea9bae0407ac 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1336,7 +1336,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ float mins[QK_K/block_size]; for (int j = 0; j < QK_K/block_size; j++) { - uint8_t q[QK_K/block_size]; + uint8_t q[block_size]; // First find least squares solution for min and scale for each block. quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]); // Flip the sign because quantize_1 assumes that min is added, but min From 9c11f61ff1d2fe1ab1007fcbc315ffdf966a6e27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= Date: Tue, 2 Jan 2024 20:18:48 +0200 Subject: [PATCH 13/13] Simpler array sizes for msvc --- ggml-quants.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index aea9bae0407ac..a21815dad7d18 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1332,11 +1332,11 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ } } - float scales[QK_K/block_size]; - float mins[QK_K/block_size]; + float scales[QK_K]; + float mins[QK_K]; for (int j = 0; j < QK_K/block_size; j++) { - uint8_t q[block_size]; + uint8_t q[QK_K]; // First find least squares solution for min and scale for each block. quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]); // Flip the sign because quantize_1 assumes that min is added, but min @@ -1411,7 +1411,7 @@ static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_ } float q_fit[QK_K]; - float q_m[QK_K/block_size]; + float q_m[QK_K]; // Quantize elements and populate arrays needed for least squares fit. for (int j = 0; j < QK_K/block_size; ++j) {