Skip to content

Commit 48c84f7

Browse files
committed
q4_0c: AVX512 vec_dot and quantize impl
1 parent 0365c68 commit 48c84f7

File tree

2 files changed

+125
-18
lines changed

2 files changed

+125
-18
lines changed

ggml.c

Lines changed: 124 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,17 +1437,17 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
14371437

14381438
// reference implementation for deterministic creation of model files
14391439
static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
1440-
assert(k % QK8_0 == 0);
1441-
const int nb = k / QK8_0;
1440+
assert(k % QK8_0C == 0);
1441+
const int nb = k / QK8_0C;
14421442

14431443
uint8_t * restrict qs = y;
14441444
float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
14451445

14461446
for (int i = 0; i < nb; i++) {
14471447
float amax = 0.0f; // absolute max
14481448

1449-
for (int l = 0; l < QK8_0; l++) {
1450-
const float v = x[i*QK8_0 + l];
1449+
for (int l = 0; l < QK8_0C; l++) {
1450+
const float v = x[i*QK8_0C + l];
14511451
amax = MAX(amax, fabsf(v));
14521452
}
14531453

@@ -1456,17 +1456,46 @@ static void quantize_row_q8_0c_reference(const float * restrict x, void * restri
14561456

14571457
ds[i] = d;
14581458

1459-
for (int l = 0; l < QK8_0; ++l) {
1460-
const float v = x[i*QK8_0 + l]*id;
1461-
qs[i*QK8_0 + l] = roundf(v);
1459+
for (int l = 0; l < QK8_0C; ++l) {
1460+
const float v = x[i*QK8_0C + l]*id;
1461+
qs[i*QK8_0C + l] = roundf(v);
14621462
}
14631463
}
14641464
}
14651465

14661466
// Runtime q8_0c quantization: AVX-512 fast path with scalar fallback.
// Output layout matches quantize_row_q8_0c_reference: nb*QK8_0C int8 quants
// followed by nb float scales (one per block).
static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK8_0C == 0);
    const int nb = k / QK8_0C;

    int8_t * restrict qs = vy;
    float * restrict ds = (float *) ((uint8_t *) vy + nb*QK8_0C);

#if __AVX512F__
    for (int i = 0; i < nb; i++) {
        // two 16-float vectors cover one QK8_0C-wide block
        const __m512 x0 = _mm512_loadu_ps( x + i*QK8_0C );
        const __m512 x1 = _mm512_loadu_ps( x + i*QK8_0C + QK8_0C/2);

        // find the absolute max over the whole block
        const __m512 x0abs = _mm512_abs_ps(x0);
        const __m512 x1abs = _mm512_abs_ps(x1);
        const float amax = _mm512_reduce_max_ps(_mm512_max_ps(x0abs, x1abs));

        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f; // guard against division by zero

        ds[i] = d;

        // scale, round-to-nearest-even, then narrow each 32-bit lane to int8
        const __m512 mul = _mm512_set1_ps( id );
        const __m512i x0q = _mm512_cvt_roundps_epi32(_mm512_mul_ps(x0, mul), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
        const __m512i x1q = _mm512_cvt_roundps_epi32(_mm512_mul_ps(x1, mul), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

        // masked byte-narrowing store: 0xffff selects all 16 lanes
        _mm512_mask_cvtepi32_storeu_epi8(qs + i*QK8_0C, 0xffff, x0q);
        _mm512_mask_cvtepi32_storeu_epi8(qs + i*QK8_0C + QK8_0C/2, 0xffff, x1q);
    }
#else
    // scalar
    quantize_row_q8_0c_reference(x, vy, k);
#endif
}
14711500

14721501
static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
@@ -2364,6 +2393,73 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
23642393
*s = sumf;
23652394
}
23662395

2396+
#if __AVX512F__ && QK4_0 == 32

// Dot product of four blocks of q4_0c with four blocks of q8_0c,
// accumulated into (and returned as) the fp32 accumulator `acc`.
//
// xqs: 64 packed bytes holding 4 q4_0 blocks (low nibbles = blocks 0/1,
//      high nibbles = blocks 2/3); xds: their 4 float scales.
// yqs: 4*QK8_0C int8 quants of q8_0c; yds: their 4 float scales.
static inline __m512 dot_q4_0c_fourblocks_avx512(
    __m512 acc,
    const uint8_t * restrict xqs,
    const float * restrict xds,
    const int8_t * restrict yqs,
    const float * restrict yds
) {
    // load quantized bytes
    // TODO: change back to aligned loads
    const __m512i xqs0123 = _mm512_loadu_epi64( xqs );
    const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf );
    const __m512i xqs01 = _mm512_and_si512( low_nibble_mask, xqs0123 );
    // 32-bit shift bleeds bits across byte boundaries, but the nibble mask
    // discards them afterwards.  TODO: try srlv/i?
    const __m512i xqs23 = _mm512_and_si512( low_nibble_mask, _mm512_srli_epi32( xqs0123, 4 ) );
    const __m512i yqs01 = _mm512_loadu_epi64( yqs );
    const __m512i yqs23 = _mm512_loadu_epi64( yqs + 2*QK8_0C );

    // load scales: broadcast the 4 products into every 128-bit lane, then
    // use in-lane permutes (indices 0..3) to pair each dword with its block's scale
    const __m512i scale_mask0 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
    const __m512i scale_mask1 = _mm512_set_epi32(3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2);
    const __m128 xyds = _mm_mul_ps(_mm_load_ps(xds), _mm_load_ps(yds));
    const __m512 xyds0123 = _mm512_broadcast_f32x4(xyds);
    const __m512 xyds01 = _mm512_permutevar_ps(xyds0123, scale_mask0);
    const __m512 xyds23 = _mm512_permutevar_ps(xyds0123, scale_mask1);

    // take dot product of x and y bytes
    const __m512i plus_8 = _mm512_set1_epi8( 8 );
#ifdef __AVX512VNNI__
    // We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch:
    // the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8
    // from each nibble, so they can be negative. So, instead of `(xqs01 - 8) * yqs01`,
    // we compute `xqs01 * yqs01 - 8 * yqks`.
    const __m512i zero = _mm512_setzero_epi32();
    const __m512i yqs01_mul8 = _mm512_dpbusds_epi32( zero, plus_8, yqs01 );
    const __m512i yqs23_mul8 = _mm512_dpbusds_epi32( zero, plus_8, yqs23 );
    const __m512i xy01 = _mm512_dpbusds_epi32( zero, xqs01, yqs01 );
    const __m512i xy23 = _mm512_dpbusds_epi32( zero, xqs23, yqs23 );
    const __m512i res0_int = _mm512_sub_epi32( xy01, yqs01_mul8 );
    const __m512i res1_int = _mm512_sub_epi32( xy23, yqs23_mul8 );
#else
    // As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones.
    // It has the same catch as VPDPBUSDS: the left operand should be unsigned.
    // This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me
    // ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119
    const __m512i one = _mm512_set1_epi16( 1 );
    const __m512i prod_0 = _mm512_maddubs_epi16( xqs01, yqs01 );
    const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, yqs01 );
    const __m512i prod_2 = _mm512_maddubs_epi16( xqs23, yqs23 );
    const __m512i prod_3 = _mm512_maddubs_epi16( plus_8, yqs23 );
    const __m512i diff0 = _mm512_sub_epi16( prod_0, prod_1 );
    const __m512i diff1 = _mm512_sub_epi16( prod_2, prod_3 );
    // madd with 1 widens the 16-bit sums pairwise to 32-bit
    const __m512i res0_int = _mm512_madd_epi16( diff0, one );
    const __m512i res1_int = _mm512_madd_epi16( diff1, one );
#endif

    // Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate.
    const __m512 res0_float = _mm512_cvtepi32_ps( res0_int );
    const __m512 res1_float = _mm512_cvtepi32_ps( res1_int );

    return _mm512_fmadd_ps( xyds23, res1_float,
                            _mm512_fmadd_ps( xyds01, res0_float, acc ));
}
#endif
2462+
23672463
inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
23682464
ggml_float sumf = 0.0;
23692465

@@ -2610,6 +2706,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
26102706

26112707
float sumf = 0.0;
26122708

2709+
#if __AVX512F__
2710+
// Initialize accumulator with zeros
2711+
__m512 acc = _mm512_setzero_ps();
2712+
for (int i = 0; i < nb; i += 4) {
2713+
acc = dot_q4_0c_fourblocks_avx512(acc, xqs + i*QK4_0/2, xds + i, yqs + i*QK8_0, yds + i);
2714+
}
2715+
// Horizontal sum of all lanes of the accumulator
2716+
sumf = _mm512_reduce_add_ps( acc );
2717+
#else
26132718
// scalar
26142719
for (int i = 0; i < nb/2; i++) {
26152720
const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
@@ -2620,23 +2725,25 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
26202725
const float dy0 = yds[dst0];
26212726
const float dy1 = yds[dst1];
26222727

2623-
int sumi0 = 0;
2624-
int sumi1 = 0;
2728+
// NOTE: having these as plain int triggers a bug with AVX512 on GCC 12.2
2729+
int64_t sumi0 = 0;
2730+
int64_t sumi1 = 0;
26252731

26262732
for (int l = 0; l < QK4_0; l++) {
2627-
const uint8_t v0 = xqs[i*QK4_0 + l];
2733+
const uint8_t v0 = xqs[i*QK4_0 + l];
26282734

2629-
const int i0 = (int8_t) (v0 & 0xf) - 8;
2630-
const int i1 = (int8_t) (v0 >> 4) - 8;
2735+
const int i0 = (int) (v0 & 0xf) - 8;
2736+
const int i1 = (int) (v0 >> 4) - 8;
26312737

2632-
const int i2 = yqs[dst0*QK4_0 + l];
2633-
const int i3 = yqs[dst1*QK4_0 + l];
2738+
const int i2 = yqs[dst0*QK4_0 + l];
2739+
const int i3 = yqs[dst1*QK4_0 + l];
26342740

2635-
sumi0 += i0*i2;
2636-
sumi1 += i1*i3;
2741+
sumi0 += i0*i2;
2742+
sumi1 += i1*i3;
26372743
}
26382744
sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
26392745
}
2746+
#endif
26402747

26412748
*s = sumf;
26422749
}

llama.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
839839
case LLAMA_FTYPE_ALL_F32: return "all F32";
840840
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
841841
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
842-
case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_1C";
842+
case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
843843
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
844844
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
845845
return "mostly Q4_1, some F16";

0 commit comments

Comments
 (0)