Skip to content

Commit bd5f803

Browse files
committed
ggml : reuse quantum structs across backends
1 parent a167b6d commit bd5f803

File tree

6 files changed

+289
-421
lines changed

6 files changed

+289
-421
lines changed

ggml-common.h

Lines changed: 277 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,279 @@
1-
#ifndef GGML_COMMON_DECL

// Select the fp16 type for the including backend. Exactly one of the
// GGML_COMMON_DECL_* macros must be defined before including this header;
// it also arms GGML_COMMON_DECL so the shared struct declarations below emit.
#if defined(GGML_COMMON_DECL_C)
#include <stdint.h>

typedef uint16_t ggml_fp16_t;

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
#include <metal_stdlib>

typedef half ggml_fp16_t;

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
#include <cstdint>

typedef half ggml_fp16_t;

#define GGML_COMMON_DECL
#endif

#if defined(GGML_COMMON_DECL)

// static_assert shim for C: map to C11 _Static_assert when available,
// otherwise degrade to a harmless no-op declaration.
#ifndef __cplusplus
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
#endif

#define QK4_0 32
typedef struct {
    ggml_fp16_t d;         // delta
    uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");

#define QK4_1 32
typedef struct {
    ggml_fp16_t d;         // delta
    ggml_fp16_t m;         // min
    uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");

#define QK5_0 32
typedef struct {
    ggml_fp16_t d;         // delta
    uint8_t qh[4];         // 5-th bit of quants
    uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

#define QK5_1 32
typedef struct {
    ggml_fp16_t d;         // delta
    ggml_fp16_t m;         // min
    uint8_t qh[4];         // 5-th bit of quants
    uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

#define QK8_0 32
typedef struct {
    ggml_fp16_t d;    // delta
    int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");

#define QK8_1 32
typedef struct {
    float d;          // delta
    float s;          // d * sum(qs[i])
    int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");

//
// Super-block quantization structures
//

// Super-block size
#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif

// 2-bit quantization
// weight is represented as x = a * q + b
// 16 blocks of 16 elements each
// Effectively 2.625 bits per weight
typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
    ggml_fp16_t d;           // super-block scale for quantized scales
    ggml_fp16_t dmin;        // super-block scale for quantized mins
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

// 3-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 3.4375 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4];    // quants - low 2 bits
    uint8_t scales[2];
    ggml_fp16_t d;         // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else
typedef struct {
    uint8_t hmask[QK_K/8]; // quants - high bit
    uint8_t qs[QK_K/4];    // quants - low 2 bits
    uint8_t scales[12];    // scales, quantized with 6 bits
    ggml_fp16_t d;         // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif

// 4-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 4.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d[2];   // super-block scales/mins
    uint8_t scales[2];  // 4-bit block scales/mins
    uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;                // super-block scale for quantized scales
    ggml_fp16_t dmin;             // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];           // 4-bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif

// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 5.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
    ggml_fp16_t d;          // super-block scale
    int8_t scales[QK_K/16]; // 8-bit block scales
    uint8_t qh[QK_K/8];     // quants, high bit
    uint8_t qs[QK_K/2];     // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
typedef struct {
    ggml_fp16_t d;                // super-block scale for quantized scales
    ggml_fp16_t dmin;             // super-block scale for quantized mins
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8];           // quants, high bit
    uint8_t qs[QK_K/2];           // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif

// 6-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 6.5625 bits per weight
typedef struct {
    uint8_t ql[QK_K/2];     // quants, lower 4 bits
    uint8_t qh[QK_K/4];     // quants, upper 2 bits
    int8_t scales[QK_K/16]; // scales, quantized with 8 bits
    ggml_fp16_t d;          // super-block scale
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");

// This is only used for intermediate quantization and dot products
typedef struct {
    float d;                // delta
    int8_t qs[QK_K];        // quants
    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

// (Almost) "true" 2-bit quantization.
// Due to the need to use blocks as per ggml design, it ends up using
// 2.0625 bpw because of the 16-bit scale for each block of 256.
typedef struct {
    ggml_fp16_t d;
    uint16_t qs[QK_K/8];
} block_iq2_xxs;
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");

// 2.3125 bpw quants
typedef struct {
    ggml_fp16_t d;
    uint16_t qs[QK_K/8];
    uint8_t scales[QK_K/32];
} block_iq2_xs;
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");

// 2.5625 bpw quants
typedef struct {
    ggml_fp16_t d;
    uint8_t qs[QK_K/4];
    uint8_t qh[QK_K/32];
    uint8_t scales[QK_K/32];
} block_iq2_s;
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");

// (Almost) "true" 3-bit quantization.
// Due to the need to use blocks as per ggml design, it ends up using
// 3.0625 bpw because of the 16-bit scale for each block of 256.
typedef struct {
    ggml_fp16_t d;
    uint8_t qs[3*QK_K/8];
} block_iq3_xxs;
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

// 3.4375 bpw
#if QK_K == 64
#define IQ3S_N_SCALE 2
#else
#define IQ3S_N_SCALE QK_K/64
#endif
typedef struct {
    ggml_fp16_t d;
    uint8_t qs[QK_K/4];
    uint8_t qh[QK_K/32];
    uint8_t signs[QK_K/8];
    uint8_t scales[IQ3S_N_SCALE];
} block_iq3_s;
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");

typedef struct {
    ggml_fp16_t d;
    uint8_t qs[QK_K/8];
    uint8_t scales[QK_K/16];
} block_iq1_s;
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");

// Non-linear quants
#define QK4_NL 32
typedef struct {
    ggml_fp16_t d;
    uint8_t qs[QK4_NL/2];
} block_iq4_nl;
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");

#if QK_K == 64
#define block_iq4_xs block_iq4_nl
//typedef struct block_iq4_nl block_iq4_xs;
#else
typedef struct {
    ggml_fp16_t d;
    uint16_t scales_h;
    uint8_t scales_l[QK_K/64];
    uint8_t qs[QK_K/2];
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
#endif

#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL
273+
274+
////////////////////////////////////////////////////////////////////////////////
275+
276+
#ifndef GGML_COMMON_IMPL
2277

3278
#if defined(GGML_COMMON_IMPL_C)
4279
#include <stdint.h>
@@ -777,3 +1052,4 @@ GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ2XXS)
7771052
GGML_TABLE_END()
7781053

7791054
#endif // GGML_COMMON_IMPL
1055+
#endif // GGML_COMMON_IMPL

ggml-cuda.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "ggml.h"
33
#include "ggml-backend-impl.h"
44

5+
#define GGML_COMMON_DECL_CUDA
56
#define GGML_COMMON_IMPL_CUDA
67
#include "ggml-common.h"
78

ggml-metal.m

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
336336
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
337337

338338
#ifdef GGML_QKK_64
339-
prep[@"QK_K"] = @(64);
339+
prep[@"GGML_QKK_64"] = @(1);
340340
#endif
341341

342342
MTLCompileOptions* options = [MTLCompileOptions new];

0 commit comments

Comments
 (0)