@@ -32,14 +32,22 @@ typedef half ggml_fp16_t;
32
32
#endif
33
33
#endif
34
34
35
+ // QK = number of values after dequantization
36
+ // QR = QK / number of values before dequantization
37
+ // QI = number of 32-bit integers before dequantization
38
+
35
39
// Q4_0 constants. QK4_0 is the number of values after dequantization.
#define QK4_0 32
40
// QI4_0 names QR4_0 before QR4_0 is #defined. This is valid C: a macro's
// replacement list is only expanded where the macro is used, by which point
// both macros exist. With the values here: 32 / (4 * 2) = 4.
+ #define QI4_0 (QK4_0 / (4 * QR4_0))
41
+ #define QR4_0 2
36
42
// Q4_0 block: one ggml_fp16_t delta followed by QK4_0 / 2 bytes of quant data.
// Storing QK4_0 values in QK4_0 / 2 bytes is consistent with two 4-bit quants
// ("nibbles") per byte.
typedef struct {
37
43
ggml_fp16_t d; // delta
38
44
uint8_t qs[QK4_0 / 2 ]; // nibbles / quants
39
45
} block_q4_0;
40
46
// Compile-time layout check: the struct size must equal the sum of its field
// sizes, so the build fails if the compiler inserts padding.
static_assert (sizeof (block_q4_0) == sizeof (ggml_fp16_t ) + QK4_0 / 2 , " wrong q4_0 block size/padding" );
41
47
42
48
// Q4_1 constants. QK4_1 is the number of values after dequantization.
#define QK4_1 32
49
// QI4_1 is written in terms of QR4_1, which is #defined on the following line;
// macro bodies are expanded at the point of use, so the order is harmless.
// With the values here: 32 / (4 * 2) = 4.
+ #define QI4_1 (QK4_1 / (4 * QR4_1))
50
+ #define QR4_1 2
43
51
typedef struct {
44
52
ggml_fp16_t d; // delta
45
53
ggml_fp16_t m; // min
@@ -48,6 +56,8 @@ typedef struct {
48
56
// Compile-time layout check for block_q4_1: its size must be exactly two
// ggml_fp16_t fields plus QK4_1 / 2 bytes, i.e. no padding.
static_assert (sizeof (block_q4_1) == 2 * sizeof (ggml_fp16_t ) + QK4_1 / 2 , " wrong q4_1 block size/padding" );
49
57
50
58
// Q5_0 constants. QK5_0 is the number of values after dequantization.
#define QK5_0 32
59
// QI5_0 refers to QR5_0 ahead of its #define; this works because macros are
// expanded only where they are used. With the values here: 32 / (4 * 2) = 4.
+ #define QI5_0 (QK5_0 / (4 * QR5_0))
60
+ #define QR5_0 2
51
61
typedef struct {
52
62
ggml_fp16_t d; // delta
53
63
uint8_t qh[4 ]; // 5-th bit of quants
@@ -56,6 +66,8 @@ typedef struct {
56
66
// Compile-time layout check for block_q5_0: one ggml_fp16_t, sizeof(uint32_t)
// bytes, and QK5_0 / 2 bytes must account for the whole struct (no padding).
static_assert (sizeof (block_q5_0) == sizeof (ggml_fp16_t ) + sizeof (uint32_t ) + QK5_0 / 2 , " wrong q5_0 block size/padding" );
57
67
58
68
// Q5_1 constants. QK5_1 is the number of values after dequantization.
#define QK5_1 32
69
// QI5_1 uses QR5_1 before QR5_1 is #defined; expansion happens at the point of
// use, so the definition order does not matter. Value here: 32 / (4 * 2) = 4.
+ #define QI5_1 (QK5_1 / (4 * QR5_1))
70
+ #define QR5_1 2
59
71
typedef struct {
60
72
ggml_fp16_t d; // delta
61
73
ggml_fp16_t m; // min
@@ -65,13 +77,17 @@ typedef struct {
65
77
// Compile-time layout check for block_q5_1: two ggml_fp16_t fields,
// sizeof(uint32_t) bytes, and QK5_1 / 2 bytes must add up to the struct size.
static_assert (sizeof (block_q5_1) == 2 * sizeof (ggml_fp16_t ) + sizeof (uint32_t ) + QK5_1 / 2 , " wrong q5_1 block size/padding" );
66
78
67
79
// Q8_0 constants. QK8_0 is the number of values after dequantization.
#define QK8_0 32
80
// QI8_0 refers to QR8_0 before its #define; valid because macro bodies are
// expanded at the point of use. With the values here: 32 / (4 * 1) = 8.
+ #define QI8_0 (QK8_0 / (4 * QR8_0))
81
+ #define QR8_0 1
68
82
// Q8_0 block: one ggml_fp16_t delta followed by QK8_0 signed 8-bit quants
// (one byte per value).
typedef struct {
69
83
ggml_fp16_t d; // delta
70
84
int8_t qs[QK8_0]; // quants
71
85
} block_q8_0;
72
86
// Compile-time layout check: the struct must be exactly the delta plus QK8_0
// bytes, so the build fails if the compiler inserts padding.
static_assert (sizeof (block_q8_0) == sizeof (ggml_fp16_t ) + QK8_0, " wrong q8_0 block size/padding" );
73
87
74
88
// Q8_1 constants. QK8_1 is the number of values after dequantization.
#define QK8_1 32
89
// QI8_1 is expressed via QR8_1, which is #defined on the next line; macros are
// expanded where used, so this order is fine. Value here: 32 / (4 * 1) = 8.
+ #define QI8_1 (QK8_1 / (4 * QR8_1))
90
+ #define QR8_1 1
75
91
typedef struct {
76
92
float d; // delta
77
93
float s; // d * sum(qs[i])
@@ -96,6 +112,8 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
96
112
// weight is represented as x = a * q + b
97
113
// 16 blocks of 16 elements each
98
114
// Effectively 2.625 bits per weight
115
// QI2_K = QK_K / (4 * QR2_K). QK_K is defined outside this excerpt, and QR2_K
// is #defined on the following line; both are resolved when QI2_K is expanded.
+ #define QI2_K (QK_K / (4 *QR2_K))
116
+ #define QR2_K 4
99
117
typedef struct {
100
118
uint8_t scales[QK_K/16 ]; // scales and mins, quantized with 4 bits
101
119
uint8_t qs[QK_K/4 ]; // quants
@@ -108,6 +126,8 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
108
126
// weight is represented as x = a * q
109
127
// 16 blocks of 16 elements each
110
128
// Effectively 3.4375 bits per weight
129
// QI3_K = QK_K / (4 * QR3_K). QK_K comes from outside this excerpt; QR3_K is
// #defined just below, which is fine because expansion happens at use.
+ #define QI3_K (QK_K / (4 *QR3_K))
130
+ #define QR3_K 4
111
131
#ifdef GGML_QKK_64
112
132
typedef struct {
113
133
uint8_t hmask[QK_K/8 ]; // quants - high bit
@@ -130,6 +150,8 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
130
150
// 8 blocks of 32 elements each
131
151
// weight is represented as x = a * q + b
132
152
// Effectively 4.5 bits per weight
153
// QI4_K = QK_K / (4 * QR4_K). QK_K is defined elsewhere; QR4_K follows on the
// next line and is only looked up when QI4_K is expanded.
+ #define QI4_K (QK_K / (4 *QR4_K))
154
+ #define QR4_K 2
133
155
#ifdef GGML_QKK_64
134
156
typedef struct {
135
157
ggml_fp16_t d[2 ]; // super-block scales/mins
@@ -151,6 +173,8 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
151
173
// 8 blocks of 32 elements each
152
174
// weight is represented as x = a * q + b
153
175
// Effectively 5.5 bits per weight
176
// QI5_K = QK_K / (4 * QR5_K). QK_K is defined elsewhere; the forward reference
// to QR5_K is valid because macro bodies are expanded at the point of use.
+ #define QI5_K (QK_K / (4 *QR5_K))
177
+ #define QR5_K 2
154
178
#ifdef GGML_QKK_64
155
179
typedef struct {
156
180
ggml_fp16_t d; // super-block scale
@@ -174,6 +198,8 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
174
198
// weight is represented as x = a * q
175
199
// 16 blocks of 16 elements each
176
200
// Effectively 6.5625 bits per weight
201
// QI6_K = QK_K / (4 * QR6_K). QK_K is defined elsewhere; QR6_K is #defined on
// the following line and resolved when QI6_K is expanded.
+ #define QI6_K (QK_K / (4 *QR6_K))
202
+ #define QR6_K 2
177
203
typedef struct {
178
204
uint8_t ql[QK_K/2 ]; // quants, lower 4 bits
179
205
uint8_t qh[QK_K/4 ]; // quants, upper 2 bits
@@ -193,13 +219,17 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
193
219
// (Almost) "true" 2-bit quantization.
194
220
// Due to the need to use blocks as per ggml design, it ends up using
195
221
// 2.0625 bpw because of the 16-bit scale for each block of 256.
222
// QI2_XXS = QK_K / (4 * QR2_XXS). QK_K is defined outside this excerpt;
// QR2_XXS is #defined just below, which is fine because expansion is lazy.
+ #define QI2_XXS (QK_K / (4 *QR2_XXS))
223
+ #define QR2_XXS 8
196
224
// IQ2_XXS block: one ggml_fp16_t value `d` followed by QK_K / 8 16-bit words.
// NOTE(review): `d` is presumably the block scale, as in the other block
// types — confirm against the (de)quantization code.
typedef struct {
197
225
ggml_fp16_t d;
198
226
uint16_t qs[QK_K/8 ];
199
227
} block_iq2_xxs;
200
228
// Compile-time layout check: the struct must be exactly `d` plus the qs array,
// so the build fails if the compiler inserts padding.
static_assert (sizeof (block_iq2_xxs) == sizeof (ggml_fp16_t ) + QK_K/8 *sizeof (uint16_t ), " wrong iq2_xxs block size/padding" );
201
229
202
230
// 2.3125 bpw quants
231
// QI2_XS = QK_K / (4 * QR2_XS). QK_K is defined elsewhere; QR2_XS follows on
// the next line and is only looked up when QI2_XS is expanded.
+ #define QI2_XS (QK_K / (4 *QR2_XS))
232
+ #define QR2_XS 8
203
233
typedef struct {
204
234
ggml_fp16_t d;
205
235
uint16_t qs[QK_K/8 ];
@@ -208,6 +238,8 @@ typedef struct {
208
238
// Compile-time layout check for block_iq2_xs: one ggml_fp16_t, QK_K / 8 16-bit
// words, and QK_K / 32 further bytes must account for the whole struct.
static_assert (sizeof (block_iq2_xs) == sizeof (ggml_fp16_t ) + QK_K/8 *sizeof (uint16_t ) + QK_K/32 , " wrong iq2_xs block size/padding" );
209
239
210
240
// 2.5625 bpw quants
241
// QI2_S = QK_K / (4 * QR2_S). QK_K is defined elsewhere; the forward reference
// to QR2_S is valid because macro bodies are expanded at the point of use.
+ #define QI2_S (QK_K / (4 *QR2_S))
242
+ #define QR2_S 8
211
243
typedef struct {
212
244
ggml_fp16_t d;
213
245
uint8_t qs[QK_K/4 ];
@@ -219,6 +251,8 @@ static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wr
219
251
// (Almost) "true" 3-bit quantization.
220
252
// Due to the need to use blocks as per ggml design, it ends up using
221
253
// 3.0625 bpw because of the 16-bit scale for each block of 256.
254
// QI3_XXS = QK_K / (4 * QR3_XXS). QK_K is defined outside this excerpt;
// QR3_XXS is #defined on the following line and resolved at expansion time.
+ #define QI3_XXS (QK_K / (4 *QR3_XXS))
255
+ #define QR3_XXS 8
222
256
typedef struct {
223
257
ggml_fp16_t d;
224
258
uint8_t qs[3 *QK_K/8 ];
@@ -231,6 +265,8 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
231
265
#else
232
266
// Number of scale bytes for iq3_s in this preprocessor branch: QK_K / 64.
// The replacement list is parenthesized so the division stays grouped wherever
// the macro is expanded (e.g. `x / IQ3S_N_SCALE`, `x % IQ3S_N_SCALE`).
#define IQ3S_N_SCALE (QK_K/64)
233
267
#endif
268
// QI3_XS = QK_K / (4 * QR3_XS). QK_K is defined elsewhere; QR3_XS is #defined
// on the next line and only looked up when QI3_XS is expanded.
// NOTE(review): these are named *_XS, but the struct that appears to follow is
// block_iq3_s — confirm the naming is intentional.
+ #define QI3_XS (QK_K / (4 *QR3_XS))
269
+ #define QR3_XS 8
234
270
typedef struct {
235
271
ggml_fp16_t d;
236
272
uint8_t qs[QK_K/4 ];
@@ -240,6 +276,8 @@ typedef struct {
240
276
} block_iq3_s;
241
277
// Compile-time layout check for block_iq3_s: one ggml_fp16_t, 13 * (QK_K / 32)
// bytes, and IQ3S_N_SCALE bytes must account for the whole struct (no padding).
static_assert (sizeof (block_iq3_s) == sizeof (ggml_fp16_t ) + 13 *(QK_K/32 ) + IQ3S_N_SCALE, " wrong iq3_s block size/padding" );
242
278
279
// QI1_S = QK_K / (4 * QR1_S). QK_K is defined elsewhere; the forward reference
// to QR1_S is valid because macro bodies are expanded at the point of use.
+ #define QI1_S (QK_K / (4 *QR1_S))
280
+ #define QR1_S 8
243
281
typedef struct {
244
282
ggml_fp16_t d;
245
283
uint8_t qs[QK_K/8 ];
@@ -249,6 +287,8 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wr
249
287
250
288
// Non-linear quants
251
289
// IQ4_NL constants. QK4_NL sizes the qs array of block_iq4_nl (QK4_NL / 2 bytes).
#define QK4_NL 32
290
// QI4_NL refers to QR4_NL before its #define; this is valid because macros are
// expanded only where they are used. With the values here: 32 / (4 * 2) = 4.
+ #define QI4_NL (QK4_NL / (4 *QR4_NL))
291
+ #define QR4_NL 2
252
292
typedef struct {
253
293
ggml_fp16_t d;
254
294
uint8_t qs[QK4_NL/2 ];
@@ -257,8 +297,12 @@ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4
257
297
258
298
#if QK_K == 64
259
299
#define block_iq4_xs block_iq4_nl
300
+ #define QI4_XS QI4_NL
301
+ #define QR4_XS QR4_NL
260
302
// typedef struct block_iq4_nl block_iq4_xs;
261
303
#else
304
+ #define QI4_XS (QK_K / (4 *QR4_XS))
305
+ #define QR4_XS 8
262
306
typedef struct {
263
307
ggml_fp16_t d;
264
308
uint16_t scales_h;
0 commit comments