Skip to content

Commit 3656c76

Browse files
committed
cuda : reuse ggml-common
ggml-ci
1 parent bd5f803 commit 3656c76

File tree

2 files changed

+44
-228
lines changed

2 files changed

+44
-228
lines changed

ggml-common.h

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,22 @@ typedef half ggml_fp16_t;
3232
#endif
3333
#endif
3434

35+
// QK = number of values after dequantization
36+
// QR = QK / number of values before dequantization
37+
// QI = number of 32-bit integers before dequantization
38+
3539
#define QK4_0 32 // QK: number of values a block_q4_0 yields after dequantization
40+
#define QI4_0 (QK4_0 / (4 * QR4_0)) // QI: 32 / (4 * 2) = 4; QR4_0 is defined below, which is fine because macros expand at the point of use
41+
#define QR4_0 2 // QR: QK divided by the number of values before dequantization
3642
typedef struct {
3743
ggml_fp16_t d; // delta
3844
uint8_t qs[QK4_0 / 2]; // nibbles / quants: QK4_0 / 2 = 16 bytes (presumably two 4-bit quants per byte)
3945
} block_q4_0;
4046
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); // compile-time check that the struct size is exactly d + qs, i.e. no padding was inserted
4147

4248
#define QK4_1 32
49+
#define QI4_1 (QK4_1 / (4 * QR4_1))
50+
#define QR4_1 2
4351
typedef struct {
4452
ggml_fp16_t d; // delta
4553
ggml_fp16_t m; // min
@@ -48,6 +56,8 @@ typedef struct {
4856
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
4957

5058
#define QK5_0 32
59+
#define QI5_0 (QK5_0 / (4 * QR5_0))
60+
#define QR5_0 2
5161
typedef struct {
5262
ggml_fp16_t d; // delta
5363
uint8_t qh[4]; // 5-th bit of quants
@@ -56,6 +66,8 @@ typedef struct {
5666
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
5767

5868
#define QK5_1 32
69+
#define QI5_1 (QK5_1 / (4 * QR5_1))
70+
#define QR5_1 2
5971
typedef struct {
6072
ggml_fp16_t d; // delta
6173
ggml_fp16_t m; // min
@@ -65,13 +77,17 @@ typedef struct {
6577
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
6678

6779
#define QK8_0 32 // QK: number of values a block_q8_0 yields after dequantization
80+
#define QI8_0 (QK8_0 / (4 * QR8_0)) // QI: 32 / (4 * 1) = 8; QR8_0 is defined below, which is fine because macros expand at the point of use
81+
#define QR8_0 1 // QR: QK divided by the number of values before dequantization
6882
typedef struct {
6983
ggml_fp16_t d; // delta
7084
int8_t qs[QK8_0]; // quants: one signed byte per value, QK8_0 = 32 bytes in total
7185
} block_q8_0;
7286
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); // compile-time check that the struct size is exactly d + qs, i.e. no padding was inserted
7387

7488
#define QK8_1 32
89+
#define QI8_1 (QK8_1 / (4 * QR8_1))
90+
#define QR8_1 1
7591
typedef struct {
7692
float d; // delta
7793
float s; // d * sum(qs[i])
@@ -96,6 +112,8 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
96112
// weight is represented as x = a * q + b
97113
// 16 blocks of 16 elements each
98114
// Effectively 2.625 bits per weight
115+
#define QI2_K (QK_K / (4*QR2_K))
116+
#define QR2_K 4
99117
typedef struct {
100118
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
101119
uint8_t qs[QK_K/4]; // quants
@@ -108,6 +126,8 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
108126
// weight is represented as x = a * q
109127
// 16 blocks of 16 elements each
110128
// Effectively 3.4375 bits per weight
129+
#define QI3_K (QK_K / (4*QR3_K))
130+
#define QR3_K 4
111131
#ifdef GGML_QKK_64
112132
typedef struct {
113133
uint8_t hmask[QK_K/8]; // quants - high bit
@@ -130,6 +150,8 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
130150
// 8 blocks of 32 elements each
131151
// weight is represented as x = a * q + b
132152
// Effectively 4.5 bits per weight
153+
#define QI4_K (QK_K / (4*QR4_K))
154+
#define QR4_K 2
133155
#ifdef GGML_QKK_64
134156
typedef struct {
135157
ggml_fp16_t d[2]; // super-block scales/mins
@@ -151,6 +173,8 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
151173
// 8 blocks of 32 elements each
152174
// weight is represented as x = a * q + b
153175
// Effectively 5.5 bits per weight
176+
#define QI5_K (QK_K / (4*QR5_K))
177+
#define QR5_K 2
154178
#ifdef GGML_QKK_64
155179
typedef struct {
156180
ggml_fp16_t d; // super-block scale
@@ -174,6 +198,8 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
174198
// weight is represented as x = a * q
175199
// 16 blocks of 16 elements each
176200
// Effectively 6.5625 bits per weight
201+
#define QI6_K (QK_K / (4*QR6_K))
202+
#define QR6_K 2
177203
typedef struct {
178204
uint8_t ql[QK_K/2]; // quants, lower 4 bits
179205
uint8_t qh[QK_K/4]; // quants, upper 2 bits
@@ -193,13 +219,17 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
193219
// (Almost) "true" 2-bit quantization.
194220
// Due to the need to use blocks as per ggml design, it ends up using
195221
// 2.0625 bpw because of the 16-bit scale for each block of 256.
222+
#define QI2_XXS (QK_K / (4*QR2_XXS)) // QI: QK_K / 32; QR2_XXS is defined below, which is fine because macros expand at the point of use
223+
#define QR2_XXS 8 // QR: QK divided by the number of values before dequantization
196224
typedef struct {
197225
ggml_fp16_t d; // the 16-bit per-block scale that accounts for the extra 0.0625 bpw
198226
uint16_t qs[QK_K/8]; // QK_K/8 16-bit words, i.e. 2 bits per weight (presumably packed quant data - confirm against the quantization code)
199227
} block_iq2_xxs;
200228
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); // compile-time check that the struct size is exactly d + qs, i.e. no padding was inserted
201229

202230
// 2.3125 bpw quants
231+
#define QI2_XS (QK_K / (4*QR2_XS))
232+
#define QR2_XS 8
203233
typedef struct {
204234
ggml_fp16_t d;
205235
uint16_t qs[QK_K/8];
@@ -208,6 +238,8 @@ typedef struct {
208238
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
209239

210240
// 2.5625 bpw quants
241+
#define QI2_S (QK_K / (4*QR2_S))
242+
#define QR2_S 8
211243
typedef struct {
212244
ggml_fp16_t d;
213245
uint8_t qs[QK_K/4];
@@ -219,6 +251,8 @@ static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wr
219251
// (Almost) "true" 3-bit quantization.
220252
// Due to the need to use blocks as per ggml design, it ends up using
221253
// 3.0625 bpw because of the 16-bit scale for each block of 256.
254+
#define QI3_XXS (QK_K / (4*QR3_XXS))
255+
#define QR3_XXS 8
222256
typedef struct {
223257
ggml_fp16_t d;
224258
uint8_t qs[3*QK_K/8];
@@ -231,6 +265,8 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong
231265
#else
232266
#define IQ3S_N_SCALE QK_K/64
233267
#endif
268+
#define QI3_XS (QK_K / (4*QR3_XS))
269+
#define QR3_XS 8
234270
typedef struct {
235271
ggml_fp16_t d;
236272
uint8_t qs[QK_K/4];
@@ -240,6 +276,8 @@ typedef struct {
240276
} block_iq3_s;
241277
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
242278

279+
#define QI1_S (QK_K / (4*QR1_S))
280+
#define QR1_S 8
243281
typedef struct {
244282
ggml_fp16_t d;
245283
uint8_t qs[QK_K/8];
@@ -249,6 +287,8 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wr
249287

250288
// Non-linear quants
251289
#define QK4_NL 32
290+
#define QI4_NL (QK4_NL / (4*QR4_NL))
291+
#define QR4_NL 2
252292
typedef struct {
253293
ggml_fp16_t d;
254294
uint8_t qs[QK4_NL/2];
@@ -257,8 +297,12 @@ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4
257297

258298
#if QK_K == 64
259299
#define block_iq4_xs block_iq4_nl
300+
#define QI4_XS QI4_NL
301+
#define QR4_XS QR4_NL
260302
//typedef struct block_iq4_nl block_iq4_xs;
261303
#else
304+
#define QI4_XS (QK_K / (4*QR4_XS))
305+
#define QR4_XS 8
262306
typedef struct {
263307
ggml_fp16_t d;
264308
uint16_t scales_h;

0 commit comments

Comments
 (0)