Skip to content

Commit f9bbbe3

Browse files
committed
ggml : preserve old Q4 and Q5 formats
1 parent e116eb6 commit f9bbbe3

File tree

4 files changed

+62
-61
lines changed

4 files changed

+62
-61
lines changed

ggml-cuda.cu

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
8686
const int x0 = (x[i].qs[j] & 0xf) - 8;
8787
const int x1 = (x[i].qs[j] >> 4) - 8;
8888

89-
y[i*qk + j + 0 ] = x0*d;
90-
y[i*qk + j + qk/2] = x1*d;
89+
y[i*qk + 2*j + 0] = x0*d;
90+
y[i*qk + 2*j + 1] = x1*d;
9191
}
9292
}
9393

@@ -105,8 +105,8 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
105105
const int x0 = (x[i].qs[j] & 0xf);
106106
const int x1 = (x[i].qs[j] >> 4);
107107

108-
y[i*qk + j + 0 ] = x0*d + m;
109-
y[i*qk + j + qk/2] = x1*d + m;
108+
y[i*qk + 2*j + 0] = x0*d + m;
109+
y[i*qk + 2*j + 1] = x1*d + m;
110110
}
111111
}
112112

@@ -129,8 +129,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
129129
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
130130
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
131131

132-
y[i*qk + j + 0 ] = x0*d;
133-
y[i*qk + j + qk/2] = x1*d;
132+
y[i*qk + 2*j + 0] = x0*d;
133+
y[i*qk + 2*j + 1] = x1*d;
134134
}
135135
}
136136

@@ -154,24 +154,23 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
154154
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
155155
const int x1 = (x[i].qs[j] >> 4) | xh_1;
156156

157-
y[i*qk + j + 0 ] = x0*d + m;
158-
y[i*qk + j + qk/2] = x1*d + m;
157+
y[i*qk + 2*j + 0] = x0*d + m;
158+
y[i*qk + 2*j + 1] = x1*d + m;
159159
}
160160
}
161161

162162
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
163+
static const int qk = QK8_0;
164+
163165
const block_q8_0 * x = (const block_q8_0 *) vx;
164166

165167
const int i = blockIdx.x;
166168

167169
const float d = x[i].d;
168170

169-
const int8_t * pp = x[i].qs;
170-
171-
for (int l = 0; l < QK8_0; l++) {
172-
const int8_t vi = pp[l];
173-
174-
y[i*QK8_0 + l] = vi*d;
171+
for (int j = 0; j < qk/2; ++j) {
172+
y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d;
173+
y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d;
175174
}
176175
}
177176

ggml-opencl.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
114114
const uint i = get_global_id(0) / 32;
115115
const uint l = get_local_id(0);
116116

117+
// TODO: this is broken
117118
result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
118119
}
119120

ggml.c

Lines changed: 47 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -751,8 +751,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
751751
y[i].d = d;
752752

753753
for (int j = 0; j < qk/2; ++j) {
754-
const float x0 = x[i*qk + 0 + j]*id;
755-
const float x1 = x[i*qk + qk/2 + j]*id;
754+
const float x0 = x[i*qk + 2*j + 0]*id;
755+
const float x1 = x[i*qk + 2*j + 1]*id;
756756

757757
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
758758
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
@@ -792,8 +792,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
792792
y[i].m = min;
793793

794794
for (int j = 0; j < qk/2; ++j) {
795-
const float x0 = (x[i*qk + 0 + j] - min)*id;
796-
const float x1 = (x[i*qk + qk/2 + j] - min)*id;
795+
const float x0 = (x[i*qk + 2*j + 0] - min)*id;
796+
const float x1 = (x[i*qk + 2*j + 1] - min)*id;
797797

798798
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
799799
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
@@ -835,8 +835,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
835835
uint32_t qh = 0;
836836

837837
for (int j = 0; j < qk/2; ++j) {
838-
const float x0 = x[i*qk + 0 + j]*id;
839-
const float x1 = x[i*qk + qk/2 + j]*id;
838+
const float x0 = x[i*qk + 2*j + 0]*id;
839+
const float x1 = x[i*qk + 2*j + 1]*id;
840840

841841
const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
842842
const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
@@ -883,8 +883,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
883883
uint32_t qh = 0;
884884

885885
for (int j = 0; j < qk/2; ++j) {
886-
const float x0 = (x[i*qk + 0 + j] - min)*id;
887-
const float x1 = (x[i*qk + qk/2 + j] - min)*id;
886+
const float x0 = (x[i*qk + 2*j + 0] - min)*id;
887+
const float x1 = (x[i*qk + 2*j + 1] - min)*id;
888888

889889
const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
890890
const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
@@ -922,10 +922,12 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
922922

923923
y[i].d = d;
924924

925-
for (int j = 0; j < QK8_0; ++j) {
926-
const float v0 = x[i*QK8_0 + j]*id;
925+
for (int j = 0; j < QK8_0/2; ++j) {
926+
const float v0 = x[i*QK8_0 + 2*j + 0]*id;
927+
const float v1 = x[i*QK8_0 + 2*j + 1]*id;
927928

928-
y[i].qs[j] = roundf(v0);
929+
y[i].qs[ j] = v0 + 0.5f;
930+
y[i].qs[QK8_0/2 + j] = v1 + 0.5f;
929931
}
930932
}
931933
}
@@ -943,12 +945,12 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
943945
float32x4_t asrcv[8];
944946
float32x4_t amaxv[8];
945947

946-
for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
947-
for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
948+
for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j);
949+
for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
948950

949-
for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
950-
for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
951-
for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
951+
for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
952+
for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
953+
for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
952954

953955
const float amax = vmaxvq_f32(amaxv[0]);
954956

@@ -957,14 +959,14 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
957959

958960
y[i].d = d;
959961

960-
for (int l = 0; l < 8; l++) {
961-
const float32x4_t v = vmulq_n_f32(srcv[l], id);
962+
for (int j = 0; j < 8; j++) {
963+
const float32x4_t v = vmulq_n_f32(srcv[j], id);
962964
const int32x4_t vi = vcvtnq_s32_f32(v);
963965

964-
y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
965-
y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
966-
y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
967-
y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
966+
y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0);
967+
y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1);
968+
y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2);
969+
y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3);
968970
}
969971
}
970972
#elif defined(__AVX2__) || defined(__AVX__)
@@ -1080,11 +1082,11 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
10801082
int sum1 = 0;
10811083

10821084
for (int j = 0; j < QK8_1/2; ++j) {
1083-
const float v0 = x[i*QK8_1 + j]*id;
1084-
const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id;
1085+
const float v0 = x[i*QK8_1 + 2*j + 0]*id;
1086+
const float v1 = x[i*QK8_1 + 2*j + 1]*id;
10851087

1086-
y[i].qs[ j] = roundf(v0);
1087-
y[i].qs[QK8_1/2 + j] = roundf(v1);
1088+
y[i].qs[ j] = v0 + 0.5f;
1089+
y[i].qs[QK8_1/2 + j] = v1 + 0.5f;
10881090

10891091
sum0 += y[i].qs[ j];
10901092
sum1 += y[i].qs[QK8_1/2 + j];
@@ -1129,10 +1131,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
11291131
const float32x4_t v = vmulq_n_f32(srcv[j], id);
11301132
const int32x4_t vi = vcvtnq_s32_f32(v);
11311133

1132-
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
1133-
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
1134-
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
1135-
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
1134+
y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0);
1135+
y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1);
1136+
y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2);
1137+
y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3);
11361138

11371139
accv0 = vaddq_s32(accv0, vi);
11381140
}
@@ -1142,10 +1144,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
11421144
const float32x4_t v = vmulq_n_f32(srcv[j], id);
11431145
const int32x4_t vi = vcvtnq_s32_f32(v);
11441146

1145-
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
1146-
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
1147-
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
1148-
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
1147+
y[i].qs[ 2*j + 0] = vgetq_lane_s32(vi, 0);
1148+
y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1);
1149+
y[i].qs[ 2*j + 1] = vgetq_lane_s32(vi, 2);
1150+
y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3);
11491151

11501152
accv1 = vaddq_s32(accv1, vi);
11511153
}
@@ -1271,8 +1273,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
12711273
const int x0 = (x[i].qs[j] & 0x0F) - 8;
12721274
const int x1 = (x[i].qs[j] >> 4) - 8;
12731275

1274-
y[i*qk + j + 0 ] = x0*d;
1275-
y[i*qk + j + qk/2] = x1*d;
1276+
y[i*qk + 2*j + 0] = x0*d;
1277+
y[i*qk + 2*j + 1] = x1*d;
12761278
}
12771279
}
12781280
}
@@ -1292,8 +1294,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
12921294
const int x0 = (x[i].qs[j] & 0x0F);
12931295
const int x1 = (x[i].qs[j] >> 4);
12941296

1295-
y[i*qk + j + 0 ] = x0*d + m;
1296-
y[i*qk + j + qk/2] = x1*d + m;
1297+
y[i*qk + 2*j + 0] = x0*d + m;
1298+
y[i*qk + 2*j + 1] = x1*d + m;
12971299
}
12981300
}
12991301
}
@@ -1318,8 +1320,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict
13181320
const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
13191321
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
13201322

1321-
y[i*qk + j + 0 ] = x0*d;
1322-
y[i*qk + j + qk/2] = x1*d;
1323+
y[i*qk + 2*j + 0] = x0*d;
1324+
y[i*qk + 2*j + 1] = x1*d;
13231325
}
13241326
}
13251327
}
@@ -1345,8 +1347,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict
13451347
const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
13461348
const int x1 = (x[i].qs[j] >> 4) | xh_1;
13471349

1348-
y[i*qk + j + 0 ] = x0*d + m;
1349-
y[i*qk + j + qk/2] = x1*d + m;
1350+
y[i*qk + 2*j + 0] = x0*d + m;
1351+
y[i*qk + 2*j + 1] = x1*d + m;
13501352
}
13511353
}
13521354
}
@@ -1363,8 +1365,9 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
13631365
for (int i = 0; i < nb; i++) {
13641366
const float d = x[i].d;
13651367

1366-
for (int j = 0; j < qk; ++j) {
1367-
y[i*qk + j] = x[i].qs[j]*d;
1368+
for (int j = 0; j < qk/2; ++j) {
1369+
y[i*qk + 2*j + 0] = x[i].qs[j + 0 ]*d;
1370+
y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d;
13681371
}
13691372
}
13701373
}

llama.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -919,9 +919,7 @@ static void llama_model_load_internal(
919919
}
920920

921921
if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
922-
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
923-
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
924-
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
922+
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
925923
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
926924
}
927925
}

0 commit comments

Comments
 (0)