ggml : preserve old Q4 and Q5 formats

ggerganov · ggerganov · commit f9bbbe3aee46 · 2023-05-11T20:06:35.000+03:00
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -86,8 +86,8 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
         const int x0 = (x[i].qs[j] & 0xf) - 8;
         const int x1 = (x[i].qs[j] >>  4) - 8;
 
-        y[i*qk + j + 0   ] = x0*d;
-        y[i*qk + j + qk/2] = x1*d;
+        y[i*qk + 2*j + 0] = x0*d;
+        y[i*qk + 2*j + 1] = x1*d;
     }
 }
 
@@ -105,8 +105,8 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
         const int x0 = (x[i].qs[j] & 0xf);
         const int x1 = (x[i].qs[j] >>  4);
 
-        y[i*qk + j + 0   ] = x0*d + m;
-        y[i*qk + j + qk/2] = x1*d + m;
+        y[i*qk + 2*j + 0] = x0*d + m;
+        y[i*qk + 2*j + 1] = x1*d + m;
     }
 }
 
@@ -129,8 +129,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
         const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
         const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
 
-        y[i*qk + j + 0   ] = x0*d;
-        y[i*qk + j + qk/2] = x1*d;
+        y[i*qk + 2*j + 0] = x0*d;
+        y[i*qk + 2*j + 1] = x1*d;
     }
 }
 
@@ -154,24 +154,23 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
         const int x0 = (x[i].qs[j] & 0xf) | xh_0;
         const int x1 = (x[i].qs[j] >>  4) | xh_1;
 
-        y[i*qk + j + 0   ] = x0*d + m;
-        y[i*qk + j + qk/2] = x1*d + m;
+        y[i*qk + 2*j + 0] = x0*d + m;
+        y[i*qk + 2*j + 1] = x1*d + m;
     }
 }
 
 static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    static const int qk = QK8_0;
+
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
     const int i = blockIdx.x;
 
     const float d = x[i].d;
 
-    const int8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK8_0; l++) {
-        const int8_t vi = pp[l];
-
-        y[i*QK8_0 + l] = vi*d;
+    for (int j = 0; j < qk/2; ++j) {
+        y[i*qk + 2*j + 0] = x[i].qs[j + 0   ]*d;
+        y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d;
     }
 }
 
diff --git a/ggml-opencl.c b/ggml-opencl.c
@@ -114,6 +114,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
     const uint i = get_global_id(0) / 32;
     const uint l = get_local_id(0);
 
+    // TODO: this is broken
     result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
 }
 
diff --git a/ggml.c b/ggml.c
@@ -751,8 +751,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
         y[i].d = d;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 0    + j]*id;
-            const float x1 = x[i*qk + qk/2 + j]*id;
+            const float x0 = x[i*qk + 2*j + 0]*id;
+            const float x1 = x[i*qk + 2*j + 1]*id;
 
             const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
             const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
@@ -792,8 +792,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
         y[i].m = min;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
+            const float x0 = (x[i*qk + 2*j + 0] - min)*id;
+            const float x1 = (x[i*qk + 2*j + 1] - min)*id;
 
             const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
             const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
@@ -835,8 +835,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
         uint32_t qh = 0;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 0    + j]*id;
-            const float x1 = x[i*qk + qk/2 + j]*id;
+            const float x0 = x[i*qk + 2*j + 0]*id;
+            const float x1 = x[i*qk + 2*j + 1]*id;
 
             const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
             const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
@@ -883,8 +883,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
         uint32_t qh = 0;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
+            const float x0 = (x[i*qk + 2*j + 0] - min)*id;
+            const float x1 = (x[i*qk + 2*j + 1] - min)*id;
 
             const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
             const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
@@ -922,10 +922,12 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
 
         y[i].d = d;
 
-        for (int j = 0; j < QK8_0; ++j) {
-            const float v0 = x[i*QK8_0 + j]*id;
+        for (int j = 0; j < QK8_0/2; ++j) {
+            const float v0 = x[i*QK8_0 + 2*j + 0]*id;
+            const float v1 = x[i*QK8_0 + 2*j + 1]*id;
 
-            y[i].qs[j] = roundf(v0);
+            y[i].qs[          j] = v0 + 0.5f;
+            y[i].qs[QK8_0/2 + j] = v1 + 0.5f;
         }
     }
 }
@@ -943,12 +945,12 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         float32x4_t asrcv[8];
         float32x4_t amaxv[8];
 
-        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
-        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
+        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
 
-        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
-        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
-        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
 
         const float amax = vmaxvq_f32(amaxv[0]);
 
@@ -957,14 +959,14 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
 
         y[i].d = d;
 
-        for (int l = 0; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
+        for (int j = 0; j < 8; j++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
             const int32x4_t   vi = vcvtnq_s32_f32(v);
 
-            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
+            y[i].qs[     2*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1);
+            y[i].qs[     2*j + 1] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3);
         }
     }
 #elif defined(__AVX2__) || defined(__AVX__)
@@ -1080,11 +1082,11 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
         int sum1 = 0;
 
         for (int j = 0; j < QK8_1/2; ++j) {
-            const float v0 = x[i*QK8_1           + j]*id;
-            const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id;
+            const float v0 = x[i*QK8_1 + 2*j + 0]*id;
+            const float v1 = x[i*QK8_1 + 2*j + 1]*id;
 
-            y[i].qs[          j] = roundf(v0);
-            y[i].qs[QK8_1/2 + j] = roundf(v1);
+            y[i].qs[          j] = v0 + 0.5f;
+            y[i].qs[QK8_1/2 + j] = v1 + 0.5f;
 
             sum0 += y[i].qs[          j];
             sum1 += y[i].qs[QK8_1/2 + j];
@@ -1129,10 +1131,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
             const float32x4_t v  = vmulq_n_f32(srcv[j], id);
             const int32x4_t   vi = vcvtnq_s32_f32(v);
 
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+            y[i].qs[     2*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1);
+            y[i].qs[     2*j + 1] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3);
 
             accv0 = vaddq_s32(accv0, vi);
         }
@@ -1142,10 +1144,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
             const float32x4_t v  = vmulq_n_f32(srcv[j], id);
             const int32x4_t   vi = vcvtnq_s32_f32(v);
 
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
+            y[i].qs[     2*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1);
+            y[i].qs[     2*j + 1] = vgetq_lane_s32(vi, 2);
+            y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3);
 
             accv1 = vaddq_s32(accv1, vi);
         }
@@ -1271,8 +1273,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
             const int x0 = (x[i].qs[j] & 0x0F) - 8;
             const int x1 = (x[i].qs[j] >>   4) - 8;
 
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
+            y[i*qk + 2*j + 0] = x0*d;
+            y[i*qk + 2*j + 1] = x1*d;
         }
     }
 }
@@ -1292,8 +1294,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
             const int x0 = (x[i].qs[j] & 0x0F);
             const int x1 = (x[i].qs[j] >>   4);
 
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
+            y[i*qk + 2*j + 0] = x0*d + m;
+            y[i*qk + 2*j + 1] = x1*d + m;
         }
     }
 }
@@ -1318,8 +1320,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict
             const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
             const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
 
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
+            y[i*qk + 2*j + 0] = x0*d;
+            y[i*qk + 2*j + 1] = x1*d;
         }
     }
 }
@@ -1345,8 +1347,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict
             const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
             const int x1 = (x[i].qs[j] >>   4) | xh_1;
 
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
+            y[i*qk + 2*j + 0] = x0*d + m;
+            y[i*qk + 2*j + 1] = x1*d + m;
         }
     }
 }
@@ -1363,8 +1365,9 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
     for (int i = 0; i < nb; i++) {
         const float d = x[i].d;
 
-        for (int j = 0; j < qk; ++j) {
-            y[i*qk + j] = x[i].qs[j]*d;
+        for (int j = 0; j < qk/2; ++j) {
+            y[i*qk + 2*j + 0] = x[i].qs[j + 0   ]*d;
+            y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d;
         }
     }
 }
diff --git a/llama.cpp b/llama.cpp
@@ -919,9 +919,7 @@ static void llama_model_load_internal(
     }
 
     if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
-        if (hparams.ftype != LLAMA_FTYPE_ALL_F32     &&
-            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16  &&
-            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
             throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -86,8 +86,8 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) {`
`86`	`86`	`const int x0 = (x[i].qs[j] & 0xf) - 8;`
`87`	`87`	`const int x1 = (x[i].qs[j] >> 4) - 8;`
`88`	`88`
`89`		`- y[iqk + j + 0 ] = x0d;`
`90`		`- y[iqk + j + qk/2] = x1d;`
	`89`	`+ y[iqk + 2j + 0] = x0*d;`
	`90`	`+ y[iqk + 2j + 1] = x1*d;`
`91`	`91`	`}`
`92`	`92`	`}`
`93`	`93`
`@@ -105,8 +105,8 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {`
`105`	`105`	`const int x0 = (x[i].qs[j] & 0xf);`
`106`	`106`	`const int x1 = (x[i].qs[j] >> 4);`
`107`	`107`
`108`		`- y[iqk + j + 0 ] = x0d + m;`
`109`		`- y[iqk + j + qk/2] = x1d + m;`
	`108`	`+ y[iqk + 2j + 0] = x0*d + m;`
	`109`	`+ y[iqk + 2j + 1] = x1*d + m;`
`110`	`110`	`}`
`111`	`111`	`}`
`112`	`112`
`@@ -129,8 +129,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) {`
`129`	`129`	`const int32_t x0 = ((x[i].qs[j] & 0xf) \| xh_0) - 16;`
`130`	`130`	`const int32_t x1 = ((x[i].qs[j] >> 4) \| xh_1) - 16;`
`131`	`131`
`132`		`- y[iqk + j + 0 ] = x0d;`
`133`		`- y[iqk + j + qk/2] = x1d;`
	`132`	`+ y[iqk + 2j + 0] = x0*d;`
	`133`	`+ y[iqk + 2j + 1] = x1*d;`
`134`	`134`	`}`
`135`	`135`	`}`
`136`	`136`
`@@ -154,24 +154,23 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {`
`154`	`154`	`const int x0 = (x[i].qs[j] & 0xf) \| xh_0;`
`155`	`155`	`const int x1 = (x[i].qs[j] >> 4) \| xh_1;`
`156`	`156`
`157`		`- y[iqk + j + 0 ] = x0d + m;`
`158`		`- y[iqk + j + qk/2] = x1d + m;`
	`157`	`+ y[iqk + 2j + 0] = x0*d + m;`
	`158`	`+ y[iqk + 2j + 1] = x1*d + m;`
`159`	`159`	`}`
`160`	`160`	`}`
`161`	`161`
`162`	`162`	`static __global__ void dequantize_block_q8_0(const void * vx, float * y) {`
	`163`	`+ static const int qk = QK8_0;`
	`164`	`+`
`163`	`165`	`const block_q8_0 * x = (const block_q8_0 *) vx;`
`164`	`166`
`165`	`167`	`const int i = blockIdx.x;`
`166`	`168`
`167`	`169`	`const float d = x[i].d;`
`168`	`170`
`169`		`- const int8_t * pp = x[i].qs;`
`170`		`-`
`171`		`- for (int l = 0; l < QK8_0; l++) {`
`172`		`- const int8_t vi = pp[l];`
`173`		`-`
`174`		`- y[iQK8_0 + l] = vid;`
	`171`	`+ for (int j = 0; j < qk/2; ++j) {`
	`172`	`+ y[iqk + 2j + 0] = x[i].qs[j + 0 ]*d;`
	`173`	`+ y[iqk + 2j + 1] = x[i].qs[j + qk/2]*d;`
`175`	`174`	`}`
`176`	`175`	`}`
`177`	`176`
Original file line number	Diff line number	Diff line change
`@@ -114,6 +114,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f`
`114`	`114`	`const uint i = get_global_id(0) / 32;`
`115`	`115`	`const uint l = get_local_id(0);`
`116`	`116`
	`117`	`+ // TODO: this is broken`
`117`	`118`	`result[i32 + l] = blocks[i].qs[l] blocks[i].d;`
`118`	`119`	`}`
`119`	`120`
Original file line number	Diff line number	Diff line change
`@@ -919,9 +919,7 @@ static void llama_model_load_internal(`
`919`	`919`	`}`
`920`	`920`
`921`	`921`	`if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {`
`922`		`- if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&`
`923`		`- hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&`
`924`		`- hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {`
	`922`	`+ if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {`
`925`	`923`	`throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");`
`926`	`924`	`}`
`927`	`925`	`}`