@@ -751,8 +751,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
751
751
y [i ].d = d ;
752
752
753
753
for (int j = 0 ; j < qk /2 ; ++ j ) {
754
- const float x0 = x [i * qk + 0 + j ]* id ;
755
- const float x1 = x [i * qk + qk / 2 + j ]* id ;
754
+ const float x0 = x [i * qk + 2 * j + 0 ]* id ;
755
+ const float x1 = x [i * qk + 2 * j + 1 ]* id ;
756
756
757
757
const uint8_t xi0 = MIN (15 , (int8_t )(x0 + 8.5f ));
758
758
const uint8_t xi1 = MIN (15 , (int8_t )(x1 + 8.5f ));
@@ -792,8 +792,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
792
792
y [i ].m = min ;
793
793
794
794
for (int j = 0 ; j < qk /2 ; ++ j ) {
795
- const float x0 = (x [i * qk + 0 + j ] - min )* id ;
796
- const float x1 = (x [i * qk + qk / 2 + j ] - min )* id ;
795
+ const float x0 = (x [i * qk + 2 * j + 0 ] - min )* id ;
796
+ const float x1 = (x [i * qk + 2 * j + 1 ] - min )* id ;
797
797
798
798
const uint8_t xi0 = MIN (15 , (int8_t )(x0 + 0.5f ));
799
799
const uint8_t xi1 = MIN (15 , (int8_t )(x1 + 0.5f ));
@@ -835,8 +835,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
835
835
uint32_t qh = 0 ;
836
836
837
837
for (int j = 0 ; j < qk /2 ; ++ j ) {
838
- const float x0 = x [i * qk + 0 + j ]* id ;
839
- const float x1 = x [i * qk + qk / 2 + j ]* id ;
838
+ const float x0 = x [i * qk + 2 * j + 0 ]* id ;
839
+ const float x1 = x [i * qk + 2 * j + 1 ]* id ;
840
840
841
841
const uint8_t xi0 = MIN (31 , (int8_t )(x0 + 16.5f ));
842
842
const uint8_t xi1 = MIN (31 , (int8_t )(x1 + 16.5f ));
@@ -883,8 +883,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
883
883
uint32_t qh = 0 ;
884
884
885
885
for (int j = 0 ; j < qk /2 ; ++ j ) {
886
- const float x0 = (x [i * qk + 0 + j ] - min )* id ;
887
- const float x1 = (x [i * qk + qk / 2 + j ] - min )* id ;
886
+ const float x0 = (x [i * qk + 2 * j + 0 ] - min )* id ;
887
+ const float x1 = (x [i * qk + 2 * j + 1 ] - min )* id ;
888
888
889
889
const uint8_t xi0 = (uint8_t )(x0 + 0.5f );
890
890
const uint8_t xi1 = (uint8_t )(x1 + 0.5f );
@@ -922,10 +922,12 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
922
922
923
923
y [i ].d = d ;
924
924
925
- for (int j = 0 ; j < QK8_0 ; ++ j ) {
926
- const float v0 = x [i * QK8_0 + j ]* id ;
925
+ for (int j = 0 ; j < QK8_0 /2 ; ++ j ) {
926
+ const float v0 = x [i * QK8_0 + 2 * j + 0 ]* id ;
927
+ const float v1 = x [i * QK8_0 + 2 * j + 1 ]* id ;
927
928
928
- y [i ].qs [j ] = roundf (v0 );
929
+ y [i ].qs [ j ] = v0 + 0.5f ;
930
+ y [i ].qs [QK8_0 /2 + j ] = v1 + 0.5f ;
929
931
}
930
932
}
931
933
}
@@ -943,12 +945,12 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
943
945
float32x4_t asrcv [8 ];
944
946
float32x4_t amaxv [8 ];
945
947
946
- for (int l = 0 ; l < 8 ; l ++ ) srcv [l ] = vld1q_f32 (x + i * 32 + 4 * l );
947
- for (int l = 0 ; l < 8 ; l ++ ) asrcv [l ] = vabsq_f32 (srcv [l ]);
948
+ for (int j = 0 ; j < 8 ; j ++ ) srcv [j ] = vld1q_f32 (x + i * 32 + 4 * j );
949
+ for (int j = 0 ; j < 8 ; j ++ ) asrcv [j ] = vabsq_f32 (srcv [j ]);
948
950
949
- for (int l = 0 ; l < 4 ; l ++ ) amaxv [2 * l ] = vmaxq_f32 (asrcv [2 * l ], asrcv [2 * l + 1 ]);
950
- for (int l = 0 ; l < 2 ; l ++ ) amaxv [4 * l ] = vmaxq_f32 (amaxv [4 * l ], amaxv [4 * l + 2 ]);
951
- for (int l = 0 ; l < 1 ; l ++ ) amaxv [8 * l ] = vmaxq_f32 (amaxv [8 * l ], amaxv [8 * l + 4 ]);
951
+ for (int j = 0 ; j < 4 ; j ++ ) amaxv [2 * j ] = vmaxq_f32 (asrcv [2 * j ], asrcv [2 * j + 1 ]);
952
+ for (int j = 0 ; j < 2 ; j ++ ) amaxv [4 * j ] = vmaxq_f32 (amaxv [4 * j ], amaxv [4 * j + 2 ]);
953
+ for (int j = 0 ; j < 1 ; j ++ ) amaxv [8 * j ] = vmaxq_f32 (amaxv [8 * j ], amaxv [8 * j + 4 ]);
952
954
953
955
const float amax = vmaxvq_f32 (amaxv [0 ]);
954
956
@@ -957,14 +959,14 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
957
959
958
960
y [i ].d = d ;
959
961
960
- for (int l = 0 ; l < 8 ; l ++ ) {
961
- const float32x4_t v = vmulq_n_f32 (srcv [l ], id );
962
+ for (int j = 0 ; j < 8 ; j ++ ) {
963
+ const float32x4_t v = vmulq_n_f32 (srcv [j ], id );
962
964
const int32x4_t vi = vcvtnq_s32_f32 (v );
963
965
964
- y [i ].qs [4 * l + 0 ] = vgetq_lane_s32 (vi , 0 );
965
- y [i ].qs [4 * l + 1 ] = vgetq_lane_s32 (vi , 1 );
966
- y [i ].qs [4 * l + 2 ] = vgetq_lane_s32 (vi , 2 );
967
- y [i ].qs [4 * l + 3 ] = vgetq_lane_s32 (vi , 3 );
966
+ y [i ].qs [ 2 * j + 0 ] = vgetq_lane_s32 (vi , 0 );
967
+ y [i ].qs [16 + 2 * j + 0 ] = vgetq_lane_s32 (vi , 1 );
968
+ y [i ].qs [ 2 * j + 1 ] = vgetq_lane_s32 (vi , 2 );
969
+ y [i ].qs [16 + 2 * j + 1 ] = vgetq_lane_s32 (vi , 3 );
968
970
}
969
971
}
970
972
#elif defined(__AVX2__ ) || defined(__AVX__ )
@@ -1080,11 +1082,11 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
1080
1082
int sum1 = 0 ;
1081
1083
1082
1084
for (int j = 0 ; j < QK8_1 /2 ; ++ j ) {
1083
- const float v0 = x [i * QK8_1 + j ]* id ;
1084
- const float v1 = x [i * QK8_1 + QK8_1 / 2 + j ]* id ;
1085
+ const float v0 = x [i * QK8_1 + 2 * j + 0 ]* id ;
1086
+ const float v1 = x [i * QK8_1 + 2 * j + 1 ]* id ;
1085
1087
1086
- y [i ].qs [ j ] = roundf ( v0 ) ;
1087
- y [i ].qs [QK8_1 /2 + j ] = roundf ( v1 ) ;
1088
+ y [i ].qs [ j ] = v0 + 0.5f ;
1089
+ y [i ].qs [QK8_1 /2 + j ] = v1 + 0.5f ;
1088
1090
1089
1091
sum0 += y [i ].qs [ j ];
1090
1092
sum1 += y [i ].qs [QK8_1 /2 + j ];
@@ -1129,10 +1131,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1129
1131
const float32x4_t v = vmulq_n_f32 (srcv [j ], id );
1130
1132
const int32x4_t vi = vcvtnq_s32_f32 (v );
1131
1133
1132
- y [i ].qs [4 * j + 0 ] = vgetq_lane_s32 (vi , 0 );
1133
- y [i ].qs [4 * j + 1 ] = vgetq_lane_s32 (vi , 1 );
1134
- y [i ].qs [4 * j + 2 ] = vgetq_lane_s32 (vi , 2 );
1135
- y [i ].qs [4 * j + 3 ] = vgetq_lane_s32 (vi , 3 );
1134
+ y [i ].qs [ 2 * j + 0 ] = vgetq_lane_s32 (vi , 0 );
1135
+ y [i ].qs [16 + 2 * j + 0 ] = vgetq_lane_s32 (vi , 1 );
1136
+ y [i ].qs [ 2 * j + 1 ] = vgetq_lane_s32 (vi , 2 );
1137
+ y [i ].qs [16 + 2 * j + 1 ] = vgetq_lane_s32 (vi , 3 );
1136
1138
1137
1139
accv0 = vaddq_s32 (accv0 , vi );
1138
1140
}
@@ -1142,10 +1144,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1142
1144
const float32x4_t v = vmulq_n_f32 (srcv [j ], id );
1143
1145
const int32x4_t vi = vcvtnq_s32_f32 (v );
1144
1146
1145
- y [i ].qs [4 * j + 0 ] = vgetq_lane_s32 (vi , 0 );
1146
- y [i ].qs [4 * j + 1 ] = vgetq_lane_s32 (vi , 1 );
1147
- y [i ].qs [4 * j + 2 ] = vgetq_lane_s32 (vi , 2 );
1148
- y [i ].qs [4 * j + 3 ] = vgetq_lane_s32 (vi , 3 );
1147
+ y [i ].qs [ 2 * j + 0 ] = vgetq_lane_s32 (vi , 0 );
1148
+ y [i ].qs [16 + 2 * j + 0 ] = vgetq_lane_s32 (vi , 1 );
1149
+ y [i ].qs [ 2 * j + 1 ] = vgetq_lane_s32 (vi , 2 );
1150
+ y [i ].qs [16 + 2 * j + 1 ] = vgetq_lane_s32 (vi , 3 );
1149
1151
1150
1152
accv1 = vaddq_s32 (accv1 , vi );
1151
1153
}
@@ -1271,8 +1273,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
1271
1273
const int x0 = (x [i ].qs [j ] & 0x0F ) - 8 ;
1272
1274
const int x1 = (x [i ].qs [j ] >> 4 ) - 8 ;
1273
1275
1274
- y [i * qk + j + 0 ] = x0 * d ;
1275
- y [i * qk + j + qk / 2 ] = x1 * d ;
1276
+ y [i * qk + 2 * j + 0 ] = x0 * d ;
1277
+ y [i * qk + 2 * j + 1 ] = x1 * d ;
1276
1278
}
1277
1279
}
1278
1280
}
@@ -1292,8 +1294,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
1292
1294
const int x0 = (x [i ].qs [j ] & 0x0F );
1293
1295
const int x1 = (x [i ].qs [j ] >> 4 );
1294
1296
1295
- y [i * qk + j + 0 ] = x0 * d + m ;
1296
- y [i * qk + j + qk / 2 ] = x1 * d + m ;
1297
+ y [i * qk + 2 * j + 0 ] = x0 * d + m ;
1298
+ y [i * qk + 2 * j + 1 ] = x1 * d + m ;
1297
1299
}
1298
1300
}
1299
1301
}
@@ -1318,8 +1320,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict
1318
1320
const int32_t x0 = ((x [i ].qs [j ] & 0x0F ) | xh_0 ) - 16 ;
1319
1321
const int32_t x1 = ((x [i ].qs [j ] >> 4 ) | xh_1 ) - 16 ;
1320
1322
1321
- y [i * qk + j + 0 ] = x0 * d ;
1322
- y [i * qk + j + qk / 2 ] = x1 * d ;
1323
+ y [i * qk + 2 * j + 0 ] = x0 * d ;
1324
+ y [i * qk + 2 * j + 1 ] = x1 * d ;
1323
1325
}
1324
1326
}
1325
1327
}
@@ -1345,8 +1347,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict
1345
1347
const int x0 = (x [i ].qs [j ] & 0x0F ) | xh_0 ;
1346
1348
const int x1 = (x [i ].qs [j ] >> 4 ) | xh_1 ;
1347
1349
1348
- y [i * qk + j + 0 ] = x0 * d + m ;
1349
- y [i * qk + j + qk / 2 ] = x1 * d + m ;
1350
+ y [i * qk + 2 * j + 0 ] = x0 * d + m ;
1351
+ y [i * qk + 2 * j + 1 ] = x1 * d + m ;
1350
1352
}
1351
1353
}
1352
1354
}
@@ -1363,8 +1365,9 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
1363
1365
for (int i = 0 ; i < nb ; i ++ ) {
1364
1366
const float d = x [i ].d ;
1365
1367
1366
- for (int j = 0 ; j < qk ; ++ j ) {
1367
- y [i * qk + j ] = x [i ].qs [j ]* d ;
1368
+ for (int j = 0 ; j < qk /2 ; ++ j ) {
1369
+ y [i * qk + 2 * j + 0 ] = x [i ].qs [j + 0 ]* d ;
1370
+ y [i * qk + 2 * j + 1 ] = x [i ].qs [j + qk /2 ]* d ;
1368
1371
}
1369
1372
}
1370
1373
}
0 commit comments