@@ -3855,7 +3855,7 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif
 
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -3866,8 +3866,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     assert(nrc == 1);
 #endif
     UNUSED(nrc);
-    UNUSED(bbx);
-    UNUSED(bby);
+    UNUSED(bx);
+    UNUSED(by);
     UNUSED(bs);
 
     const block_q4_0 * restrict x = vx;
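The renames are mechanical, not behavioral: the row-stride parameters return to their shorter names bx/by, and every SIMD temporary that was called bx/by becomes bx_0/by_0 in the hunks below, so the locals no longer shadow the parameters. A minimal sketch of the shadowing this sidesteps, with hypothetical names that are not from ggml.c:

#include <stddef.h>

/* A block-scoped local that reuses a parameter's name hides the parameter
 * for the rest of that block; -Wshadow reports it, and a later edit that
 * means the stride can silently pick up the local instead. */
static float sum_rows_sketch(int n, const float * x, size_t bx /* row stride */) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        float bx = x[i];   // warning: declaration of 'bx' shadows a parameter
        sum += bx;         // 'bx' here is the float local, not the stride
    }
    return sum;            // renaming either side (bx -> bx_0) removes the clash
}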
@@ -4024,15 +4024,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
         const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
 
-        __m128i bx = _mm_and_si128(lowMask, tmp);
-        __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx = _mm_sub_epi8(bx, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
 
-        bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
-        by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx = _mm_sub_epi8(bx, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
+        bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
+        by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
 
         // Convert int32_t to float
         __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
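For reference, the SSE hunk above computes the following per-block scalar math: the 16 bytes of x[i].qs pack 32 4-bit quants (low nibbles are elements 0..15, high nibbles elements 16..31), each offset by 8. A scalar sketch using the surrounding names, as an illustration rather than the reference path verbatim:

// One q4_0 x q8_0 block (qk == 32), scalar form of the two
// mul_sum_i8_pairs(...) calls above:
int32_t sumi = 0;
for (int j = 0; j < 16; j++) {
    const int v0 = (x[i].qs[j] & 0x0F) - 8;  // low nibble,  elements 0..15
    const int v1 = (x[i].qs[j] >>   4) - 8;  // high nibble, elements 16..31
    sumi += v0 * y[i].qs[j] + v1 * y[i].qs[j + 16];
}
// The int32 -> float conversion and the combined d*d scale then fold
// sumi into the running accumulator.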
@@ -4222,7 +4222,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #endif
 }
 
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
@@ -4233,8 +4233,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     assert(nrc == 1);
 #endif
     UNUSED(nrc);
-    UNUSED(bbx);
-    UNUSED(bby);
+    UNUSED(bx);
+    UNUSED(by);
     UNUSED(bs);
 
     const block_q4_1 * restrict x = vx;
@@ -4440,16 +4440,16 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 #endif
 }
 
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
     assert(qk == QK5_0);
     assert(nrc == 1);
     UNUSED(nrc);
-    UNUSED(bbx);
-    UNUSED(bby);
+    UNUSED(bx);
+    UNUSED(by);
     UNUSED(bs);
 
     const block_q5_0 * restrict x = vx;
@@ -4618,21 +4618,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
         __m128i bxhil = _mm256_castsi256_si128(bxhi);
         __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
         bxhil = _mm_andnot_si128(bxhil, mask);
         bxhih = _mm_andnot_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = MM256_SET_M128I(bxh, bxl);
+        bx_0 = MM256_SET_M128I(bxh, bxl);
 
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
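In the q5_0 hunk the fifth bit of each quant lives in x[i].qh: bytes_from_bits_32 spreads those 32 bits across 32 bytes, and the andnot/or steps splice them into bit 4 of the unpacked nibbles. In scalar terms each element is reconstructed roughly as follows, a sketch of the q5_0 format with qh standing for x[i].qh read as a 32-bit mask, not the verbatim reference path:

// Element j (low nibbles, 0..15) and j+16 (high nibbles) of one q5_0 block:
const int x0 = ((x[i].qs[j] & 0x0F) | (((qh >> (j +  0)) & 1) << 4)) - 16;
const int x1 = ((x[i].qs[j] >>   4) | (((qh >> (j + 16)) & 1) << 4)) - 16;
// i.e. a 5-bit unsigned value in 0..31, recentered to -16..15 before the dot.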
@@ -4731,16 +4731,16 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #endif
 }
 
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
     assert(n % qk == 0);
     assert(qk == QK5_1);
     assert(nrc == 1);
     UNUSED(nrc);
-    UNUSED(bbx);
-    UNUSED(bby);
+    UNUSED(bx);
+    UNUSED(by);
     UNUSED(bs);
 
     const block_q5_1 * restrict x = vx;
@@ -4925,22 +4925,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
         summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
         const __m256i bxhi = bytes_from_bits_32(x[i].qh);
         __m128i bxhil = _mm256_castsi256_si128(bxhi);
         __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
         bxhil = _mm_and_si128(bxhil, mask);
         bxhih = _mm_and_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = MM256_SET_M128I(bxh, bxl);
+        bx_0 = MM256_SET_M128I(bxh, bxl);
 
         const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
+        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
 
         acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
     }
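The q5_1 hunk splices the fifth bits the same way but with two visible differences: the quants stay unsigned (0..31, no recentering, hence _mm_and_si128 rather than the andnot used for q5_0), so the dot uses mul_sum_us8_pairs_float on unsigned-by-signed pairs, and the per-block minimum enters through the separate summs term. A block-level sketch, under the assumption that y[i].s stores d_y times the sum of y's quants as produced by the q8_1 quantizer:

// Dequantized x_j = d_x * q_j + m_x, with q_j in 0..31, so per block:
//   dot += d_x * d_y * sum_j(q_j * q'_j)  // the mul_sum_us8_pairs_float part
//        + m_x * y[i].s                   // the summs term accumulated above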
@@ -5035,7 +5035,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 #endif
 }
 
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bbx, const void * restrict vy, size_t bby, int nrc) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
@@ -5046,8 +5046,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     assert(nrc == 1);
 #endif
     UNUSED(nrc);
-    UNUSED(bbx);
-    UNUSED(bby);
+    UNUSED(bx);
+    UNUSED(by);
     UNUSED(bs);
 
     const block_q8_0 * restrict x = vx;
@@ -5169,10 +5169,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; i++) {
         // load elements
-        vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
-        vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
+        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
 
-        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
 
         vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
         vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
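Renames aside, the RISC-V vector hunk is a plain widening dot product over one block; a scalar equivalent, with vl spanning the qk == 32 int8 elements:

// What the RVV sequence above computes per q8_0/q8_0 block:
int32_t sumi = 0;
for (int j = 0; j < qk; j++) {
    sumi += x[i].qs[j] * y[i].qs[j];  // __riscv_vwmul_vv_i16m2: i8 * i8 -> i16
}
// __riscv_vwredsum_vs_i16m2_i32m1 then reduces the widened products into a
// single i32 lane (v_sum), seeded with v_zero.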