Skip to content

x86_64: BFLOAT16: fix build warning #3389

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions kernel/x86_64/bf16_common_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]);
regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \
regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \
regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \
regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \
regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \
regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \
regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \
regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n]));


#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]);
regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \
regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \
regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda + idx_n])); \
regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \
regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \
regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \
regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \
regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n]));


#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \
Expand Down Expand Up @@ -153,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \
reg = _mm256_loadu_si256(x + idx_n);
reg = _mm256_loadu_si256((__m256i *)(x + idx_n));


#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \
reg = _mm_loadu_si128(x + idx_n);
reg = _mm_loadu_si128((__m128i *)(x + idx_n));


#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \
Expand Down
14 changes: 7 additions & 7 deletions kernel/x86_64/sbdot_microk_cooperlake.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
__m256 accum256_1 = _mm256_setzero_ps();
int tail_index_32 = n&(~31);
for (int j = 0; j < tail_index_32; j += 32) {
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0]));
accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0]));
accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16]));
}
accum256 = _mm256_add_ps(accum256, accum256_1);

/* Processing the remaining <32 chunk with 16-elements processing */
if ((n&16) != 0) {
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32]));
}
accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));

/* Processing the remaining <16 chunk with 8-elements processing */
if ((n&8) != 0) {
int tail_index_16 = n&(~15);
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16]));
}

/* Processing the remaining <8 chunk with masked 8-elements processing */
Expand All @@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
} else if (n > 15) { /* n range from 16 to 31 */
/* Processing <32 chunk with 16-elements processing */
__m256 accum256 = _mm256_setzero_ps();
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0]));
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0]));
accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));

/* Processing the remaining <16 chunk with 8-elements processing */
if ((n&8) != 0) {
int tail_index_16 = n&(~15);
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16]));
}

/* Processing the remaining <8 chunk with masked 8-elements processing */
Expand All @@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
}
} else if (n > 7) { /* n range from 8 to 15 */
/* Processing <16 chunk with 8-elements processing */
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0]));
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0]));

/* Processing the remaining <8 chunk with masked 8-elements processing */
if ((n&7) != 0) {
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/sbgemm_block_microk_cooperlake.c
Original file line number Diff line number Diff line change
Expand Up @@ -1246,7 +1246,7 @@ void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat
// K=Any number but will be processed based on 32, M<=16
void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3;
bfloat16 * src_addr0;
bfloat16 * dst_addr0, * dst_addr1;

BLASLONG tag_k_32x = k & (~31);
Expand Down
11 changes: 10 additions & 1 deletion kernel/x86_64/sbgemv_n_microk_cooperlake_template.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Include common macros for BF16 based operations with IA intrinsics
#include "bf16_common_macros.h"

#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef STORE8_COMPLETE_RESULT
#undef STORE8_MASK_COMPLETE_RESULT
#undef STORE4_COMPLETE_RESULT
#undef STORE4_MASK_COMPLETE_RESULT

#ifndef ZERO_BETA // Beta is non-zero

#ifndef ONE_BETA // BETA is not ONE
Expand Down Expand Up @@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
#ifndef ONE_BETA
__m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif
#endif

__m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3;
Expand Down Expand Up @@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf
unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31)));
__mmask32 tail_mask = *((__mmask32*) &tail_mask_value);

unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15)));
unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15)));
__mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value);

accum512_0 = _mm512_setzero_ps();
Expand Down
Loading