Skip to content

Adjust SkylakeX GEMM3M parameters, add an AVX512 STRMM kernel and fix performance bugs in AVX2 s/c/z GEMM #2422

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 12 commits merged on Feb 29, 2020
2 changes: 1 addition & 1 deletion kernel/x86_64/KERNEL.HASWELL
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c

STRMMKERNEL = sgemm_kernel_8x4_haswell.c
SGEMMKERNEL = sgemm_kernel_8x4_haswell.c
SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c
SGEMM_BETA = sgemm_beta_skylakex.c
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/KERNEL.SKYLAKEX
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
include $(KERNELDIR)/KERNEL.HASWELL

SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
STRMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/KERNEL.ZEN
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c

STRMMKERNEL = sgemm_kernel_8x4_haswell.c
SGEMMKERNEL = sgemm_kernel_8x4_haswell.c
SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
Expand Down
3 changes: 1 addition & 2 deletions kernel/x86_64/cgemm_kernel_8x2_haswell.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1)
#define KERNEL_2_k1m8n4 \
"vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\
"vpermilps $177,-64(%0),%%ymm0; vpermilps $177,-32(%0),%%ymm1;"\
acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1)
#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2)
#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2)
Expand Down Expand Up @@ -93,7 +93,6 @@
"movq $10,%5; movq $84,%%r15;"\
#ndim"8881:\n\t"\
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
Expand Down
536 changes: 418 additions & 118 deletions kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c

Large diffs are not rendered by default.

424 changes: 424 additions & 0 deletions kernel/x86_64/sgemm_kernel_8x4_haswell_2.c

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions kernel/x86_64/zgemm_kernel_4x2_haswell.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1)
#define KERNEL_2_k1m4n4 \
"vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\
"vpermilpd $5,-64(%0),%%ymm0; vpermilpd $5,-32(%0),%%ymm1;"\
acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1)
#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2)
#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2)
Expand Down Expand Up @@ -93,9 +93,9 @@
"movq $10,%5; movq $84,%%r15;"\
#ndim"4441:\n\t"\
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
"addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\
"movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\
#ndim"4442:\n\t"\
Expand Down
16 changes: 8 additions & 8 deletions param.h
Original file line number Diff line number Diff line change
Expand Up @@ -1722,16 +1722,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_R xgemm_r
#define XGEMM_DEFAULT_Q 128

#define CGEMM3M_DEFAULT_UNROLL_N 8
#define CGEMM3M_DEFAULT_UNROLL_M 4
#define ZGEMM3M_DEFAULT_UNROLL_N 8
#define ZGEMM3M_DEFAULT_UNROLL_M 2
#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 4
#define ZGEMM3M_DEFAULT_UNROLL_M 4

#define CGEMM3M_DEFAULT_P 448
#define ZGEMM3M_DEFAULT_P 224
#define CGEMM3M_DEFAULT_P 320
#define ZGEMM3M_DEFAULT_P 256
#define XGEMM3M_DEFAULT_P 112
#define CGEMM3M_DEFAULT_Q 224
#define ZGEMM3M_DEFAULT_Q 224
#define CGEMM3M_DEFAULT_Q 320
#define ZGEMM3M_DEFAULT_Q 256
#define XGEMM3M_DEFAULT_Q 224
#define CGEMM3M_DEFAULT_R 12288
#define ZGEMM3M_DEFAULT_R 12288
Expand Down