Skip to content

Commit ea8eec5

Browse files
authored
Merge pull request #2422 from wjc404/develop
Adjust SkylakeX GEMM3M parameters, add an AVX512 STRMM kernel and fix performance bugs in AVX2 s/c/z GEMM
2 parents: a9aeb67 + dd22eb7; commit ea8eec5

File tree

8 files changed: 857 additions (+857) and 134 deletions (-134)

kernel/x86_64/KERNEL.HASWELL

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ CAXPYKERNEL = caxpy.c
3232
ZAXPYKERNEL = zaxpy.c
3333

3434
STRMMKERNEL = sgemm_kernel_8x4_haswell.c
35-
SGEMMKERNEL = sgemm_kernel_8x4_haswell.c
35+
SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c
3636
SGEMM_BETA = sgemm_beta_skylakex.c
3737
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
3838
SGEMMITCOPY = ../generic/gemm_tcopy_8.c

kernel/x86_64/KERNEL.SKYLAKEX

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
include $(KERNELDIR)/KERNEL.HASWELL
22

33
SGEMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
4-
STRMMKERNEL = sgemm_kernel_16x4_haswell.S
4+
STRMMKERNEL = sgemm_kernel_16x4_skylakex_2.c
55
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
66
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
77
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c

kernel/x86_64/KERNEL.ZEN

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ CAXPYKERNEL = caxpy.c
3131
ZAXPYKERNEL = zaxpy.c
3232

3333
STRMMKERNEL = sgemm_kernel_8x4_haswell.c
34-
SGEMMKERNEL = sgemm_kernel_8x4_haswell.c
34+
SGEMMKERNEL = sgemm_kernel_8x4_haswell_2.c
3535
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
3636
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
3737
SGEMMONCOPY = ../generic/gemm_ncopy_4.c

kernel/x86_64/cgemm_kernel_8x2_haswell.c

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@
5050
"vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
5151
acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1)
5252
#define KERNEL_2_k1m8n4 \
53-
"vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\
53+
"vpermilps $177,-64(%0),%%ymm0; vpermilps $177,-32(%0),%%ymm1;"\
5454
acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1)
5555
#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2)
5656
#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2)
@@ -93,7 +93,6 @@
9393
"movq $10,%5; movq $84,%%r15;"\
9494
#ndim"8881:\n\t"\
9595
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
96-
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
9796
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
9897
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
9998
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\

kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c

Lines changed: 418 additions & 118 deletions
Large diffs are not rendered by default.

kernel/x86_64/sgemm_kernel_8x4_haswell_2.c

Lines changed: 424 additions & 0 deletions
Large diffs are not rendered by default.

kernel/x86_64/zgemm_kernel_4x2_haswell.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@
5050
"vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
5151
acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1)
5252
#define KERNEL_2_k1m4n4 \
53-
"vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\
53+
"vpermilpd $5,-64(%0),%%ymm0; vpermilpd $5,-32(%0),%%ymm1;"\
5454
acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1)
5555
#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2)
5656
#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2)
@@ -93,9 +93,9 @@
9393
"movq $10,%5; movq $84,%%r15;"\
9494
#ndim"4441:\n\t"\
9595
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
96-
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
96+
KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
9797
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
98-
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
98+
KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
9999
"addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\
100100
"movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\
101101
#ndim"4442:\n\t"\

param.h

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1722,16 +1722,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
17221722
#define XGEMM_DEFAULT_R xgemm_r
17231723
#define XGEMM_DEFAULT_Q 128
17241724

1725-
#define CGEMM3M_DEFAULT_UNROLL_N 8
1726-
#define CGEMM3M_DEFAULT_UNROLL_M 4
1727-
#define ZGEMM3M_DEFAULT_UNROLL_N 8
1728-
#define ZGEMM3M_DEFAULT_UNROLL_M 2
1725+
#define CGEMM3M_DEFAULT_UNROLL_N 4
1726+
#define CGEMM3M_DEFAULT_UNROLL_M 8
1727+
#define ZGEMM3M_DEFAULT_UNROLL_N 4
1728+
#define ZGEMM3M_DEFAULT_UNROLL_M 4
17291729

1730-
#define CGEMM3M_DEFAULT_P 448
1731-
#define ZGEMM3M_DEFAULT_P 224
1730+
#define CGEMM3M_DEFAULT_P 320
1731+
#define ZGEMM3M_DEFAULT_P 256
17321732
#define XGEMM3M_DEFAULT_P 112
1733-
#define CGEMM3M_DEFAULT_Q 224
1734-
#define ZGEMM3M_DEFAULT_Q 224
1733+
#define CGEMM3M_DEFAULT_Q 320
1734+
#define ZGEMM3M_DEFAULT_Q 256
17351735
#define XGEMM3M_DEFAULT_Q 224
17361736
#define CGEMM3M_DEFAULT_R 12288
17371737
#define ZGEMM3M_DEFAULT_R 12288

0 commit comments

Comments
 (0)