Skip to content

Commit f1a18d2

Browse files
authored
Merge pull request #2618 from craft-zhang/cortex-A53
Improve performance of SGEMM and STRMM on Arm Cortex-A53
2 parents 729ac6b + 2a3aa91 commit f1a18d2

File tree

5 files changed

+5381
-2
lines changed

5 files changed

+5381
-2
lines changed

CONTRIBUTORS.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,3 +187,6 @@ In chronological order:
187 187
* Marius Hillenbrand <https://github.com/mhillenibm>
188 188
* [2020-05-12] Revise dynamic architecture detection for IBM z
189 189
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14
190+
191+
* Danfeng Zhang <https://github.com/craft-zhang>
192+
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53

kernel/arm64/KERNEL.CORTEXA53

Lines changed: 192 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,194 @@
1-
include $(KERNELDIR)/KERNEL.ARMV8
# S/D AMIN kernels share ../arm/amin.c; C/Z AMIN kernels share ../arm/zamin.c.
# These are C sources referenced through a relative ../arm path.
1+
SAMINKERNEL = ../arm/amin.c
2+
DAMINKERNEL = ../arm/amin.c
3+
CAMINKERNEL = ../arm/zamin.c
4+
ZAMINKERNEL = ../arm/zamin.c
2 5

# SMAX and DMAX both use the same C source, ../arm/max.c.
6+
SMAXKERNEL = ../arm/max.c
7+
DMAXKERNEL = ../arm/max.c
3 8

# SMIN and DMIN both use the same C source, ../arm/min.c.
9+
SMINKERNEL = ../arm/min.c
10+
DMINKERNEL = ../arm/min.c
11+
# Index-returning AMIN kernels: the real types share ../arm/iamin.c,
# the complex types share ../arm/izamin.c.
12+
ISAMINKERNEL = ../arm/iamin.c
13+
IDAMINKERNEL = ../arm/iamin.c
14+
ICAMINKERNEL = ../arm/izamin.c
15+
IZAMINKERNEL = ../arm/izamin.c
16+
# ISMAX/IDMAX and ISMIN/IDMIN are set for the S and D prefixes only in this file.
17+
ISMAXKERNEL = ../arm/imax.c
18+
IDMAXKERNEL = ../arm/imax.c
19+
20+
ISMINKERNEL = ../arm/imin.c
21+
IDMINKERNEL = ../arm/imin.c
22+
# TRSM kernels: all four type prefixes (S, D, C, Z) point at the same four
# ../generic C sources, one per LN / LT / RN / RT variant.
23+
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
24+
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
25+
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
26+
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
27+
28+
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
29+
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
30+
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
31+
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
32+
33+
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
34+
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
35+
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
36+
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
37+
38+
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
39+
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
40+
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
41+
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
42+
# From here to SGEMM_BETA every kernel is an assembly (.S) file given by bare
# file name, with no directory prefix (unlike the ../arm and ../generic entries).
# NOTE(review): presumably these resolve against the directory of this kernel
# file -- confirm against the including makefile.
# In most groups the S/D prefixes share one file and the C/Z prefixes share a
# z-prefixed file; exceptions are noted inline.
43+
SAMAXKERNEL = amax.S
44+
DAMAXKERNEL = amax.S
45+
CAMAXKERNEL = zamax.S
46+
ZAMAXKERNEL = zamax.S
47+
48+
SAXPYKERNEL = axpy.S
49+
DAXPYKERNEL = axpy.S
50+
CAXPYKERNEL = zaxpy.S
51+
ZAXPYKERNEL = zaxpy.S
52+
53+
SROTKERNEL = rot.S
54+
DROTKERNEL = rot.S
55+
CROTKERNEL = zrot.S
56+
ZROTKERNEL = zrot.S
57+
58+
SSCALKERNEL = scal.S
59+
DSCALKERNEL = scal.S
60+
CSCALKERNEL = zscal.S
61+
ZSCALKERNEL = zscal.S
62+
63+
SGEMVNKERNEL = gemv_n.S
64+
DGEMVNKERNEL = gemv_n.S
65+
CGEMVNKERNEL = zgemv_n.S
66+
ZGEMVNKERNEL = zgemv_n.S
67+
68+
SGEMVTKERNEL = gemv_t.S
69+
DGEMVTKERNEL = gemv_t.S
70+
CGEMVTKERNEL = zgemv_t.S
71+
ZGEMVTKERNEL = zgemv_t.S
72+
73+
# ASUM: the two complex prefixes use different files (casum.S vs zasum.S).
74+
SASUMKERNEL = asum.S
75+
DASUMKERNEL = asum.S
76+
CASUMKERNEL = casum.S
77+
ZASUMKERNEL = zasum.S
78+
# COPY: all four prefixes use the same file, copy.S.
79+
SCOPYKERNEL = copy.S
80+
DCOPYKERNEL = copy.S
81+
CCOPYKERNEL = copy.S
82+
ZCOPYKERNEL = copy.S
83+
# SWAP: all four prefixes use the same file, swap.S.
84+
SSWAPKERNEL = swap.S
85+
DSWAPKERNEL = swap.S
86+
CSWAPKERNEL = swap.S
87+
ZSWAPKERNEL = swap.S
88+
89+
ISAMAXKERNEL = iamax.S
90+
IDAMAXKERNEL = iamax.S
91+
ICAMAXKERNEL = izamax.S
92+
IZAMAXKERNEL = izamax.S
93+
94+
SNRM2KERNEL = nrm2.S
95+
DNRM2KERNEL = nrm2.S
96+
CNRM2KERNEL = znrm2.S
97+
ZNRM2KERNEL = znrm2.S
98+
# DOT: DSDOT uses the same dot.S as SDOT and DDOT.
99+
DDOTKERNEL = dot.S
100+
SDOTKERNEL = dot.S
101+
CDOTKERNEL = zdot.S
102+
ZDOTKERNEL = zdot.S
103+
DSDOTKERNEL = dot.S
104+
# GEMM beta kernels are set for the D and S prefixes only in this file.
105+
DGEMM_BETA = dgemm_beta.S
106+
SGEMM_BETA = sgemm_beta.S
107+
# SGEMM / STRMM kernel selection.
# When SGEMM_UNROLL_M x SGEMM_UNROLL_N is exactly 8x8, the file names carry a
# _cortexa53 suffix; for any other pair the name is built from the two unroll
# factors alone. SGEMM_UNROLL_M / SGEMM_UNROLL_N are not defined in this file
# and must already be set when it is read.
108+
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
109+
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
110+
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
111+
else
112+
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
113+
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
114+
endif
# The "I" (INCOPY / ITCOPY) copy routines and their object names are assigned
# only when the M and N unroll factors differ. They are sized by
# SGEMM_UNROLL_M: tcopy is an assembly file only for 16 and ncopy only for 4;
# every other value falls back to the ../generic C source.
115+
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
116+
ifeq ($(SGEMM_UNROLL_M), 16)
117+
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
118+
else
119+
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
120+
endif
121+
ifeq ($(SGEMM_UNROLL_M), 4)
122+
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
123+
else
124+
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
125+
endif
126+
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
127+
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
128+
endif
# The "O" (ONCOPY / OTCOPY) copy routines are always assigned and are sized by
# SGEMM_UNROLL_N, with the same rule: assembly tcopy for 16, assembly ncopy
# for 4, ../generic C source otherwise.
129+
ifeq ($(SGEMM_UNROLL_N), 16)
130+
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
131+
else
132+
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
133+
endif
134+
ifeq ($(SGEMM_UNROLL_N), 4)
135+
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
136+
else
137+
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
138+
endif
# Object names are built from TSUFFIX and SUFFIX, which are defined outside
# this file.
139+
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
140+
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
141+
# DGEMM / DTRMM kernels: the assembly file name is built directly from
# DGEMM_UNROLL_M and DGEMM_UNROLL_N (no Cortex-A53-specific variant here).
142+
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
143+
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
144+
# The "I" copy routines are assigned only when the M and N unroll factors
# differ. Both ncopy and tcopy are assembly files when DGEMM_UNROLL_M is 8,
# otherwise the ../generic C sources are used.
145+
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
146+
147+
ifeq ($(DGEMM_UNROLL_M), 8)
148+
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
149+
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
150+
else
151+
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
152+
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
153+
endif
154+
155+
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
156+
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
157+
endif
158+
# The "O" copy routines are always assigned: assembly files when
# DGEMM_UNROLL_N is 4, ../generic C sources otherwise.
159+
ifeq ($(DGEMM_UNROLL_N), 4)
160+
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
161+
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
162+
else
163+
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
164+
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
165+
endif
166+
167+
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
168+
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
169+
# CGEMM / CTRMM kernels: assembly file name built from CGEMM_UNROLL_M and
# CGEMM_UNROLL_N. All CGEMM copy routines use the ../generic zgemm_* C sources
# (there is no assembly branch for them in this file).
170+
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
171+
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
# "I" copy routines are assigned only when the M and N unroll factors differ.
172+
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
173+
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
174+
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
175+
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
176+
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
177+
endif
# "O" copy routines are always assigned, sized by CGEMM_UNROLL_N.
178+
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
179+
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
180+
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
181+
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
182+
# ZGEMM / ZTRMM kernels: assembly file name built from ZGEMM_UNROLL_M and
# ZGEMM_UNROLL_N. All ZGEMM copy routines use the ../generic zgemm_* C sources.
183+
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
184+
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
# "I" copy routines are assigned only when the M and N unroll factors differ.
185+
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
186+
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
187+
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
188+
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
189+
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
190+
endif
# "O" copy routines are always assigned, sized by ZGEMM_UNROLL_N.
191+
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
192+
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
193+
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
194+
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

0 commit comments

Comments
 (0)