diff --git a/Makefile.arm64 b/Makefile.arm64 index b98933b77a..3d713b853a 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -303,6 +303,11 @@ FCOMMON_OPT += -march=armv8.3-a endif endif +ifeq ($(CORE), VORTEXM4) +CCOMMON_OPT += -march=armv8.4-a+sme +FCOMMON_OPT += -march=armv8.4-a+sme +endif + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/Makefile.system b/Makefile.system index 3f1c48d23d..cd9213365e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -427,7 +427,7 @@ ifndef MACOSX_DEPLOYMENT_TARGET ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 export NO_SVE = 1 -export NO_SME = 1 +# export NO_SME = 1 else export MACOSX_DEPLOYMENT_TARGET=10.8 endif @@ -723,6 +723,7 @@ DYNAMIC_CORE += A64FX endif ifneq ($(NO_SME), 1) DYNAMIC_CORE += ARMV9SME +DYNAMIC_CORE += VORTEXM4 endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 diff --git a/TargetList.txt b/TargetList.txt index b890c1440d..4903261e2c 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -111,6 +111,7 @@ THUNDERX2T99 TSV110 THUNDERX3T110 VORTEX +VORTEXM4 A64FX ARMV8SVE ARMV9SME diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d9a7aafd62..a312ef814e 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -39,14 +39,14 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) endif () if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 - set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4) endif() elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) endif () - if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 - set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19 OR (${CMAKE_C_COMPILER_ID} MATCHES AppleClang AND ${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 17) ) # SME ACLE supported in LLVM >= 19 and AppleClang >= 17 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME VORTEXM4) endif() endif () if (DYNAMIC_LIST) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 952b2dd7ad..10ad9388d6 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -315,6 +315,16 @@ if (${CORE} STREQUAL ARMV9SME) endif () endif () +if (${CORE} STREQUAL VORTEXM4) + if (NOT DYNAMIC_ARCH) + if (${CMAKE_C_COMPILER_ID} STREQUAL "NVC" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -tp=host") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sme") + endif () + endif () +endif () + if (${CORE} STREQUAL CORTEXA510) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index eea5eb5ede..8b0fda8178 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1252,7 +1252,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "VORTEX") + elseif ("${TCORE}" STREQUAL "VORTEX" OR "${TCORE}" STREQUAL "VORTEXM4") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" "#define L1_CODE_SIZE\t32768\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index bf4c548b92..7ce5b82abb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -361,6 +361,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) if (${TARGET} STREQUAL ARMV9SME) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} 
-march=armv9-a+sme -O3") endif() + if (${TARGET} STREQUAL VORTEXM4) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sme -O3") + endif() if (${TARGET} STREQUAL A64FX) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index dd0dfab637..04e07cc135 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -142,7 +142,7 @@ endif() if (ARM64) if (NOT NO_SME) file(WRITE ${PROJECT_BINARY_DIR}/sme.c ".text \n.global sme_test\n\nsme_test:\nsmstart\nsmstop\nret\n") - execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv9-a+sve2+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) + execute_process(COMMAND ${CMAKE_C_COMPILER} -march=armv8.4-a+sme -c -v -o ${PROJECT_BINARY_DIR}/sme.o ${PROJECT_BINARY_DIR}/sme.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_SME) if (NO_SME EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_SME") endif() diff --git a/common_param.h b/common_param.h index 0145f667a1..d8298a0057 100644 --- a/common_param.h +++ b/common_param.h @@ -257,6 +257,7 @@ int (*shgemm_otcopy )(BLASLONG, BLASLONG, hfloat16 *, BLASLONG, hfloat16 *); #ifdef ARCH_ARM64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); void (*sgemm_direct_alpha_beta) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float, float * , BLASLONG); + int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); #endif diff --git a/common_s.h b/common_s.h index 88b4732f51..513906ad50 100644 --- a/common_s.h +++ b/common_s.h @@ -217,7 +217,7 @@ #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant #define SGEMM_DIRECT gotoblas -> sgemm_direct #elif ARCH_ARM64 -#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant #define SGEMM_DIRECT gotoblas -> sgemm_direct #define SGEMM_DIRECT_ALPHA_BETA gotoblas -> sgemm_direct_alpha_beta #endif diff --git a/cpuid_arm64.c b/cpuid_arm64.c index fd6a9bd478..0f1a125745 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -82,6 +82,7 @@ size_t length64=sizeof(value64); #define CPU_AMPERE1 25 // Apple #define CPU_VORTEX 13 +#define CPU_VORTEXM4 26 // Fujitsu #define CPU_A64FX 15 // Phytium @@ -113,7 +114,8 @@ static char *cpuname[] = { "FT2000", "CORTEXA76", "NEOVERSEV2", - "AMPERE1" + "AMPERE1", + "VORTEXM4", }; static char *cpuname_lower[] = { @@ -143,7 +145,7 @@ static char *cpuname_lower[] = { "cortexa76", "neoversev2", "ampere1", - "ampere1a" + "vortexm4" }; static int cpulowperf=0; @@ -400,7 +402,7 @@ int detect(void) if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 - if (value64 == 1867590060) return CPU_VORTEX; //M4 + if (value64 == 1867590060) return CPU_VORTEXM4; //M4 #else #ifdef OS_WINDOWS HKEY reghandle; @@ -740,6 +742,27 @@ void get_cpuconfig(void) length64 = sizeof(value64); sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); printf("#define L2_SIZE %lld \n",value64); +#endif + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; + case CPU_VORTEXM4: + printf("#define VORTEXM4 \n"); + printf("#define HAVE_SME 1 \n"); +#ifdef __APPLE__ + length64 = sizeof(value64); + 
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_SIZE %lld \n",value64); + length64 = sizeof(value64); + sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_LINESIZE %lld \n",value64); + printf("#define L1_DATA_LINESIZE %lld \n",value64); + length64 = sizeof(value64); + sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); + printf("#define L1_DATA_SIZE %lld \n",value64); + length64 = sizeof(value64); + sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); + printf("#define L2_SIZE %lld \n",value64); #endif printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 70b51f6fce..0202a7368b 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -128,6 +128,12 @@ extern gotoblas_t gotoblas_ARMV9SME; #else #define gotoblas_ARMV9SME gotoblas_ARMV8 #endif +#ifdef DYN_VORTEXM4 +extern gotoblas_t gotoblas_VORTEXM4; +#else +#error "dont have vortexm4" +#define gotoblas_VORTEXM4 gotoblas_ARMV8 +#endif #ifdef DYN_CORTEXA55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -155,17 +161,22 @@ extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; extern gotoblas_t gotoblas_ARMV8SVE; extern gotoblas_t gotoblas_A64FX; -#ifndef NO_SME -extern gotoblas_t gotoblas_ARMV9SME; -#else -#define gotoblas_ARMV9SME gotoblas_ARMV8SVE -#endif #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_ARMV8SVE gotoblas_ARMV8 #define gotoblas_A64FX gotoblas_ARMV8 -#define gotoblas_ARMV9SME gotoblas_ARMV8 +#endif +#ifndef NO_SME +extern gotoblas_t gotoblas_ARMV9SME; +extern gotoblas_t gotoblas_VORTEXM4; +#else +#ifndef NO_SVE +#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#else +#define gotoblas_ARMV9SME gotoblas_NEOVERSEN1 +#endif +#define gotoblas_VORTEXM4 gotoblas_NEOVERSEN1 #endif extern gotoblas_t gotoblas_THUNDERX3T110; @@ -176,7 +187,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. 
OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 19 +#define NUM_CORETYPES 20 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -216,6 +227,7 @@ static char *corename[] = { "armv8sve", "a64fx", "armv9sme", + "vortexm4", "unknown" }; @@ -239,6 +251,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; if (gotoblas == &gotoblas_A64FX) return corename[17]; if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; + if (gotoblas == &gotoblas_VORTEXM4) return corename[19]; return corename[NUM_CORETYPES]; } @@ -277,6 +290,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 16: return (&gotoblas_ARMV8SVE); case 17: return (&gotoblas_A64FX); case 18: return (&gotoblas_ARMV9SME); + case 19: return (&gotoblas_VORTEXM4); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -288,11 +302,11 @@ static gotoblas_t *get_coretype(void) { char coremsg[128]; #if defined (OS_DARWIN) -//future #if !defined(NO_SME) -// if (support_sme1()) { -// return &gotoblas_ARMV9SME; -// } -// #endif +#if !defined(NO_SME) + if (support_sme1()) { + return &gotoblas_VORTEXM4; + } +#endif return &gotoblas_NEOVERSEN1; #endif @@ -463,7 +477,7 @@ static gotoblas_t *get_coretype(void) { } break; case 0x61: // Apple -//future if (support_sme1()) return &gotoblas_ARMV9SME; + if (support_sme1()) return &gotoblas_VORTEXM4; return &gotoblas_NEOVERSEN1; break; default: diff --git a/getarch.c b/getarch.c index 417a3d08ca..72097ada97 100644 --- a/getarch.c +++ b/getarch.c @@ -1654,6 +1654,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "VORTEX" #endif +#ifdef FORCE_VORTEXM4 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "VORTEXM4" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DVORTEXM4 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SME -DARMV8" +#define LIBNAME "vortexm4" +#define CORENAME "VORTEXM4" +#endif + #ifdef FORCE_A64FX #define ARMV8 #define FORCE diff --git a/interface/gemm.c b/interface/gemm.c index c5182c266a..52c16fc0ab 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -266,6 +266,7 @@ void NAME(char *TRANSA, char *TRANSB, int transa, transb, nrowa, nrowb; blasint info; + int order = -1; char transA, transB; IFLOAT *buffer; @@ -424,30 +425,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16) -#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) -#if defined(DYNAMIC_ARCH) - if (support_avx512() ) -#endif - if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { - SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); - return; - } -#endif -#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH)) -#if defined(DYNAMIC_ARCH) - if (support_sme1()) -#endif - if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { - SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); - return; - }else if (order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans) { - 
SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    return;
-  }
-#endif
-#endif
-
 #ifndef COMPLEX
   args.alpha = (void *)&alpha;
   args.beta  = (void *)&beta;
@@ -564,6 +541,36 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
     return;
   }
 
+
+  if ((args.m == 0) || (args.n == 0)) return;
+#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && !defined(HFLOAT16)
+#if defined(ARCH_x86) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
+#if defined(DYNAMIC_ARCH)
+  if (support_avx512() )
+#endif
+  if (order == CblasRowMajor && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
+    SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
+    return;
+  }
+#endif
+#if defined(ARCH_ARM64) && (defined(USE_SGEMM_KERNEL_DIRECT)||defined(DYNAMIC_ARCH))
+#if defined(DYNAMIC_ARCH)
+  if (strcmp(gotoblas_corename(), "armv9sme") == 0 || strcmp(gotoblas_corename(), "vortexm4") == 0)
+// if (support_sme1())
+#endif
+  if (order == CblasRowMajor && m == lda && n == ldb && k == ldc && beta == 0 && alpha == 1.0 && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
+    SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc);
+    return;
+  }
+  else
+  if (order == CblasRowMajor && m == lda && n == ldb && k == ldc && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) {
+    SGEMM_DIRECT_ALPHA_BETA(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    return;
+  }
+
+#endif
+#endif
+
 #endif
 
 #if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index a2e349d32d..b4ea62f0d5 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -241,7 +241,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
   if (X86_64 OR ARM64)
     set(USE_DIRECT_SGEMM true)
   endif()
-  if (UC_TARGET_CORE MATCHES ARMV9SME)
+  if (UC_TARGET_CORE MATCHES ARMV9SME OR UC_TARGET_CORE MATCHES VORTEXM4)
     set (HAVE_SME true)
   endif ()
 
@@ -254,14 +254,16 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
       GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
       GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
     elseif (ARM64)
+      set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c)
       set (SGEMMDIRECTKERNEL sgemm_direct_arm64_sme1.c)
       set (SGEMMDIRECTKERNEL_ALPHA_BETA sgemm_direct_alpha_beta_arm64_sme1.c)
-      set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1.S)
+      set (SGEMMDIRECTSMEKERNEL sgemm_direct_sme1_2VLx2VL.S)
       set (SGEMMDIRECTPREKERNEL sgemm_direct_sme1_preprocess.S)
+      GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
       GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE)
       GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL_ALPHA_BETA}" "" "gemm_direct_alpha_beta" false "" "" false SINGLE)
       if (HAVE_SME)
-        GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1" false "" "" false SINGLE)
+        GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTSMEKERNEL}" "" "gemm_direct_sme1_2VLx2VL" false "" "" false SINGLE)
         GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPREKERNEL}" "" "gemm_direct_sme1_preprocess" false "" "" false SINGLE)
       endif ()
     endif ()
diff --git a/kernel/Makefile b/kernel/Makefile
index 84cd482a06..7b96244ece 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -29,6 +29,9 @@ ifdef
TARGET_CORE ifeq ($(TARGET_CORE), ARMV9SME) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv9-a+sve2+sme endif +ifeq ($(TARGET_CORE), VORTEXM4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -DHAVE_SME -march=armv8.4-a+sme +endif ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 79c88d76c7..b117bf1100 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -131,8 +131,12 @@ ifeq ($(ARCH), arm64) ifeq ($(TARGET_CORE), ARMV9SME) HAVE_SME = 1 endif +ifeq ($(TARGET_CORE), VORTEXM4) +HAVE_SME = 1 +endif SGEMMDIRECTKERNEL = sgemm_direct_arm64_sme1.c SGEMMDIRECTKERNEL_ALPHA_BETA = sgemm_direct_alpha_beta_arm64_sme1.c +SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c endif endif endif @@ -209,11 +213,12 @@ SKERNELOBJS += \ endif ifeq ($(ARCH), arm64) SKERNELOBJS += \ + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) \ sgemm_direct$(TSUFFIX).$(SUFFIX) \ sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) ifdef HAVE_SME SKERNELOBJS += \ - sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_sme1_2VLx2VL$(TSUFFIX).$(SUFFIX) \ sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) endif endif @@ -969,13 +974,15 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif ifeq ($(ARCH), arm64) +$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)sgemm_direct_alpha_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL_ALPHA_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ ifdef HAVE_SME -$(KDIR)sgemm_direct_sme1$(TSUFFIX).$(SUFFIX) : - $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1.S -UDOUBLE -UCOMPLEX -o $@ +$(KDIR)sgemm_direct_sme1_2VLx2VL$(TSUFFIX).$(SUFFIX) : + $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_2VLx2VL.S -UDOUBLE -UCOMPLEX -o $@ $(KDIR)sgemm_direct_sme1_preprocess$(TSUFFIX).$(SUFFIX) : $(CC) $(CFLAGS) -c $(KERNELDIR)/sgemm_direct_sme1_preprocess.S -UDOUBLE -UCOMPLEX -o $@ endif diff --git a/kernel/arm64/KERNEL.VORTEXM4 b/kernel/arm64/KERNEL.VORTEXM4 new file mode 100644 index 0000000000..46a34469c3 --- /dev/null +++ b/kernel/arm64/KERNEL.VORTEXM4 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.NEOVERSEN1 diff --git a/kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c b/kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c index d9de3ace3f..f2de509c77 100644 --- a/kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c +++ b/kernel/arm64/sgemm_direct_alpha_beta_arm64_sme1.c @@ -14,9 +14,17 @@ #include #endif +#if defined(DYNAMIC_ARCH) +#define COMBINE(a,b) a ## b +#define COMBINE2(a,b) COMBINE(a,b) +#define SME1_PREPROCESS_BASE sgemm_direct_sme1_preprocess +#define SME1_PREPROCESS COMBINE2(SME1_PREPROCESS_BASE,TS) +#else +#define SME1_PREPROCESS sgemm_direct_sme1_preprocess +#endif /* Function prototypes */ -extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\ - const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess"); +extern void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\ + const float * restrict a, float * a_mod); /* Function Definitions */ static uint64_t sve_cntw() { @@ -99,10 +107,11 @@ kernel_2x2(const float *A, const 
float *B, float *C, size_t shared_dim, svst1_hor_za32(/*tile*/2, /*slice*/i, pg_c_0, &C[i * ldc]); svst1_hor_za32(/*tile*/3, /*slice*/i, pg_c_1, &C[i * ldc + svl]); } +return; } __arm_new("za") __arm_locally_streaming -void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ +static void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ const float *ba, const float *restrict bb, const float* beta,\ float *restrict C) { @@ -125,6 +134,7 @@ void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, co // Block over row dimension of C for (; row_idx < num_rows; row_idx += row_batch) { row_batch = MIN(row_batch, num_rows - row_idx); + uint64_t col_idx = 0; uint64_t col_batch = 2*svl; @@ -143,7 +153,7 @@ void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, co #else void sgemm_direct_alpha_beta_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n, const float* alpha,\ const float *ba, const float *restrict bb, const float* beta,\ - float *restrict C){} + float *restrict C){fprintf(stderr,"empty sgemm_alpha_beta2x2 should never get called!!!\n");} #endif /*void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K,\ @@ -166,25 +176,27 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict * of reading directly from vector (z) registers. * */ asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", - "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", - "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31","za"); /* Pre-process the left matrix to make it suitable for matrix sum of outer-product calculation */ - sgemm_direct_sme1_preprocess(M, K, A, A_mod); + + SME1_PREPROCESS(M, K, A, A_mod); asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", - "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15","d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", - "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "za"); /* Calculate C = alpha*A*B + beta*C */ + sgemm_direct_alpha_beta_sme1_2VLx2VL(M, K, N, &alpha, A_mod, B, &beta, R); free(A_mod); @@ -194,6 +206,6 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float alpha, float * __restrict A,\ BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\ - float beta, float * __restrict R, BLASLONG strideR){} + float beta, float * __restrict R, BLASLONG strideR){fprintf(stderr,"empty sgemm_direct_alpha_beta should not be called!!!\n");} #endif diff --git a/kernel/arm64/sgemm_direct_arm64_sme1.c b/kernel/arm64/sgemm_direct_arm64_sme1.c index 13c337a13e..8c1b398186 100644 --- a/kernel/arm64/sgemm_direct_arm64_sme1.c +++ b/kernel/arm64/sgemm_direct_arm64_sme1.c @@ -8,17 +8,28 @@ #include #include #if defined(HAVE_SME) - +#if defined(DYNAMIC_ARCH) +#define COMBINE(a,b) a ## b +#define COMBINE2(a,b) 
COMBINE(a,b)
+#define SME1_PREPROCESS_BASE sgemm_direct_sme1_preprocess
+#define SME1_PREPROCESS COMBINE2(SME1_PREPROCESS_BASE,TS)
+#define SME1_DIRECT2X2_BASE sgemm_direct_sme1_2VLx2VL
+#define SME1_DIRECT2X2 COMBINE2(SME1_DIRECT2X2_BASE,TS)
+#else
+#define SME1_PREPROCESS sgemm_direct_sme1_preprocess
+#define SME1_DIRECT2X2 sgemm_direct_sme1_2VLx2VL
+#endif
 /* Function prototypes */
-extern void sgemm_direct_sme1_preprocess(uint64_t nbr, uint64_t nbc,\
-                            const float * restrict a, float * a_mod) __asm__("sgemm_direct_sme1_preprocess");
-extern void sgemm_direct_sme1_2VLx2VL(uint64_t m, uint64_t k, uint64_t n,\
+extern void SME1_PREPROCESS(uint64_t nbr, uint64_t nbc,\
+                            const float * restrict a, float * a_mod) ;
+
+extern void SME1_DIRECT2X2(uint64_t m, uint64_t k, uint64_t n,\
                             const float * matLeft,\
                             const float * restrict matRight,\
-                            const float * restrict matResult) __asm__("sgemm_direct_sme1_2VLx2VL");
+                            const float * restrict matResult) ;
 
 /* Function Definitions */
-uint64_t sve_cntw() {
+static uint64_t sve_cntw() {
     uint64_t cnt;
     asm volatile(
         "rdsvl %[res], #1\n"
@@ -39,7 +50,6 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
   uint64_t m_mod, vl_elms;
 
   vl_elms = sve_cntw();
-
   m_mod = ceil((double)M/(double)vl_elms) * vl_elms;
 
   float *A_mod = (float *) malloc(m_mod*K*sizeof(float));
@@ -48,7 +58,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
    * of reading directly from vector (z) registers.
    * */
   asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
-                       "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
+                       "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
                        "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
                        "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
                        "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
@@ -57,13 +67,13 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
 
   /* Pre-process the left matrix to make it suitable for
      matrix sum of outer-product calculation */
-  sgemm_direct_sme1_preprocess(M, K, A, A_mod);
+  SME1_PREPROCESS(M, K, A, A_mod);
 
   /* Calculate C = A*B */
-  sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R);
+  SME1_DIRECT2X2(M, K, N, A_mod, B, R);
 
   asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
-                       "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
+                       "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
                        "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
                        "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
                        "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
@@ -75,6 +85,8 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
 void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
             BLASLONG strideA, float * __restrict B, BLASLONG strideB ,\
-            float * __restrict R, BLASLONG strideR){}
-
+            float * __restrict R, BLASLONG strideR){
+fprintf(stderr,"EMPTY sgemm_kernel_direct should never be called \n");
+}
 #endif
+
diff --git a/kernel/arm64/sgemm_direct_performant.c b/kernel/arm64/sgemm_direct_performant.c
new file mode 100644
index 0000000000..c5c42bccca
--- /dev/null
+++ b/kernel/arm64/sgemm_direct_performant.c
@@ -0,0 +1,31 @@
+#include "common.h"
+/* helper for the direct sgemm code written by Arjan van der Ven */
+
+
+
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K)
+{
+	if (M < 3 || M % 2 == 1) return 0;
+	unsigned long long mnk = M * N * K;
+	/* large matrices -> not performant */
+	if (mnk >= 28 * 512 * 512)
+		return 0;
+
+	/*
+	 * if the B matrix is not a nice multiple of 4 we get many unaligned accesses,
+	 * and the regular sgemm copy/realignment of data pays off much quicker
+	 */
+	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
+		return 0;
+
+#ifdef SMP
+	/* if we can run multithreaded, the threading changes the base threshold */
+	if (mnk > 2 * 350 * 512 && num_cpu_avail(3) > 1)
+		return 0;
+#endif
+
+	return 1;
+}
+
+
diff --git a/kernel/arm64/sgemm_direct_sme1.S b/kernel/arm64/sgemm_direct_sme1_2VLx2VL.S
similarity index 95%
rename from kernel/arm64/sgemm_direct_sme1.S
rename to kernel/arm64/sgemm_direct_sme1_2VLx2VL.S
index 8c0a173f3d..afb662c1fb 100644
--- a/kernel/arm64/sgemm_direct_sme1.S
+++ b/kernel/arm64/sgemm_direct_sme1_2VLx2VL.S
@@ -35,16 +35,17 @@
 #define K_exit   x15 //Exit condition for K loop
 #define M_cntr   x16 //M loop counter
 #define C1       x17 //Constant1: N*(SVLs+1);SVLs-No. of 32-bit elements
-#define C2       x18 //Constant2: N + SVLs
-#define C3       x19 //Constant3: K*SVLs + SVLs
-#define C4       x20 //Constant4: SVLs-2
-#define C5       x21 //Constant5: K*SVLs
-#define C6       x22 //Constant6: N*SVLs
+#define C2       x19 //Constant2: N + SVLs
+#define C3       x20 //Constant3: K*SVLs + SVLs
+#define C4       x21 //Constant4: SVLs-2
+#define C5       x22 //Constant5: K*SVLs
+#define C6       x23 //Constant6: N*SVLs
 
         .text
-        .global sgemm_direct_sme1_2VLx2VL
+        .global ASMNAME
 
-        sgemm_direct_sme1_2VLx2VL:
+        ASMNAME:
+        //sgemm_direct_sme1_2VLx2VL:
 
         stp x19, x20, [sp, #-48]!
         stp x21, x22, [sp, #16]
@@ -61,7 +62,7 @@
         add C2, N, C4           //N + SVLs
         add C3, C5, C4          //K*SVLs + SVLs
         whilelt p2.s, M_cntr, M //Tile 0,1 predicate (M dimension)
-        sub w20, w20, #2        //SVLs-2
+        sub w21, w21, #2        //SVLs-2
 
 .M_Loop:
         incw M_cntr
@@ -198,7 +199,7 @@ process_K_less_than_equal_2:
         st1w {za1h.s[w13, #0]}, p5, [Cptr1]
         st1w {za2h.s[w13, #0]}, p6, [Cptr0, C6, lsl #2]
         st1w {za3h.s[w13, #0]}, p7, [Cptr1, C6, lsl #2]
-        cmp w13, w20
+        cmp w13, w21
         b.mi .Loop_store_ZA
         psel p4, p0, p2.s[w13, 1]
         psel p5, p1, p2.s[w13, 1]
@@ -211,12 +212,12 @@ process_K_less_than_equal_2:
         addvl Cptr, Cptr, #2
         addvl Bptr, Bptr, #1
         whilelt p0.b, Bptr, N_exit      //1st Tile predicate (N dimension)
-        b.first .N_Loop
+        b.mi .N_Loop
         add A_base, A_base, C5, lsl #3  //A_base += 2*K*SVLs FP32 elements
         add C_base, C_base, C6, lsl #3  //C_base += 2*N*SVLs FP32 elements
         incw M_cntr
         whilelt p2.s, M_cntr, M         //1st Tile predicate (M dimension)
-        b.first .M_Loop
+        b.mi .M_Loop
 
         smstop
 
diff --git a/kernel/arm64/sgemm_direct_sme1_preprocess.S b/kernel/arm64/sgemm_direct_sme1_preprocess.S
index fa13620751..6c51b0bf63 100644
--- a/kernel/arm64/sgemm_direct_sme1_preprocess.S
+++ b/kernel/arm64/sgemm_direct_sme1_preprocess.S
@@ -37,9 +37,9 @@
 #define C6        x15 //Constant6: 3*ncol
 
         .text
-        .global sgemm_direct_sme1_preprocess
+        .global ASMNAME //sgemm_direct_sme1_preprocess
 
-        sgemm_direct_sme1_preprocess:
+        ASMNAME: //sgemm_direct_sme1_preprocess:
 
         stp x19, x20, [sp, #-48]!
stp x21, x22, [sp, #16] @@ -114,14 +114,14 @@ addvl mat_ptr0, mat_ptr0, #1 //mat_ptr0 += SVLb whilelt p8.b, mat_ptr0, inner_loop_exit - b.first .Loop_process + b.mi .Loop_process add mat_mod, mat_mod, C3, lsl #2 //mat_mod+=SVLs*nbc FP32 elements add mat, mat, C3, lsl #2 //mat+=SVLs*nbc FP32 elements incw outer_loop_cntr whilelt p0.s, outer_loop_cntr, nrow - b.first .M_Loop + b.mi .M_Loop smstop diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index ccfbab8c11..36ebdd22ac 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -216,6 +216,7 @@ gotoblas_t TABLE_NAME = { #ifdef ARCH_ARM64 sgemm_directTS, sgemm_direct_alpha_betaTS, + sgemm_direct_performantTS, #endif sgemm_kernelTS, sgemm_betaTS, diff --git a/param.h b/param.h index d0ee246e83..5a807d7f6d 100644 --- a/param.h +++ b/param.h @@ -3353,7 +3353,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(CORTEXA57) || defined(CORTEXX1) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) || defined(FT2000) || defined(VORTEXM4) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3370,7 +3370,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX)|| defined(CORTEXX1) || defined(VORTEXM4) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 @@ -3598,15 +3598,15 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #undef BGEMM_ALIGN_K #undef BGEMM_DEFAULT_UNROLL_M #undef BGEMM_DEFAULT_UNROLL_N -#define BGEMM_ALIGN_K 4 -#define BGEMM_DEFAULT_UNROLL_M 8 +#define BGEMM_ALIGN_K 8 #define BGEMM_DEFAULT_UNROLL_N 4 +#define BGEMM_DEFAULT_UNROLL_M 4 #undef SBGEMM_ALIGN_K #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_N -#define SBGEMM_ALIGN_K 4 -#define SBGEMM_DEFAULT_UNROLL_M 8 +#define SBGEMM_ALIGN_K 8 +#define SBGEMM_DEFAULT_UNROLL_M 4 #define SBGEMM_DEFAULT_UNROLL_N 4 #define SGEMM_DEFAULT_UNROLL_M 16 @@ -3842,7 +3842,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #endif /* ARMv8 */ -#if defined(ARMV9SME) /* ARMv9 SME */ +#if defined(ARMV9SME) || defined(VORTEXM4) /* ARMv9 SME */ #define USE_SGEMM_KERNEL_DIRECT 1 #endif /* ARMv9 SME */
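
Note for reviewers: the interface/gemm.c hunk above only routes a call to the direct SME kernels when the call is row-major, both operands are non-transposed, the leading dimensions equal the checked sizes, and sgemm_direct_performant() accepts the problem size (alpha == 1 with beta == 0 selects the no-alpha-beta variant). A minimal smoke test shaped to hit that path is sketched below; it is not part of the patch, the size 64 is an arbitrary value chosen to stay under the thresholds in sgemm_direct_performant.c, and openblas_get_corename() is used only to show which DYNAMIC_ARCH table was selected.

/* Hypothetical smoke test (not part of the patch): a cblas_sgemm call shaped
 * to satisfy the new direct-path conditions on an SME-capable machine. */
#include <stdio.h>
#include <stdlib.h>
#include <cblas.h>

extern char *openblas_get_corename(void);   /* exported by OpenBLAS */

int main(void) {
    const int n = 64;                        /* square: m == n == k, so the lda/ldb/ldc checks hold */
    float *a = calloc((size_t)n * n, sizeof(float));
    float *b = calloc((size_t)n * n, sizeof(float));
    float *c = calloc((size_t)n * n, sizeof(float));
    if (!a || !b || !c) return 1;

    a[0] = 2.0f;                             /* only corner elements set: C[0][0] should become 6 */
    b[0] = 3.0f;

    /* RowMajor, NoTrans/NoTrans, alpha = 1, beta = 0: the shape the direct
     * SME1 kernel accepts; anything else falls back to the regular SGEMM. */
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                n, n, n, 1.0f, a, n, b, n, 0.0f, c, n);

    printf("core: %s  c[0][0] = %f\n", openblas_get_corename(), c[0]);
    free(a); free(b); free(c);
    return 0;
}

On a DYNAMIC_ARCH build the selected table can also be forced through the existing OPENBLAS_CORETYPE environment variable (for example OPENBLAS_CORETYPE=VORTEXM4 versus OPENBLAS_CORETYPE=NEOVERSEN1) to compare the direct SME path against the fallback kernels.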
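The Darwin branch of get_coretype() above gates the VORTEXM4 table on the existing support_sme1() helper. For reviewers without the OpenBLAS tree at hand, a standalone runtime probe on macOS could look like the sketch below; the "hw.optional.arm.FEAT_SME" key name is an assumption based on Apple's hw.optional.arm.FEAT_* convention, and this is an illustration rather than the helper the patch actually calls.

/* Illustrative only -- not OpenBLAS's support_sme1().  Reads a macOS
 * hw.optional feature flag; the exact key name is an assumption. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

static int darwin_feature(const char *key) {
    int val = 0;
    size_t len = sizeof(val);
    /* sysctlbyname() fails (and val stays 0) when the key does not exist. */
    if (sysctlbyname(key, &val, &len, NULL, 0) != 0) return 0;
    return val;
}

int main(void) {
    printf("SME reported by the kernel: %d\n", darwin_feature("hw.optional.arm.FEAT_SME"));
    return 0;
}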