diff --git a/common_arm.h b/common_arm.h index 27fa76b765..8411e6dd6d 100644 --- a/common_arm.h +++ b/common_arm.h @@ -121,7 +121,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #endif #define HUGE_PAGESIZE ( 4 << 20) -#define BUFFER_SIZE (16 << 20) +#define BUFFER_SIZE (32 << 20) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/common_arm64.h b/common_arm64.h index a928dbe7bb..99e0cee57e 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -141,12 +141,17 @@ static inline int blas_quickdivide(blasint x, blasint y){ #endif #define HUGE_PAGESIZE ( 4 << 20) +#ifndef BUFFERSIZE #if defined(CORTEXA57) #define BUFFER_SIZE (20 << 20) +#elif defined(TSV110) || defined(EMAG8180) +#define BUFFER_SIZE (32 << 20) #else #define BUFFER_SIZE (16 << 20) #endif - +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/common_param.h b/common_param.h index 574d5e176d..eab14b0a6a 100644 --- a/common_param.h +++ b/common_param.h @@ -1205,27 +1205,27 @@ extern gotoblas_t *gotoblas; #endif #ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) #endif #ifndef SNUMOPT diff --git a/common_power.h b/common_power.h index e7caf9adff..14cd1629cc 100644 --- a/common_power.h +++ b/common_power.h @@ -842,7 +842,7 @@ Lmcount$lazy_ptr: #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) #elif defined(POWER8) || defined(POWER9) -#define BUFFER_SIZE ( 64 << 20) +#define BUFFER_SIZE ( 64 << 23) #else #define BUFFER_SIZE ( 16 << 20) #endif diff --git a/common_x86_64.h b/common_x86_64.h index fe5539abe9..958e9caedf 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -226,7 +226,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define HUGE_PAGESIZE ( 2 << 20) #ifndef BUFFERSIZE +#if defined(SKYLAKEX) +#define BUFFER_SIZE (32 << 21) +#elif defined(HASWELL) || defined(ZEN) +#define BUFFER_SIZE (32 << 22) +#else #define BUFFER_SIZE (32 << 20) +#endif #else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif diff --git a/common_zarch.h b/common_zarch.h index e105574e07..b5503a7a45 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -123,11 +123,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #endif #define HUGE_PAGESIZE ( 4 << 20) -#if defined(CORTEXA57) -#define BUFFER_SIZE (20 << 20) -#else -#define BUFFER_SIZE (16 << 20) -#endif +#define BUFFER_SIZE (32 << 22) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/driver/others/memory.c b/driver/others/memory.c index 62a5a02140..db12d3c55d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -87,6 +87,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif +/* Memory buffer must fit two matrix subblocks of maximal size */ +#define XSTR(x) STR(x) +#define STR(x) #x +#if BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 * 2) || \ + BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_R * 4 * 2) || \ + BUFFER_SIZE < (SGEMM_DEFAULT_R * SGEMM_DEFAULT_Q * 4 * 2) +#error BUFFER_SIZE is too small for P, Q, and R of SGEMM: +#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(SGEMM_DEFAULT_P*SGEMM_DEFAULT_Q*4*2) +#pragma message " and " XSTR(SGEMM_DEFAULT_P*SGEMM_DEFAULT_R*4*2) +#pragma message " and " XSTR(SGEMM_DEFAULT_R*SGEMM_DEFAULT_Q*4*2) +#endif +#if BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 * 2) || \ + BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_R * 8 * 2) || \ + BUFFER_SIZE < (DGEMM_DEFAULT_R * DGEMM_DEFAULT_Q * 8 * 2) +#error BUFFER_SIZE is too small for P, Q, and R of DGEMM +#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*4*2) +#pragma message " and " XSTR(DGEMM_DEFAULT_P*DGEMM_DEFAULT_R*4*2) +#pragma message " and " XSTR(DGEMM_DEFAULT_R*DGEMM_DEFAULT_Q*4*2) +#endif +#if BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 * 2) || \ + BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_R * 8 * 2) || \ + BUFFER_SIZE < (CGEMM_DEFAULT_R * CGEMM_DEFAULT_Q * 8 * 2) +#error BUFFER_SIZE is too small for P, Q, and R of CGEMM +#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(CGEMM_DEFAULT_P*CGEMM_DEFAULT_Q*4*2) +#pragma message " and " XSTR(CGEMM_DEFAULT_P*CGEMM_DEFAULT_R*4*2) +#pragma message " and " XSTR(CGEMM_DEFAULT_R*CGEMM_DEFAULT_Q*4*2) +#endif +#if BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 * 2) || \ + BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_R * 16 * 2) || \ + BUFFER_SIZE < (ZGEMM_DEFAULT_R * ZGEMM_DEFAULT_Q * 16 * 2) +#error BUFFER_SIZE is too small for P, Q, and R of ZGEMM +#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(ZGEMM_DEFAULT_P*ZGEMM_DEFAULT_Q*4*2) +#pragma message " and " XSTR(ZGEMM_DEFAULT_P*ZGEMM_DEFAULT_R*4*2) +#pragma message " and " XSTR(ZGEMM_DEFAULT_R*ZGEMM_DEFAULT_Q*4*2) +#endif + #if defined(COMPILE_TLS) #include diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 3c71c778e1..0abc745a2c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1291,39 +1291,39 @@ static void init_parameter(void) { TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) - ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); + ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15UL); TABLE_NAME.dgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) - ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15); + ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15UL); #ifdef EXPRECISION TABLE_NAME.qgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) - ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); + ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15UL); #endif TABLE_NAME.cgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) - ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); + ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15UL); TABLE_NAME.zgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) - ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); + ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15UL); TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) - ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15); + ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15UL); TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) - ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15); + ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15UL); diff --git a/param.h b/param.h index 4103085244..388ff2d3ad 100644 --- a/param.h +++ b/param.h @@ -2229,16 +2229,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 1280 -#define DGEMM_DEFAULT_P 640 -#define CGEMM_DEFAULT_P 640 -#define ZGEMM_DEFAULT_P 320 +#define SGEMM_DEFAULT_P 1280UL +#define DGEMM_DEFAULT_P 640UL +#define CGEMM_DEFAULT_P 640UL +#define ZGEMM_DEFAULT_P 320UL -#define SGEMM_DEFAULT_Q 640 -#define DGEMM_DEFAULT_Q 720 -#define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 640 +#define SGEMM_DEFAULT_Q 640UL +#define DGEMM_DEFAULT_Q 720UL +#define CGEMM_DEFAULT_Q 640UL +#define ZGEMM_DEFAULT_Q 640UL +#if 0 +#define SGEMM_DEFAULT_R SGEMM_DEFAULT_P +#define DGEMM_DEFAULT_R DGEMM_DEFAULT_P +#define CGEMM_DEFAULT_R CGEMM_DEFAULT_P +#define ZGEMM_DEFAULT_R ZGEMM_DEFAULT_P +#endif #define SYMV_P 8 #endif