Skip to content

Increase BUFFER_SIZE on arm64, make it configurable and add a guard #2539

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#endif
/* Huge page size constant: 4 << 20. */
#define HUGE_PAGESIZE ( 4 << 20)

/*
 * NOTE(review): this text is a rendered diff hunk without +/- markers (the
 * hunk summary reports "1 addition & 1 deletion"), so BUFFER_SIZE appears
 * twice. Presumably (16 << 20) is the removed value and (32 << 20) the added
 * one -- confirm against the pull request before treating this as a header.
 */
#define BUFFER_SIZE (16 << 20)
#define BUFFER_SIZE (32 << 20)


/* BASE_ADDRESS lies BUFFER_SIZE * MAX_CPU_NUMBER below START_ADDRESS. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
Expand Down
7 changes: 6 additions & 1 deletion common_arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,17 @@ static inline int blas_quickdivide(blasint x, blasint y){
#endif
/* Huge page size constant: 4 << 20. */
#define HUGE_PAGESIZE (4 << 20)

/*
 * BUFFER_SIZE selection.
 *
 * When BUFFERSIZE is supplied by the build it is used as the shift count
 * (32 << BUFFERSIZE). Otherwise a default is chosen by target core:
 *   CORTEXA57          -> 20 << 20
 *   TSV110 / EMAG8180  -> 32 << 20
 *   anything else      -> 16 << 20
 */
#ifdef BUFFERSIZE
#  define BUFFER_SIZE (32 << BUFFERSIZE)
#else
#  ifdef CORTEXA57
#    define BUFFER_SIZE (20 << 20)
#  elif defined(TSV110) || defined(EMAG8180)
#    define BUFFER_SIZE (32 << 20)
#  else
#    define BUFFER_SIZE (16 << 20)
#  endif
#endif

/* BASE_ADDRESS lies BUFFER_SIZE * MAX_CPU_NUMBER below START_ADDRESS. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

Expand Down
12 changes: 6 additions & 6 deletions common_param.h
Original file line number Diff line number Diff line change
Expand Up @@ -1205,27 +1205,27 @@ extern gotoblas_t *gotoblas;
#endif

/*
 * Default R parameters, derived from BUFFER_SIZE unless already defined.
 *
 * Every formula has the same shape:
 *   1. take P * Q * k + GEMM_DEFAULT_OFFSET_A, add GEMM_DEFAULT_ALIGN and
 *      clear the bits that are set in GEMM_DEFAULT_ALIGN;
 *   2. subtract that from BUFFER_SIZE and divide by Q * k;
 *   3. subtract 15 and clear the low four bits of the result.
 * The factor k is 4, 8, 16, 8, 16, 32 for S, D, Q, C, Z, X respectively
 * (presumably the element size in bytes -- confirm).
 *
 * NOTE(review): this text is a rendered diff hunk without +/- markers (the
 * hunk summary reports "6 additions & 6 deletions"), so each guard contains
 * two definitions that differ only in the final mask literal (~15 vs ~15UL).
 * Presumably the ~15 form is the removed line and the ~15UL form the added
 * one -- confirm against the pull request.
 */
#ifndef SGEMM_DEFAULT_R
#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15)
#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL)
#endif

/* Same formula with factor 8. */
#ifndef DGEMM_DEFAULT_R
#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15)
#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL)
#endif

/* Same formula with factor 16. */
#ifndef QGEMM_DEFAULT_R
#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15)
#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL)
#endif

/* Same formula with factor 8. */
#ifndef CGEMM_DEFAULT_R
#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15)
#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL)
#endif

/* Same formula with factor 16. */
#ifndef ZGEMM_DEFAULT_R
#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15)
#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL)
#endif

/* Same formula with factor 32. */
#ifndef XGEMM_DEFAULT_R
#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15)
#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL)
#endif

#ifndef SNUMOPT
Expand Down
2 changes: 1 addition & 1 deletion common_power.h
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8) || defined(POWER9)
#define BUFFER_SIZE ( 64 << 20)
#define BUFFER_SIZE ( 64 << 23)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
Expand Down
6 changes: 6 additions & 0 deletions common_x86_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
/* Huge page size constant: 2 << 20. */
#define HUGE_PAGESIZE (2 << 20)

/*
 * BUFFER_SIZE selection.
 *
 * When BUFFERSIZE is supplied by the build it is used as the shift count
 * (32 << BUFFERSIZE). Otherwise a default is chosen by target core:
 *   SKYLAKEX       -> 32 << 21
 *   HASWELL / ZEN  -> 32 << 22
 *   anything else  -> 32 << 20
 */
#ifdef BUFFERSIZE
#  define BUFFER_SIZE (32 << BUFFERSIZE)
#else
#  ifdef SKYLAKEX
#    define BUFFER_SIZE (32 << 21)
#  elif defined(HASWELL) || defined(ZEN)
#    define BUFFER_SIZE (32 << 22)
#  else
#    define BUFFER_SIZE (32 << 20)
#  endif
#endif
Expand Down
6 changes: 1 addition & 5 deletions common_zarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#endif
/* Huge page size constant: 4 << 20. */
#define HUGE_PAGESIZE ( 4 << 20)

/*
 * NOTE(review): this text is a rendered diff hunk without +/- markers (the
 * hunk summary reports "1 addition & 5 deletions"). Presumably the
 * CORTEXA57 conditional below is the removed code and the unconditional
 * (32 << 22) definition is the added line -- confirm against the pull
 * request. Read literally, BUFFER_SIZE would be defined twice.
 */
#if defined(CORTEXA57)
#define BUFFER_SIZE (20 << 20)
#else
#define BUFFER_SIZE (16 << 20)
#endif
#define BUFFER_SIZE (32 << 22)


/* BASE_ADDRESS lies BUFFER_SIZE * MAX_CPU_NUMBER below START_ADDRESS. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
Expand Down
36 changes: 36 additions & 0 deletions driver/others/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif

/*
 * Memory buffer must fit two matrix subblocks of maximal size.
 *
 * Compile-time guard: for each precision the build is rejected when
 * BUFFER_SIZE is smaller than twice any of the P*Q, P*R or R*Q products
 * scaled by the per-precision factor used in the condition (4 for SGEMM,
 * 8 for DGEMM and CGEMM, 16 for ZGEMM).
 *
 * The "#pragma message" lines print the stringified BUFFER_SIZE and the
 * three required expressions. They use the same factor as the condition
 * they explain, and are placed before "#error" so they are still printed
 * by compilers that stop at the first "#error".
 */
#define XSTR(x) STR(x) /* expand macro arguments first, then stringify */
#define STR(x) #x
#if BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 * 2) || \
    BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_R * 4 * 2) || \
    BUFFER_SIZE < (SGEMM_DEFAULT_R * SGEMM_DEFAULT_Q * 4 * 2)
#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(SGEMM_DEFAULT_P*SGEMM_DEFAULT_Q*4*2)
#pragma message " and " XSTR(SGEMM_DEFAULT_P*SGEMM_DEFAULT_R*4*2)
#pragma message " and " XSTR(SGEMM_DEFAULT_R*SGEMM_DEFAULT_Q*4*2)
#error BUFFER_SIZE is too small for P, Q, and R of SGEMM
#endif
#if BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 * 2) || \
    BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_R * 8 * 2) || \
    BUFFER_SIZE < (DGEMM_DEFAULT_R * DGEMM_DEFAULT_Q * 8 * 2)
#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(DGEMM_DEFAULT_P*DGEMM_DEFAULT_Q*8*2)
#pragma message " and " XSTR(DGEMM_DEFAULT_P*DGEMM_DEFAULT_R*8*2)
#pragma message " and " XSTR(DGEMM_DEFAULT_R*DGEMM_DEFAULT_Q*8*2)
#error BUFFER_SIZE is too small for P, Q, and R of DGEMM
#endif
#if BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 * 2) || \
    BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_R * 8 * 2) || \
    BUFFER_SIZE < (CGEMM_DEFAULT_R * CGEMM_DEFAULT_Q * 8 * 2)
#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(CGEMM_DEFAULT_P*CGEMM_DEFAULT_Q*8*2)
#pragma message " and " XSTR(CGEMM_DEFAULT_P*CGEMM_DEFAULT_R*8*2)
#pragma message " and " XSTR(CGEMM_DEFAULT_R*CGEMM_DEFAULT_Q*8*2)
#error BUFFER_SIZE is too small for P, Q, and R of CGEMM
#endif
#if BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 * 2) || \
    BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_R * 16 * 2) || \
    BUFFER_SIZE < (ZGEMM_DEFAULT_R * ZGEMM_DEFAULT_Q * 16 * 2)
#pragma message "have " XSTR(BUFFER_SIZE) " need maximum of " XSTR(ZGEMM_DEFAULT_P*ZGEMM_DEFAULT_Q*16*2)
#pragma message " and " XSTR(ZGEMM_DEFAULT_P*ZGEMM_DEFAULT_R*16*2)
#pragma message " and " XSTR(ZGEMM_DEFAULT_R*ZGEMM_DEFAULT_Q*16*2)
#error BUFFER_SIZE is too small for P, Q, and R of ZGEMM
#endif

#if defined(COMPILE_TLS)

#include <errno.h>
Expand Down
14 changes: 7 additions & 7 deletions kernel/setparam-ref.c
Original file line number Diff line number Diff line change
Expand Up @@ -1291,39 +1291,39 @@ static void init_parameter(void) {
TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15);
) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15UL);

TABLE_NAME.dgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15);
) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15UL);

#ifdef EXPRECISION
TABLE_NAME.qgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15);
) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15UL);
#endif

TABLE_NAME.cgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15);
) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15UL);

TABLE_NAME.zgemm_r = (((BUFFER_SIZE -
((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15);
) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15UL);

TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE -
((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15);
) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15UL);

TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE -
((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA
+ TABLE_NAME.align) & ~TABLE_NAME.align)
) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15);
) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15UL);



Expand Down
22 changes: 14 additions & 8 deletions param.h
Original file line number Diff line number Diff line change
Expand Up @@ -2229,16 +2229,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2

#define SGEMM_DEFAULT_P 1280
#define DGEMM_DEFAULT_P 640
#define CGEMM_DEFAULT_P 640
#define ZGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_P 1280UL
#define DGEMM_DEFAULT_P 640UL
#define CGEMM_DEFAULT_P 640UL
#define ZGEMM_DEFAULT_P 320UL

#define SGEMM_DEFAULT_Q 640
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 640
#define ZGEMM_DEFAULT_Q 640
#define SGEMM_DEFAULT_Q 640UL
#define DGEMM_DEFAULT_Q 720UL
#define CGEMM_DEFAULT_Q 640UL
#define ZGEMM_DEFAULT_Q 640UL

#if 0
#define SGEMM_DEFAULT_R SGEMM_DEFAULT_P
#define DGEMM_DEFAULT_R DGEMM_DEFAULT_P
#define CGEMM_DEFAULT_R CGEMM_DEFAULT_P
#define ZGEMM_DEFAULT_R ZGEMM_DEFAULT_P
#endif
#define SYMV_P 8

#endif
Expand Down