From 860dcfc7037bdb022083c4cda39da8f72628f8bf Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 3 Oct 2017 13:43:39 -0700 Subject: [PATCH 1/3] Use 2D thread distribution for small GEMMs. Allows maximum use of available cores if one of M and N is small and the other is large. --- driver/level3/level3_thread.c | 96 +++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 33 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 8ab4ef699f..0fd39eea79 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -219,12 +219,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *buffer[DIVIDE_RATE]; BLASLONG k, lda, ldb, ldc; - BLASLONG m_from, m_to, n_from, n_to, N_from, N_to; + BLASLONG m_from, m_to, n_from, n_to; FLOAT *alpha, *beta; FLOAT *a, *b, *c; job_t *job = (job_t *)args -> common; + BLASLONG nthreads_m; BLASLONG xxx, bufferside; + BLASLONG mypos_m, mypos_n; BLASLONG ls, min_l, jjs, min_jj; BLASLONG is, min_i, div_n; @@ -259,26 +261,28 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; + nthreads_m = args -> nthreads; + if (range_m) { + nthreads_m = range_m[-1]; + } + + mypos_m = mypos % nthreads_m; + mypos_n = mypos / nthreads_m; + m_from = 0; m_to = M; if (range_m) { - m_from = range_m[0]; - m_to = range_m[1]; + m_from = range_m[mypos_m + 0]; + m_to = range_m[mypos_m + 1]; } n_from = 0; n_to = N; - N_from = 0; - N_to = N; - if (range_n) { n_from = range_n[mypos + 0]; n_to = range_n[mypos + 1]; - - N_from = range_n[0]; - N_to = range_n[args -> nthreads]; } if (beta) { @@ -287,7 +291,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if ((beta[0] != ONE) || (beta[1] != ZERO)) #endif - BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc); + BETA_OPERATION(m_from, m_to, range_n[mypos_n * nthreads_m], range_n[(mypos_n + 1) * nthreads_m], beta, c, ldc); } if ((k == 0) || (alpha == NULL)) return 0; @@ -299,8 +303,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, ) return 0; #if 0 - fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", - mypos, m_from, m_to, n_from, n_to, N_from, N_to); + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", + mypos, m_from, m_to, n_from, n_to); fprintf(stderr, "GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); @@ -394,7 +398,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } #endif - for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; WMB; } @@ -402,13 +407,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, do { current ++; - if (current >= args -> nthreads) current = 0; + if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m; div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - if (current != mypos) { + if (current != mypos) { START_RPCC(); @@ -479,7 +484,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } current ++; - if (current >= args -> nthreads) current = 0; + if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m; } while (current != mypos); @@ -525,7 +530,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG - *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + *range_n, FLOAT *sa, FLOAT *sb, + BLASLONG nthreads_m, BLASLONG nthreads_n) { blas_arg_t newarg; @@ -537,8 +543,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG blas_queue_t queue[MAX_CPU_NUMBER]; - BLASLONG range_M[MAX_CPU_NUMBER + 1]; - BLASLONG range_N[MAX_CPU_NUMBER + 1]; + BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2]; + BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2]; + BLASLONG *range_M = range_M_buffer + 1; + BLASLONG *range_N = range_N_buffer + 1; BLASLONG num_cpu_m, num_cpu_n; @@ -595,6 +603,9 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG newarg.gemm_r = args -> gemm_r; #endif + range_M[-1] = nthreads_m; + range_N[-1] = nthreads_n; + if (!range_m) { range_M[0] = 0; m = args -> m; @@ -607,7 +618,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG while (m > 0){ - width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); + width = blas_quickdivide(m + nthreads_m - num_cpu_m - 1, nthreads_m - num_cpu_m); m -= width; if (m < 0) width = width + m; @@ -617,12 +628,16 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_cpu_m ++; } - for (i = 0; i < num_cpu_m; i++) { + for (i = num_cpu_m; i < MAX_CPU_NUMBER; i++) { + range_M[i + 1] = range_M[num_cpu_m]; + } + + for (i = 0; i < nthreads; i++) { queue[i].mode = mode; queue[i].routine = inner_thread; queue[i].args = &newarg; - queue[i].range_m = &range_M[i]; - queue[i].range_n = &range_N[0]; + queue[i].range_m = range_M; + queue[i].range_n = range_N; queue[i].sa = NULL; queue[i].sb = NULL; queue[i].next = &queue[i + 1]; @@ -659,17 +674,21 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_cpu_n ++; } - for (j = 0; j < num_cpu_m; j++) { - for (i = 0; i < num_cpu_m; i++) { + for (j = num_cpu_n; j < MAX_CPU_NUMBER; j++) { + range_N[j + 1] = range_N[num_cpu_n]; + } + + for (j = 0; j < MAX_CPU_NUMBER; j++) { + for (i = 0; i < MAX_CPU_NUMBER; i++) { for (k = 0; k < DIVIDE_RATE; k++) { job[j].working[i][CACHE_LINE_SIZE * k] = 0; } } } - queue[num_cpu_m - 1].next = NULL; + queue[nthreads - 1].next = NULL; - exec_blas(num_cpu_m, queue); + exec_blas(nthreads, queue); } #ifdef USE_ALLOC_HEAP @@ -684,6 +703,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG m = args -> m; BLASLONG n = args -> n; BLASLONG nthreads = args -> nthreads; + BLASLONG nthreads_m, nthreads_n; if (nthreads == 1) { GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); @@ -704,21 +724,31 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO n = n_to - n_from; } - if ((m < 2 * SWITCH_RATIO) || (n < 2 * SWITCH_RATIO)) { + nthreads_m = nthreads; + while (m < nthreads_m * SWITCH_RATIO) { + nthreads_m = nthreads_m / 2; + } + + if (nthreads_m < 1) { GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } - if (m < nthreads * SWITCH_RATIO) { - nthreads = blas_quickdivide(m, SWITCH_RATIO); + nthreads_n = nthreads / nthreads_m; + if (n < nthreads_m * (nthreads_n - 1)) { + nthreads_n = (n + nthreads_m - 1) / nthreads_m; } - if (n < nthreads * SWITCH_RATIO) { - nthreads = blas_quickdivide(n, SWITCH_RATIO); + + nthreads = nthreads_m * nthreads_n; + + if (nthreads <= 1) { + GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; } args -> nthreads = nthreads; - gemm_driver(args, range_m, range_n, sa, sb, 0); + gemm_driver(args, range_m, range_n, sa, sb, nthreads_m, nthreads_n); return 0; } From 9de52b489a5cf5214106ade8fd86d8b854d16a9c Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Tue, 3 Oct 2017 16:32:08 -0700 Subject: [PATCH 2/3] Cleaning up and documenting multi-threaded GEMM code. --- driver/level3/level3_thread.c | 341 +++++++++++++++------------------- 1 file changed, 151 insertions(+), 190 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 0fd39eea79..22a12d465a 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -97,21 +97,21 @@ typedef struct { #ifndef BETA_OPERATION #ifndef COMPLEX -#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ - GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ - BETA[0], NULL, 0, NULL, 0, \ - (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) #else -#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ - GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ - BETA[0], BETA[1], NULL, 0, NULL, 0, \ - (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) #endif #endif #ifndef ICOPY_OPERATION #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ - defined(RN) || defined(RT) || defined(RC) || defined(RR) + defined(RN) || defined(RT) || defined(RC) || defined(RR) #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); @@ -120,7 +120,7 @@ typedef struct { #ifndef OCOPY_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ - defined(NR) || defined(TR) || defined(CR) || defined(RR) + defined(NR) || defined(TR) || defined(CR) || defined(RR) #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); #else #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); @@ -144,36 +144,36 @@ typedef struct { #ifndef KERNEL_OPERATION #ifndef COMPLEX -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #else -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) #endif #endif #ifndef FUSED_KERNEL_OPERATION #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ - defined(NR) || defined(TR) || defined(CR) || defined(RR) + defined(NR) || defined(TR) || defined(CR) || defined(RR) #ifndef COMPLEX #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ - FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \ - (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #else #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ - FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ - (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #endif #else #ifndef COMPLEX #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ - FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \ - (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #else #define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ - FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ - (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) #endif #endif #endif @@ -224,12 +224,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *alpha, *beta; FLOAT *a, *b, *c; job_t *job = (job_t *)args -> common; + BLASLONG nthreads_m; - BLASLONG xxx, bufferside; BLASLONG mypos_m, mypos_n; - BLASLONG ls, min_l, jjs, min_jj; - BLASLONG is, min_i, div_n; + BLASLONG is, js, ls, bufferside, jjs; + BLASLONG min_i, min_l, div_n, min_jj; BLASLONG i, current; BLASLONG l1stride; @@ -261,30 +261,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, alpha = (FLOAT *)args -> alpha; beta = (FLOAT *)args -> beta; + /* Initialize 2D CPU distribution */ nthreads_m = args -> nthreads; if (range_m) { nthreads_m = range_m[-1]; } + mypos_n = blas_quickdivide(mypos, nthreads_m); /* mypos_n = mypos / nthreads_m */ + mypos_m = mypos - mypos_n * nthreads_m; /* mypos_m = mypos % nthreads_m */ - mypos_m = mypos % nthreads_m; - mypos_n = mypos / nthreads_m; - + /* Initialize m and n */ m_from = 0; m_to = M; - if (range_m) { m_from = range_m[mypos_m + 0]; m_to = range_m[mypos_m + 1]; } - n_from = 0; n_to = N; - if (range_n) { n_from = range_n[mypos + 0]; n_to = range_n[mypos + 1]; } + /* Multiply C by beta if needed */ if (beta) { #ifndef COMPLEX if (beta[0] != ONE) @@ -294,43 +293,37 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BETA_OPERATION(m_from, m_to, range_n[mypos_n * nthreads_m], range_n[(mypos_n + 1) * nthreads_m], beta, c, ldc); } + /* Return early if no more computation is needed */ if ((k == 0) || (alpha == NULL)) return 0; - if ((alpha[0] == ZERO) #ifdef COMPLEX && (alpha[1] == ZERO) #endif ) return 0; -#if 0 - fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", - mypos, m_from, m_to, n_from, n_to); - - fprintf(stderr, "GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); - -#endif - + /* Initialize workspace for local region of B */ div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; - buffer[0] = sb; for (i = 1; i < DIVIDE_RATE; i++) { buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE; } - + /* Iterate through steps of k */ for(ls = 0; ls < k; ls += min_l){ + /* Determine step size in k */ min_l = k - ls; - if (min_l >= GEMM_Q * 2) { min_l = GEMM_Q; } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } + /* Determine step size in m + * Note: We are currently on the first step in m + */ l1stride = 1; min_i = m_to - m_from; - if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else { @@ -341,109 +334,106 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } } + /* Copy local region of A into workspace */ START_RPCC(); - ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); - STOP_RPCC(copy_A); + /* Copy local region of B into workspace and apply kernel */ div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { - for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { - + /* Make sure if no one is using workspace */ START_RPCC(); - - /* Make sure if no one is using buffer */ for (i = 0; i < args -> nthreads; i++) while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; - STOP_RPCC(waiting1); #if defined(FUSED_GEMM) && !defined(TIMING) - FUSED_KERNEL_OPERATION(min_i, MIN(n_to, xxx + div_n) - xxx, min_l, alpha, - sa, buffer[bufferside], b, ldb, c, ldc, m_from, xxx, ls); + /* Fused operation to copy region of B into workspace and apply kernel */ + FUSED_KERNEL_OPERATION(min_i, MIN(n_to, js + div_n) - js, min_l, alpha, + sa, buffer[bufferside], b, ldb, c, ldc, m_from, js, ls); #else - for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ - min_jj = MIN(n_to, xxx + div_n) - jjs; - + /* Split local region of B into parts */ + for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ + min_jj = MIN(n_to, js + div_n) - jjs; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; - else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; - + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + /* Copy part of local region of B into workspace */ START_RPCC(); - OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride); - + buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(copy_B); + /* Apply kernel with local region of A and part of local region of B */ START_RPCC(); - KERNEL_OPERATION(min_i, min_jj, min_l, alpha, - sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride, + sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); - STOP_RPCC(kernel); #ifdef TIMING - ops += 2 * min_i * min_jj * min_l; + ops += 2 * min_i * min_jj * min_l; #endif } #endif + /* Set flag so other threads can access local region of B */ for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; WMB; } + /* Get regions of B from other threads and apply kernel */ current = mypos; - do { + + /* This thread accesses regions of B from threads in the range + * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */ current ++; if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m; + /* Split other region of B into parts */ div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) { + if (current != mypos) { - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - if (current != mypos) { - + /* Wait until other region of B is initialized */ START_RPCC(); - - /* thread has to wait */ while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - STOP_RPCC(waiting2); + /* Apply kernel with local region of A and part of other region of B */ START_RPCC(); - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, ldc, m_from, xxx); + c, ldc, m_from, js); + STOP_RPCC(kernel); - STOP_RPCC(kernel); #ifdef TIMING - ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; + ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; #endif } + /* Clear synchronization flag if this thread is done with other region of B */ if (m_to - m_from == min_i) { job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; } } } while (current != mypos); - + /* Iterate through steps of m + * Note: First step has already been finished */ for(is = m_from + min_i; is < m_to; is += min_i){ min_i = m_to - is; - if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else @@ -451,38 +441,39 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; } + /* Copy local region of A into workspace */ START_RPCC(); - ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); - STOP_RPCC(copy_A); + /* Get regions of B and apply kernel */ current = mypos; do { + /* Split region of B into parts and apply kernel */ div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + for (js = range_n[current], bufferside = 0; js < range_n[current + 1]; js += div_n, bufferside ++) { - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - + /* Apply kernel with local region of A and part of region of B */ START_RPCC(); - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha, sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, ldc, is, xxx); - - STOP_RPCC(kernel); - + c, ldc, is, js); + STOP_RPCC(kernel); + #ifdef TIMING - ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; -#endif - - if (is + min_i >= m_to) { - /* Thread doesn't need this buffer any more */ - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } + ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; +#endif + + /* Clear synchronization flag if this thread is done with region of B */ + if (is + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } } + /* This thread accesses regions of B from threads in the range + * [ mypos_n * nthreads_m, (mypos_n+1) * nthreads_m ) */ current ++; if (current >= (mypos_n + 1) * nthreads_m) current = mypos_n * nthreads_m; @@ -492,14 +483,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } + /* Wait until all other threads are done with local region of B */ START_RPCC(); - for (i = 0; i < args -> nthreads; i++) { - for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + for (js = 0; js < DIVIDE_RATE; js++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; } } - STOP_RPCC(waiting3); #ifdef TIMING @@ -512,17 +502,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, (double)waiting2 /(double)total * 100., (double)waiting3 /(double)total * 100., (double)ops/(double)kernel / 4. * 100.); - -#if 0 - fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", - mypos, copy_A, copy_B, waiting); - - fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", - mypos, - (double)waiting1/(double)waiting * 100., - (double)waiting2/(double)waiting * 100., - (double)waiting3/(double)waiting * 100.); -#endif fprintf(stderr, "\n"); #endif @@ -545,17 +524,16 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2]; BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2]; - BLASLONG *range_M = range_M_buffer + 1; - BLASLONG *range_N = range_N_buffer + 1; - + BLASLONG *range_M, *range_N; BLASLONG num_cpu_m, num_cpu_n; BLASLONG nthreads = args -> nthreads; BLASLONG width, i, j, k, js; BLASLONG m, n, n_from, n_to; - int mode; + int mode; + /* Get execution mode */ #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE; @@ -574,6 +552,16 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG #endif #endif +#ifdef USE_ALLOC_HEAP + /* Dynamically allocate workspace */ + job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); + if(job==NULL){ + fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); + exit(1); + } +#endif + + /* Initialize struct for arguments */ newarg.m = args -> m; newarg.n = args -> n; newarg.k = args -> k; @@ -586,26 +574,19 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG newarg.alpha = args -> alpha; newarg.beta = args -> beta; newarg.nthreads = args -> nthreads; - -#ifdef USE_ALLOC_HEAP - job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); - if(job==NULL){ - fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); - exit(1); - } -#endif - newarg.common = (void *)job; - #ifdef PARAMTEST - newarg.gemm_p = args -> gemm_p; - newarg.gemm_q = args -> gemm_q; - newarg.gemm_r = args -> gemm_r; + newarg.gemm_p = args -> gemm_p; + newarg.gemm_q = args -> gemm_q; + newarg.gemm_r = args -> gemm_r; #endif + /* Initialize partitions in m and n + * Note: The number of CPU partitions is stored in the -1 entry */ + range_M = &range_M_buffer[1]; + range_N = &range_N_buffer[1]; range_M[-1] = nthreads_m; range_N[-1] = nthreads_n; - if (!range_m) { range_M[0] = 0; m = args -> m; @@ -614,24 +595,20 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG m = range_m[1] - range_m[0]; } + /* Partition m into nthreads_m regions */ num_cpu_m = 0; - while (m > 0){ - - width = blas_quickdivide(m + nthreads_m - num_cpu_m - 1, nthreads_m - num_cpu_m); - + width = blas_quickdivide(m + nthreads_m - num_cpu_m - 1, nthreads_m - num_cpu_m); m -= width; if (m < 0) width = width + m; - range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; - num_cpu_m ++; } - for (i = num_cpu_m; i < MAX_CPU_NUMBER; i++) { range_M[i + 1] = range_M[num_cpu_m]; } + /* Initialize parameters for parallel execution */ for (i = 0; i < nthreads; i++) { queue[i].mode = mode; queue[i].routine = inner_thread; @@ -642,10 +619,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG queue[i].sb = NULL; queue[i].next = &queue[i + 1]; } - queue[0].sa = sa; queue[0].sb = sb; + queue[nthreads - 1].next = NULL; + /* Iterate through steps of n */ if (!range_n) { n_from = 0; n_to = args -> n; @@ -653,41 +631,34 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG n_from = range_n[0]; n_to = range_n[1]; } - for(js = n_from; js < n_to; js += GEMM_R * nthreads){ n = n_to - js; if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; + /* Partition (a step of) n into nthreads regions */ range_N[0] = js; - num_cpu_n = 0; - while (n > 0){ - - width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); - + width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); n -= width; if (n < 0) width = width + n; - range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; - num_cpu_n ++; } - for (j = num_cpu_n; j < MAX_CPU_NUMBER; j++) { range_N[j + 1] = range_N[num_cpu_n]; } - for (j = 0; j < MAX_CPU_NUMBER; j++) { - for (i = 0; i < MAX_CPU_NUMBER; i++) { + /* Clear synchronization flags */ + for (i = 0; i < MAX_CPU_NUMBER; i++) { + for (j = 0; j < MAX_CPU_NUMBER; j++) { for (k = 0; k < DIVIDE_RATE; k++) { - job[j].working[i][CACHE_LINE_SIZE * k] = 0; + job[i].working[j][CACHE_LINE_SIZE * k] = 0; } } } - queue[nthreads - 1].next = NULL; - + /* Execute parallel computation */ exec_blas(nthreads, queue); } @@ -702,53 +673,43 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG m = args -> m; BLASLONG n = args -> n; - BLASLONG nthreads = args -> nthreads; BLASLONG nthreads_m, nthreads_n; - if (nthreads == 1) { - GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); - return 0; - } - + /* Get dimensions from index ranges if available */ if (range_m) { - BLASLONG m_from = *(((BLASLONG *)range_m) + 0); - BLASLONG m_to = *(((BLASLONG *)range_m) + 1); - - m = m_to - m_from; + m = range_m[1] - range_m[0]; } - if (range_n) { - BLASLONG n_from = *(((BLASLONG *)range_n) + 0); - BLASLONG n_to = *(((BLASLONG *)range_n) + 1); - - n = n_to - n_from; + n = range_n[1] - range_n[0]; } - nthreads_m = nthreads; - while (m < nthreads_m * SWITCH_RATIO) { - nthreads_m = nthreads_m / 2; - } - - if (nthreads_m < 1) { - GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); - return 0; + /* CPU partitions in m should have at least SWITCH_RATIO rows */ + if (m < 2 * SWITCH_RATIO) { + nthreads_m = 1; + } else { + nthreads_m = args -> nthreads; + while (m < nthreads_m * SWITCH_RATIO) { + nthreads_m = nthreads_m / 2; + } } - nthreads_n = nthreads / nthreads_m; - if (n < nthreads_m * (nthreads_n - 1)) { - nthreads_n = (n + nthreads_m - 1) / nthreads_m; + /* At most one CPU partition in n should have less than nthreads_m columns */ + if (n < nthreads_m) { + nthreads_n = 1; + } else { + nthreads_n = blas_quickdivide(n + nthreads_m - 1, nthreads_m); + if (nthreads_m * nthreads_n > args -> nthreads) { + nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); + } } - nthreads = nthreads_m * nthreads_n; - - if (nthreads <= 1) { + /* Execute serial or parallel computation */ + if (nthreads_m * nthreads_n <= 1) { GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); - return 0; + } else { + args -> nthreads = nthreads_m * nthreads_n; + gemm_driver(args, range_m, range_n, sa, sb, nthreads_m, nthreads_n); } - args -> nthreads = nthreads; - - gemm_driver(args, range_m, range_n, sa, sb, nthreads_m, nthreads_n); - return 0; } From 30486a356c2de3e0f284a8efebba891050299765 Mon Sep 17 00:00:00 2001 From: Tim Moon Date: Wed, 4 Oct 2017 12:37:49 -0700 Subject: [PATCH 3/3] Reduce number of data partitions in n. --- driver/level3/level3_thread.c | 37 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 22a12d465a..77ceac6e8a 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -525,7 +525,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG BLASLONG range_M_buffer[MAX_CPU_NUMBER + 2]; BLASLONG range_N_buffer[MAX_CPU_NUMBER + 2]; BLASLONG *range_M, *range_N; - BLASLONG num_cpu_m, num_cpu_n; + BLASLONG num_parts; BLASLONG nthreads = args -> nthreads; @@ -596,16 +596,16 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG } /* Partition m into nthreads_m regions */ - num_cpu_m = 0; + num_parts = 0; while (m > 0){ - width = blas_quickdivide(m + nthreads_m - num_cpu_m - 1, nthreads_m - num_cpu_m); + width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); m -= width; if (m < 0) width = width + m; - range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; - num_cpu_m ++; + range_M[num_parts + 1] = range_M[num_parts] + width; + num_parts ++; } - for (i = num_cpu_m; i < MAX_CPU_NUMBER; i++) { - range_M[i + 1] = range_M[num_cpu_m]; + for (i = num_parts; i < MAX_CPU_NUMBER; i++) { + range_M[i + 1] = range_M[num_parts]; } /* Initialize parameters for parallel execution */ @@ -637,16 +637,19 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG /* Partition (a step of) n into nthreads regions */ range_N[0] = js; - num_cpu_n = 0; + num_parts = 0; while (n > 0){ - width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); + width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); + if (width < SWITCH_RATIO) { + width = SWITCH_RATIO; + } n -= width; if (n < 0) width = width + n; - range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; - num_cpu_n ++; + range_N[num_parts + 1] = range_N[num_parts] + width; + num_parts ++; } - for (j = num_cpu_n; j < MAX_CPU_NUMBER; j++) { - range_N[j + 1] = range_N[num_cpu_n]; + for (j = num_parts; j < MAX_CPU_NUMBER; j++) { + range_N[j + 1] = range_N[num_parts]; } /* Clear synchronization flags */ @@ -683,7 +686,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO n = range_n[1] - range_n[0]; } - /* CPU partitions in m should have at least SWITCH_RATIO rows */ + /* Partitions in m should have at least SWITCH_RATIO rows */ if (m < 2 * SWITCH_RATIO) { nthreads_m = 1; } else { @@ -693,11 +696,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } } - /* At most one CPU partition in n should have less than nthreads_m columns */ - if (n < nthreads_m) { + /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */ + if (n < SWITCH_RATIO * nthreads_m) { nthreads_n = 1; } else { - nthreads_n = blas_quickdivide(n + nthreads_m - 1, nthreads_m); + nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m); if (nthreads_m * nthreads_n > args -> nthreads) { nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m); }