From 1b83341d194b9d8f75ec724b0c5ae64144ca3108 Mon Sep 17 00:00:00 2001 From: Zhiyong Dang Date: Tue, 24 Apr 2018 10:34:53 +0800 Subject: [PATCH 1/2] Fix race condition in blas_server_omp.c Change-Id: Ic896276cd073d6b41930c7c5a29d66348cd1725d --- Makefile.rule | 7 +++ Makefile.system | 6 +++ cmake/system.cmake | 6 +++ common.h | 2 +- driver/others/blas_server_omp.c | 91 ++++++++++++++++++++++++--------- 5 files changed, 86 insertions(+), 26 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 62bf63df47..0ce4c40a81 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -60,6 +60,13 @@ VERSION = 0.3.0.dev # automatically detected by the the script. # NUM_THREADS = 24 +# If you have enabled USE_OPENMP and your application would call +# OpenBLAS's caculation API in multi threads, please comment it in. +# This flag define how many OpenBLAS's caculation API can actually +# run in parallel. If more number threads call OpenBLAS's caculation API, +# it would wait former API finish. +# NUM_PARALLEL = 2 + # if you don't need to install the static library, please comment it in. # NO_STATIC = 1 diff --git a/Makefile.system b/Makefile.system index 142cb420f8..463b857b84 100644 --- a/Makefile.system +++ b/Makefile.system @@ -184,6 +184,10 @@ endif endif +ifndef NUM_PARALLEL +NUM_PARALLEL = 1 +endif + ifndef NUM_THREADS NUM_THREADS = $(NUM_CORES) endif @@ -961,6 +965,8 @@ endif CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) +CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL) + ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif diff --git a/cmake/system.cmake b/cmake/system.cmake index 3fdd9390cd..6458956710 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -96,6 +96,10 @@ if (NOT CMAKE_CROSSCOMPILING) endif() +if (NOT DEFINED NUM_PARALLEL) + set(NUM_PARALLEL 1) +endif() + if (NOT DEFINED NUM_THREADS) if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) # HT? @@ -224,6 +228,8 @@ endif () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") +set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}") + if (USE_SIMPLE_THREADED_LEVEL3) set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") endif () diff --git a/common.h b/common.h index 5a599a5afa..86c33b2fd2 100644 --- a/common.h +++ b/common.h @@ -179,7 +179,7 @@ extern "C" { #define ALLOCA_ALIGN 63UL -#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) #ifdef NEEDBUNDERSCORE #define BLASFUNC(FUNC) FUNC##_ diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 8d62a81256..868db3b1d3 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -36,6 +36,13 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +#if _STDC_VERSION__ >= 201112L +#ifndef _Atomic +#define _Atomic volatile +#endif +#include +#endif +#include #include #include //#include @@ -49,11 +56,16 @@ int blas_server_avail = 0; -static void * blas_thread_buffer[MAX_CPU_NUMBER]; +static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; +#if _STDC_VERSION__ >= 201112L +static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; +#else +static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; +#endif void goto_set_num_threads(int num_threads) { - int i=0; + int i=0, j=0; if (num_threads < 1) num_threads = blas_num_threads; @@ -68,15 +80,17 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); //adjust buffer for each thread - for(i=0; i mode & BLAS_PTHREAD) == 0)) { pos = omp_get_thread_num(); - buffer = blas_thread_buffer[pos]; + buffer = blas_thread_buffer[buf_index][pos]; //fallback if(buffer==NULL) { @@ -291,7 +309,7 @@ static void exec_threads(blas_queue_t *queue){ int exec_blas(BLASLONG num, blas_queue_t *queue){ - BLASLONG i; + BLASLONG i, buf_index; if ((num <= 0) || (queue == NULL)) return 0; @@ -302,6 +320,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ } #endif + while(true) { + for(i=0; i < MAX_PARALLEL_NUMBER; i++) { +#if _STDC_VERSION__ >= 201112L + _Bool inuse = false; + if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { +#else + if(blas_buffer_inuse[i] == false) { + blas_buffer_inuse[i] = true; +#endif + buf_index = i; + break; + } + } + if(i != MAX_PARALLEL_NUMBER) + break; + } + #pragma omp parallel for schedule(static) for (i = 0; i < num; i ++) { @@ -309,9 +344,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ queue[i].position = i; #endif - exec_threads(&queue[i]); + exec_threads(&queue[i], buf_index); } +#if _STDC_VERSION__ >= 201112L + atomic_store(&blas_buffer_inuse[buf_index], false); +#else + blas_buffer_inuse[buf_index] = false; +#endif + return 0; } From 894433a7c71fba89b41af08acdd8fea7b48cc666 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 27 Apr 2018 12:08:06 +0200 Subject: [PATCH 2/2] Update Makefile.rule --- Makefile.rule | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 0ce4c40a81..12734464bb 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -61,10 +61,10 @@ VERSION = 0.3.0.dev # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call -# OpenBLAS's caculation API in multi threads, please comment it in. -# This flag define how many OpenBLAS's caculation API can actually -# run in parallel. If more number threads call OpenBLAS's caculation API, -# it would wait former API finish. +# OpenBLAS's calculation API from multi threads, please comment it in. +# This flag defines how many instances of OpenBLAS's calculation API can +# actually run in parallel. If more threads call OpenBLAS's calculation API, +# they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 # if you don't need to install the static library, please comment it in.