From 7d1becc575d436039f1484259a10413aade9cda9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 14:18:36 +0200 Subject: [PATCH 1/7] Allocate an auxiliary struct when running out of preconfigured threads --- driver/others/memory.c | 145 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 3 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 460a3d557d..377e073ee3 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2636,8 +2636,25 @@ static volatile struct { } memory[NUM_BUFFERS]; -static int memory_initialized = 0; +static volatile struct newmemstruct +{ + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif +}; +static volatile struct newmemstruct *newmemory; + +static int memory_initialized = 0; +static int memory_overflowed = 0; /* Memory allocation routine */ /* procpos ... indicates where it comes from */ /* 0 : Level 3 functions */ @@ -2779,6 +2796,29 @@ void *blas_memory_alloc(int procpos){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + if (memory_overflowed) { +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + do { + RMB; +#if defined(USE_OPENMP) + if (!newmemory[position-NUM_BUFFERS].used) { + blas_lock(&newmemory[position-NUM_BUFFERS].lock); +#endif + if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; + +#if defined(USE_OPENMP) + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); + } +#endif + position ++; + + } while (position < 512+NUM_BUFFERS); +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif +} goto error; allocation : @@ -2883,6 +2923,90 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: + if (memory_overflowed) goto terminate; + printf("num_buffers exceeded, adding auxiliary array\n"); + memory_overflowed=1; + newmemory= (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (int i=0;i<512;i++) { + newmemory[i].addr = (void *)0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[i].pos = -1; +#endif + newmemory[i].used = 0; + newmemory[i].lock = 0; +} + newmemory[position-NUM_BUFFERS].used = 1; + +allocation2: + newmemory[position-NUM_BUFFERS].used = 1; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#else + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); +#endif + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } + +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + newmemory[position-NUM_BUFFERS].addr = map_address; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +//#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); +//#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos; + +#endif + return (void *)newmemory[position-NUM_BUFFERS].addr; + +terminate: printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); @@ -2907,13 +3031,28 @@ void blas_memory_free(void *free_area){ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; - if (position >= NUM_BUFFERS) goto error; + if (position >= NUM_BUFFERS && !memory_overflowed) goto error; #ifdef DEBUG if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif + if (memory_overflowed) { + while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) + position++; + // arm: ensure all writes are finished before other thread takes this memory + WMB; + newmemory[position].used = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +//#ifdef DEBUG + printf("Unmap from overflow area succeeded.\n\n"); +//#endif + return; +} else { // arm: ensure all writes are finished before other thread takes this memory WMB; @@ -2927,7 +3066,7 @@ void blas_memory_free(void *free_area){ #endif return; - +} error: printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); From b4b952eece8344fe5d7adf2352791ab81d0d1d8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 17:03:53 +0200 Subject: [PATCH 2/7] Add auxiliary tracking space for thread buffer frees too --- driver/others/memory.c | 68 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 377e073ee3..d4fdfa4659 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2060,6 +2060,7 @@ struct release_t { int hugetlb_allocated = 0; static struct release_t release_info[NUM_BUFFERS]; +static struct release_t *new_release_info; static int release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -2110,8 +2111,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2274,8 +2280,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + { else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2307,8 +2318,13 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free; + } release_pos ++; } @@ -2341,8 +2357,13 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free; + } release_pos ++; } @@ -2370,8 +2391,13 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free; + } release_pos ++; } @@ -2414,9 +2440,15 @@ static void *alloc_devicedirver(void *address){ fd, 0); if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free; + } release_pos ++; } @@ -2450,9 +2482,15 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = shmid; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free; + } release_pos ++; } @@ -2556,8 +2594,13 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free; + } release_pos ++; } @@ -2604,9 +2647,15 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { + if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free; + } release_pos ++; } @@ -2663,6 +2712,8 @@ static int memory_overflowed = 0; void *blas_memory_alloc(int procpos){ + int i; + int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) int mypos = 0; @@ -2926,8 +2977,9 @@ void *blas_memory_alloc(int procpos){ if (memory_overflowed) goto terminate; printf("num_buffers exceeded, adding auxiliary array\n"); memory_overflowed=1; - newmemory= (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); - for (int i=0;i<512;i++) { + new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); + newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (i = 0; i < 512; i++) { newmemory[i].addr = (void *)0; #if defined(WHEREAMI) && !defined(USE_OPENMP) newmemory[i].pos = -1; @@ -3101,7 +3153,10 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { + if (pos < NUM_BUFFERS) release_info[pos].func(&release_info[pos]); + else + new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); } #ifdef SEEK_ADDRESS @@ -3118,6 +3173,15 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } + if (memory_overflowed) + for (pos = 0; pos < 512; pos ++){ + newmemory[pos].addr = (void *)0; + newmemory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[pos].pos = -1; +#endif + newmemory[pos].lock = 0; + } UNLOCK_COMMAND(&alloc_lock); From 2ba9a567aaaac875be19a76009853b2ee4597dbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 28 Aug 2021 17:14:59 +0200 Subject: [PATCH 3/7] Fix typo --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index d4fdfa4659..3825e83aee 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2283,7 +2283,7 @@ static void *alloc_mmap(void *address){ if (release_pos < NUM_BUFFERS) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; - { else { + } else { new_release_info[release_pos-NUM_BUFFERS].address = map_address; new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; } From 7fd12a5e69164b62dad7fbddf1581d941e5339fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Aug 2021 13:54:51 +0200 Subject: [PATCH 4/7] Add likely() hints for gcc --- driver/others/memory.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 3825e83aee..689aba9423 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,6 +73,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#else +#define likely(x) (x) +#endif +#endif + #if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS @@ -2111,7 +2119,7 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; } else { @@ -2280,7 +2288,7 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; } else { @@ -2318,7 +2326,7 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; } else { @@ -2357,7 +2365,7 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; } else { @@ -2391,7 +2399,7 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; } else { @@ -2440,7 +2448,7 @@ static void *alloc_devicedirver(void *address){ fd, 0); if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; @@ -2482,7 +2490,7 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; @@ -2594,7 +2602,7 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; } else { @@ -2647,7 +2655,7 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { - if (release_pos < NUM_BUFFERS) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; @@ -3153,7 +3161,7 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { - if (pos < NUM_BUFFERS) + if (likely(pos < NUM_BUFFERS)) release_info[pos].func(&release_info[pos]); else new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); From 89fc5b8f4f1c56b50896773e667c3a215342e49c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Aug 2021 19:50:24 +0200 Subject: [PATCH 5/7] Fix unmap logic --- driver/others/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 689aba9423..1f66ef9e97 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -76,8 +76,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef likely #ifdef __GNUC__ #define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) #else #define likely(x) (x) +#define unlikely(x) (x) #endif #endif @@ -3097,7 +3099,7 @@ void blas_memory_free(void *free_area){ if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif - if (memory_overflowed) { + if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) position++; // arm: ensure all writes are finished before other thread takes this memory From 2db1a99aca0177761f47daa71b27450923eb127e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Aug 2021 14:21:25 +0200 Subject: [PATCH 6/7] Clean up debug messages --- driver/others/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 1f66ef9e97..c560c4e904 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2985,7 +2985,7 @@ void *blas_memory_alloc(int procpos){ error: if (memory_overflowed) goto terminate; - printf("num_buffers exceeded, adding auxiliary array\n"); + fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n") memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); @@ -3057,9 +3057,9 @@ void *blas_memory_alloc(int procpos){ UNLOCK_COMMAND(&alloc_lock); #endif -//#ifdef DEBUG +#ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); -//#endif +#endif #if defined(WHEREAMI) && !defined(USE_OPENMP) @@ -3110,9 +3110,9 @@ void blas_memory_free(void *free_area){ UNLOCK_COMMAND(&alloc_lock); #endif -//#ifdef DEBUG +#ifdef DEBUG printf("Unmap from overflow area succeeded.\n\n"); -//#endif +#endif return; } else { // arm: ensure all writes are finished before other thread takes this memory From cd10d1c03be5ecbdf8bda6e448a6cac27f8aa1be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Aug 2021 14:38:28 +0200 Subject: [PATCH 7/7] Fix typo --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index c560c4e904..48067923e3 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2985,7 +2985,7 @@ void *blas_memory_alloc(int procpos){ error: if (memory_overflowed) goto terminate; - fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n") + fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); memory_overflowed=1; new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));