@@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
73
73
74
74
#include "common.h"
75
75
76
/*
 * Branch-prediction hints.
 *
 * likely(x) and unlikely(x) are nonzero exactly when x is nonzero, so they
 * can wrap any condition without changing its meaning. When __GNUC__ is
 * defined they expand to __builtin_expect, telling the compiler which
 * outcome is expected; otherwise they expand to the plain expression.
 *
 * Each macro carries its own #ifndef guard: a definition of one of them
 * that is already in scope (for instance from an included header) must
 * neither leave the other one undefined nor be redefined here.
 */
#ifndef likely
#ifdef __GNUC__
#define likely(x) __builtin_expect(!!(x), 1)
#else
#define likely(x) (x)
#endif
#endif

#ifndef unlikely
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif
85
+
76
86
#if defined(USE_TLS ) && defined(SMP )
77
87
#define COMPILE_TLS
78
88
@@ -2060,6 +2070,7 @@ struct release_t {
2060
2070
/* Flag set to 1 in blas_memory_alloc when alloc_hugetlb returns a mapping. */
int hugetlb_allocated = 0 ;
2061
2071
2062
2072
/* Fixed-size release table: each alloc_* routine records the mapped address,
   its matching *_free callback (and, for some allocators, an attr value)
   at index release_pos while release_pos < NUM_BUFFERS. */
static struct release_t release_info [NUM_BUFFERS ];
2073
/* Overflow release table, used once release_pos reaches NUM_BUFFERS and
   indexed by (release_pos - NUM_BUFFERS). It is allocated with malloc
   (512 entries) in blas_memory_alloc the first time the fixed tables are
   exhausted; it is not valid before that point. */
+ static struct release_t * new_release_info ;
2063
2074
/* Number of entries recorded so far across both release tables; incremented
   after every successful record and used as the loop bound in blas_shutdown. */
static int release_pos = 0 ;
2064
2075
2065
2076
#if defined(OS_LINUX ) && !defined(NO_WARMUP )
@@ -2110,8 +2121,13 @@ static void *alloc_mmap(void *address){
2110
2121
#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2111
2122
LOCK_COMMAND (& alloc_lock );
2112
2123
#endif
2124
+ if (likely (release_pos < NUM_BUFFERS )) {
2113
2125
release_info [release_pos ].address = map_address ;
2114
2126
release_info [release_pos ].func = alloc_mmap_free ;
2127
+ } else {
2128
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2129
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_mmap_free ;
2130
+ }
2115
2131
release_pos ++ ;
2116
2132
#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2117
2133
UNLOCK_COMMAND (& alloc_lock );
@@ -2274,8 +2290,13 @@ static void *alloc_mmap(void *address){
2274
2290
#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2275
2291
LOCK_COMMAND (& alloc_lock );
2276
2292
#endif
2293
+ if (likely (release_pos < NUM_BUFFERS )) {
2277
2294
release_info [release_pos ].address = map_address ;
2278
2295
release_info [release_pos ].func = alloc_mmap_free ;
2296
+ } else {
2297
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2298
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_mmap_free ;
2299
+ }
2279
2300
release_pos ++ ;
2280
2301
#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2281
2302
UNLOCK_COMMAND (& alloc_lock );
@@ -2307,8 +2328,13 @@ static void *alloc_malloc(void *address){
2307
2328
if (map_address == (void * )NULL ) map_address = (void * )-1 ;
2308
2329
2309
2330
if (map_address != (void * )-1 ) {
2331
+ if (likely (release_pos < NUM_BUFFERS )) {
2310
2332
release_info [release_pos ].address = map_address ;
2311
2333
release_info [release_pos ].func = alloc_malloc_free ;
2334
+ } else {
2335
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2336
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_malloc_free ;
2337
+ }
2312
2338
release_pos ++ ;
2313
2339
}
2314
2340
@@ -2341,8 +2367,13 @@ static void *alloc_qalloc(void *address){
2341
2367
if (map_address == (void * )NULL ) map_address = (void * )-1 ;
2342
2368
2343
2369
if (map_address != (void * )-1 ) {
2370
+ if (likely (release_pos < NUM_BUFFERS )) {
2344
2371
release_info [release_pos ].address = map_address ;
2345
2372
release_info [release_pos ].func = alloc_qalloc_free ;
2373
+ } else {
2374
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2375
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_qalloc_free ;
2376
+ }
2346
2377
release_pos ++ ;
2347
2378
}
2348
2379
@@ -2370,8 +2401,13 @@ static void *alloc_windows(void *address){
2370
2401
if (map_address == (void * )NULL ) map_address = (void * )-1 ;
2371
2402
2372
2403
if (map_address != (void * )-1 ) {
2404
+ if (likely (release_pos < NUM_BUFFERS )) {
2373
2405
release_info [release_pos ].address = map_address ;
2374
2406
release_info [release_pos ].func = alloc_windows_free ;
2407
+ } else {
2408
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2409
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_windows_free ;
2410
+ }
2375
2411
release_pos ++ ;
2376
2412
}
2377
2413
@@ -2414,9 +2450,15 @@ static void *alloc_devicedirver(void *address){
2414
2450
fd , 0 );
2415
2451
2416
2452
if (map_address != (void * )-1 ) {
2453
+ if (likely (release_pos < NUM_BUFFERS )) {
2417
2454
release_info [release_pos ].address = map_address ;
2418
2455
release_info [release_pos ].attr = fd ;
2419
2456
release_info [release_pos ].func = alloc_devicedirver_free ;
2457
+ } else {
2458
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2459
+ new_release_info [release_pos - NUM_BUFFERS ].attr = fd ;
2460
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_devicedirver_free ;
2461
+ }
2420
2462
release_pos ++ ;
2421
2463
}
2422
2464
@@ -2450,9 +2492,15 @@ static void *alloc_shm(void *address){
2450
2492
2451
2493
shmctl (shmid , IPC_RMID , 0 );
2452
2494
2495
+ if (likely (release_pos < NUM_BUFFERS )) {
2453
2496
release_info [release_pos ].address = map_address ;
2454
2497
release_info [release_pos ].attr = shmid ;
2455
2498
release_info [release_pos ].func = alloc_shm_free ;
2499
+ } else {
2500
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2501
+ new_release_info [release_pos - NUM_BUFFERS ].attr = shmid ;
2502
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_shm_free ;
2503
+ }
2456
2504
release_pos ++ ;
2457
2505
}
2458
2506
@@ -2556,8 +2604,13 @@ static void *alloc_hugetlb(void *address){
2556
2604
#endif
2557
2605
2558
2606
if (map_address != (void * )-1 ){
2607
+ if (likely (release_pos < NUM_BUFFERS )) {
2559
2608
release_info [release_pos ].address = map_address ;
2560
2609
release_info [release_pos ].func = alloc_hugetlb_free ;
2610
+ } else {
2611
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2612
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_hugetlb_free ;
2613
+ }
2561
2614
release_pos ++ ;
2562
2615
}
2563
2616
@@ -2604,9 +2657,15 @@ static void *alloc_hugetlbfile(void *address){
2604
2657
fd , 0 );
2605
2658
2606
2659
if (map_address != (void * )-1 ) {
2660
+ if (likely (release_pos < NUM_BUFFERS )) {
2607
2661
release_info [release_pos ].address = map_address ;
2608
2662
release_info [release_pos ].attr = fd ;
2609
2663
release_info [release_pos ].func = alloc_hugetlbfile_free ;
2664
+ } else {
2665
+ new_release_info [release_pos - NUM_BUFFERS ].address = map_address ;
2666
+ new_release_info [release_pos - NUM_BUFFERS ].attr = fd ;
2667
+ new_release_info [release_pos - NUM_BUFFERS ].func = alloc_hugetlbfile_free ;
2668
+ }
2610
2669
release_pos ++ ;
2611
2670
}
2612
2671
@@ -2636,8 +2695,25 @@ static volatile struct {
2636
2695
2637
2696
} memory [NUM_BUFFERS ];
2638
2697
2639
- static int memory_initialized = 0 ;
2698
+ static volatile struct newmemstruct
2699
+ {
2700
+ BLASULONG lock ;
2701
+ void * addr ;
2702
+ #if defined(WHEREAMI ) && !defined(USE_OPENMP )
2703
+ int pos ;
2704
+ #endif
2705
+ int used ;
2706
+ #ifndef __64BIT__
2707
+ char dummy [48 ];
2708
+ #else
2709
+ char dummy [40 ];
2710
+ #endif
2711
+
2712
+ } ;
2713
+ static volatile struct newmemstruct * newmemory ;
2640
2714
2715
+ static int memory_initialized = 0 ;
2716
+ static int memory_overflowed = 0 ;
2641
2717
/* Memory allocation routine */
2642
2718
/* procpos ... indicates where it comes from */
2643
2719
/* 0 : Level 3 functions */
@@ -2646,6 +2722,8 @@ static int memory_initialized = 0;
2646
2722
2647
2723
void * blas_memory_alloc (int procpos ){
2648
2724
2725
+ int i ;
2726
+
2649
2727
int position ;
2650
2728
#if defined(WHEREAMI ) && !defined(USE_OPENMP )
2651
2729
int mypos = 0 ;
@@ -2779,6 +2857,29 @@ void *blas_memory_alloc(int procpos){
2779
2857
#if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2780
2858
UNLOCK_COMMAND (& alloc_lock );
2781
2859
#endif
2860
+ if (memory_overflowed ) {
2861
+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2862
+ LOCK_COMMAND (& alloc_lock );
2863
+ #endif
2864
+ do {
2865
+ RMB ;
2866
+ #if defined(USE_OPENMP )
2867
+ if (!newmemory [position - NUM_BUFFERS ].used ) {
2868
+ blas_lock (& newmemory [position - NUM_BUFFERS ].lock );
2869
+ #endif
2870
+ if (!newmemory [position - NUM_BUFFERS ].used ) goto allocation2 ;
2871
+
2872
+ #if defined(USE_OPENMP )
2873
+ blas_unlock (& newmemory [position - NUM_BUFFERS ].lock );
2874
+ }
2875
+ #endif
2876
+ position ++ ;
2877
+
2878
+ } while (position < 512 + NUM_BUFFERS );
2879
+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
2880
+ UNLOCK_COMMAND (& alloc_lock );
2881
+ #endif
2882
+ }
2782
2883
goto error ;
2783
2884
2784
2885
allocation :
@@ -2883,6 +2984,91 @@ void *blas_memory_alloc(int procpos){
2883
2984
return (void * )memory [position ].addr ;
2884
2985
2885
2986
error :
2987
+ if (memory_overflowed ) goto terminate ;
2988
+ fprintf (stderr ,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n" );
2989
+ memory_overflowed = 1 ;
2990
+ new_release_info = (struct release_t * ) malloc (512 * sizeof (struct release_t ));
2991
+ newmemory = (struct newmemstruct * ) malloc (512 * sizeof (struct newmemstruct ));
2992
+ for (i = 0 ; i < 512 ; i ++ ) {
2993
+ newmemory [i ].addr = (void * )0 ;
2994
+ #if defined(WHEREAMI ) && !defined(USE_OPENMP )
2995
+ newmemory [i ].pos = -1 ;
2996
+ #endif
2997
+ newmemory [i ].used = 0 ;
2998
+ newmemory [i ].lock = 0 ;
2999
+ }
3000
+ newmemory [position - NUM_BUFFERS ].used = 1 ;
3001
+
3002
+ allocation2 :
3003
+ newmemory [position - NUM_BUFFERS ].used = 1 ;
3004
+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
3005
+ UNLOCK_COMMAND (& alloc_lock );
3006
+ #else
3007
+ blas_unlock (& newmemory [position - NUM_BUFFERS ].lock );
3008
+ #endif
3009
+ do {
3010
+ #ifdef DEBUG
3011
+ printf ("Allocation Start : %lx\n" , base_address );
3012
+ #endif
3013
+
3014
+ map_address = (void * )-1 ;
3015
+
3016
+ func = & memoryalloc [0 ];
3017
+
3018
+ while ((func != NULL ) && (map_address == (void * ) -1 )) {
3019
+
3020
+ map_address = (* func )((void * )base_address );
3021
+
3022
+ #ifdef ALLOC_DEVICEDRIVER
3023
+ if ((* func == alloc_devicedirver ) && (map_address == (void * )-1 )) {
3024
+ fprintf (stderr , "OpenBLAS Warning ... Physically contiguous allocation was failed.\n" );
3025
+ }
3026
+ #endif
3027
+
3028
+ #ifdef ALLOC_HUGETLBFILE
3029
+ if ((* func == alloc_hugetlbfile ) && (map_address == (void * )-1 )) {
3030
+ #ifndef OS_WINDOWS
3031
+ fprintf (stderr , "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n" );
3032
+ #endif
3033
+ }
3034
+ #endif
3035
+
3036
+ #if (defined ALLOC_SHM ) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS )
3037
+ if ((* func == alloc_hugetlb ) && (map_address != (void * )-1 )) hugetlb_allocated = 1 ;
3038
+ #endif
3039
+
3040
+ func ++ ;
3041
+ }
3042
+
3043
+ #ifdef DEBUG
3044
+ printf (" Success -> %08lx\n" , map_address );
3045
+ #endif
3046
+ if (((BLASLONG ) map_address ) == -1 ) base_address = 0UL ;
3047
+
3048
+ if (base_address ) base_address += BUFFER_SIZE + FIXED_PAGESIZE ;
3049
+
3050
+ } while ((BLASLONG )map_address == -1 );
3051
+
3052
+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
3053
+ LOCK_COMMAND (& alloc_lock );
3054
+ #endif
3055
+ newmemory [position - NUM_BUFFERS ].addr = map_address ;
3056
+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
3057
+ UNLOCK_COMMAND (& alloc_lock );
3058
+ #endif
3059
+
3060
+ #ifdef DEBUG
3061
+ printf (" Mapping Succeeded. %p(%d)\n" , (void * )newmemory [position - NUM_BUFFERS ].addr , position );
3062
+ #endif
3063
+
3064
+ #if defined(WHEREAMI ) && !defined(USE_OPENMP )
3065
+
3066
+ if (newmemory [position - NUM_BUFFERS ].pos == -1 ) newmemory [position - NUM_BUFFERS ].pos = mypos ;
3067
+
3068
+ #endif
3069
+ return (void * )newmemory [position - NUM_BUFFERS ].addr ;
3070
+
3071
+ terminate :
2886
3072
printf ("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n" );
2887
3073
printf ("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n" , NUM_BUFFERS );
2888
3074
printf ("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n" );
@@ -2907,13 +3093,28 @@ void blas_memory_free(void *free_area){
2907
3093
while ((position < NUM_BUFFERS ) && (memory [position ].addr != free_area ))
2908
3094
position ++ ;
2909
3095
2910
- if (position >= NUM_BUFFERS ) goto error ;
3096
+ if (position >= NUM_BUFFERS && ! memory_overflowed ) goto error ;
2911
3097
2912
3098
#ifdef DEBUG
2913
3099
if (memory [position ].addr != free_area ) goto error ;
2914
3100
printf (" Position : %d\n" , position );
2915
3101
#endif
3102
+ if (unlikely (memory_overflowed && position >= NUM_BUFFERS )) {
3103
+ while ((position < NUM_BUFFERS + 512 ) && (newmemory [position - NUM_BUFFERS ].addr != free_area ))
3104
+ position ++ ;
3105
+ // arm: ensure all writes are finished before other thread takes this memory
3106
+ WMB ;
3107
+
3108
+ newmemory [position ].used = 0 ;
3109
+ #if (defined(SMP ) || defined(USE_LOCKING )) && !defined(USE_OPENMP )
3110
+ UNLOCK_COMMAND (& alloc_lock );
3111
+ #endif
2916
3112
3113
+ #ifdef DEBUG
3114
+ printf ("Unmap from overflow area succeeded.\n\n" );
3115
+ #endif
3116
+ return ;
3117
+ } else {
2917
3118
// arm: ensure all writes are finished before other thread takes this memory
2918
3119
WMB ;
2919
3120
@@ -2927,7 +3128,7 @@ void blas_memory_free(void *free_area){
2927
3128
#endif
2928
3129
2929
3130
return ;
2930
-
3131
+ }
2931
3132
error :
2932
3133
printf ("BLAS : Bad memory unallocation! : %4d %p\n" , position , free_area );
2933
3134
@@ -2962,7 +3163,10 @@ void blas_shutdown(void){
2962
3163
LOCK_COMMAND (& alloc_lock );
2963
3164
2964
3165
for (pos = 0 ; pos < release_pos ; pos ++ ) {
3166
+ if (likely (pos < NUM_BUFFERS ))
2965
3167
release_info [pos ].func (& release_info [pos ]);
3168
+ else
3169
+ new_release_info [pos - NUM_BUFFERS ].func (& new_release_info [pos - NUM_BUFFERS ]);
2966
3170
}
2967
3171
2968
3172
#ifdef SEEK_ADDRESS
@@ -2979,6 +3183,15 @@ void blas_shutdown(void){
2979
3183
#endif
2980
3184
memory [pos ].lock = 0 ;
2981
3185
}
3186
+ if (memory_overflowed )
3187
+ for (pos = 0 ; pos < 512 ; pos ++ ){
3188
+ newmemory [pos ].addr = (void * )0 ;
3189
+ newmemory [pos ].used = 0 ;
3190
+ #if defined(WHEREAMI ) && !defined(USE_OPENMP )
3191
+ newmemory [pos ].pos = -1 ;
3192
+ #endif
3193
+ newmemory [pos ].lock = 0 ;
3194
+ }
2982
3195
2983
3196
UNLOCK_COMMAND (& alloc_lock );
2984
3197
0 commit comments