36
36
/* or implied, of The University of Texas at Austin. */
37
37
/*********************************************************************/
38
38
39
/* C11 atomics are used when the compiler advertises C11 support.
 * The predefined macro is __STDC_VERSION__ (two leading underscores). */
#if __STDC_VERSION__ >= 201112L
/* NOTE(review): replacing the _Atomic keyword with volatile before including
 * a standard header is fragile; presumably a workaround for compilers that
 * reject _Atomic in OpenMP builds - confirm which toolchains still need it. */
#ifndef _Atomic
#define _Atomic volatile
#endif
#include <stdatomic.h>
#endif
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
//#include <sys/mman.h>
49
56
50
57
int blas_server_avail = 0 ;
51
58
52
- static void * blas_thread_buffer [MAX_CPU_NUMBER ];
59
+ static void * blas_thread_buffer [MAX_PARALLEL_NUMBER ][MAX_CPU_NUMBER ];
60
+ #if _STDC_VERSION__ >= 201112L
61
+ static atomic_bool blas_buffer_inuse [MAX_PARALLEL_NUMBER ];
62
+ #else
63
+ static _Bool blas_buffer_inuse [MAX_PARALLEL_NUMBER ];
64
+ #endif
53
65
54
66
void goto_set_num_threads (int num_threads ) {
55
67
56
- int i = 0 ;
68
+ int i = 0 , j = 0 ;
57
69
58
70
if (num_threads < 1 ) num_threads = blas_num_threads ;
59
71
@@ -68,15 +80,17 @@ void goto_set_num_threads(int num_threads) {
68
80
omp_set_num_threads (blas_cpu_number );
69
81
70
82
//adjust buffer for each thread
71
- for (i = 0 ; i < blas_cpu_number ; i ++ ){
72
- if (blas_thread_buffer [i ]== NULL ){
73
- blas_thread_buffer [i ]= blas_memory_alloc (2 );
83
+ for (i = 0 ; i < MAX_PARALLEL_NUMBER ; i ++ ) {
84
+ for (j = 0 ; j < blas_cpu_number ; j ++ ){
85
+ if (blas_thread_buffer [i ][j ]== NULL ){
86
+ blas_thread_buffer [i ][j ]= blas_memory_alloc (2 );
87
+ }
74
88
}
75
- }
76
- for (; i < MAX_CPU_NUMBER ; i ++ ){
77
- if (blas_thread_buffer [i ]!= NULL ){
78
- blas_memory_free ( blas_thread_buffer [i ]) ;
79
- blas_thread_buffer [ i ] = NULL ;
89
+ for (; j < MAX_CPU_NUMBER ; j ++ ){
90
+ if ( blas_thread_buffer [ i ][ j ] != NULL ){
91
+ blas_memory_free (blas_thread_buffer [i ][ j ]);
92
+ blas_thread_buffer [i ][ j ] = NULL ;
93
+ }
80
94
}
81
95
}
82
96
#if defined(ARCH_MIPS64 )
@@ -92,30 +106,34 @@ void openblas_set_num_threads(int num_threads) {
92
106
93
107
int blas_thread_init (void ){
94
108
95
- int i = 0 ;
109
+ int i = 0 , j = 0 ;
96
110
97
111
blas_get_cpu_number ();
98
112
99
113
blas_server_avail = 1 ;
100
114
101
- for (i = 0 ; i < blas_num_threads ; i ++ ){
102
- blas_thread_buffer [i ]= blas_memory_alloc (2 );
103
- }
104
- for (; i < MAX_CPU_NUMBER ; i ++ ){
105
- blas_thread_buffer [i ]= NULL ;
115
+ for (i = 0 ; i < MAX_PARALLEL_NUMBER ; i ++ ) {
116
+ for (j = 0 ; j < blas_num_threads ; j ++ ){
117
+ blas_thread_buffer [i ][j ]= blas_memory_alloc (2 );
118
+ }
119
+ for (; j < MAX_CPU_NUMBER ; j ++ ){
120
+ blas_thread_buffer [i ][j ]= NULL ;
121
+ }
106
122
}
107
123
108
124
return 0 ;
109
125
}
110
126
111
127
int BLASFUNC (blas_thread_shutdown )(void ){
112
- int i = 0 ;
128
+ int i = 0 , j = 0 ;
113
129
blas_server_avail = 0 ;
114
130
115
- for (i = 0 ; i < MAX_CPU_NUMBER ; i ++ ){
116
- if (blas_thread_buffer [i ]!= NULL ){
117
- blas_memory_free (blas_thread_buffer [i ]);
118
- blas_thread_buffer [i ]= NULL ;
131
+ for (i = 0 ; i < MAX_PARALLEL_NUMBER ; i ++ ) {
132
+ for (j = 0 ; j < MAX_CPU_NUMBER ; j ++ ){
133
+ if (blas_thread_buffer [i ][j ]!= NULL ){
134
+ blas_memory_free (blas_thread_buffer [i ][j ]);
135
+ blas_thread_buffer [i ][j ]= NULL ;
136
+ }
119
137
}
120
138
}
121
139
@@ -206,7 +224,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
206
224
}
207
225
}
208
226
209
- static void exec_threads (blas_queue_t * queue ){
227
+ static void exec_threads (blas_queue_t * queue , int buf_index ){
210
228
211
229
void * buffer , * sa , * sb ;
212
230
int pos = 0 , release_flag = 0 ;
@@ -223,7 +241,7 @@ static void exec_threads(blas_queue_t *queue){
223
241
if ((sa == NULL ) && (sb == NULL ) && ((queue -> mode & BLAS_PTHREAD ) == 0 )) {
224
242
225
243
pos = omp_get_thread_num ();
226
- buffer = blas_thread_buffer [pos ];
244
+ buffer = blas_thread_buffer [buf_index ][ pos ];
227
245
228
246
//fallback
229
247
if (buffer == NULL ) {
@@ -291,7 +309,7 @@ static void exec_threads(blas_queue_t *queue){
291
309
292
310
int exec_blas (BLASLONG num , blas_queue_t * queue ){
293
311
294
- BLASLONG i ;
312
+ BLASLONG i , buf_index ;
295
313
296
314
if ((num <= 0 ) || (queue == NULL )) return 0 ;
297
315
@@ -302,16 +320,39 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
302
320
}
303
321
#endif
304
322
323
+ while (true) {
324
+ for (i = 0 ; i < MAX_PARALLEL_NUMBER ; i ++ ) {
325
+ #if _STDC_VERSION__ >= 201112L
326
+ _Bool inuse = false;
327
+ if (atomic_compare_exchange_weak (& blas_buffer_inuse [i ], & inuse , true)) {
328
+ #else
329
+ if (blas_buffer_inuse [i ] == false) {
330
+ blas_buffer_inuse [i ] = true;
331
+ #endif
332
+ buf_index = i ;
333
+ break ;
334
+ }
335
+ }
336
+ if (i != MAX_PARALLEL_NUMBER )
337
+ break ;
338
+ }
339
+
305
340
#pragma omp parallel for schedule(static)
306
341
for (i = 0 ; i < num ; i ++ ) {
307
342
308
343
#ifndef USE_SIMPLE_THREADED_LEVEL3
309
344
queue [i ].position = i ;
310
345
#endif
311
346
312
- exec_threads (& queue [i ]);
347
+ exec_threads (& queue [i ], buf_index );
313
348
}
314
349
350
+ #if _STDC_VERSION__ >= 201112L
351
+ atomic_store (& blas_buffer_inuse [buf_index ], false);
352
+ #else
353
+ blas_buffer_inuse [buf_index ] = false;
354
+ #endif
355
+
315
356
return 0 ;
316
357
}
317
358
0 commit comments