/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

+ #ifndef _Atomic
+ #define _Atomic volatile
+ #endif
+ #include <stdatomic.h>
+ #include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
//#include <sys/mman.h>
@@ -49,11 +54,12 @@

int blas_server_avail = 0;

- static void *blas_thread_buffer[MAX_CPU_NUMBER];
+ static void *blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
+ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];

void goto_set_num_threads(int num_threads) {

-   int i = 0;
+   int i = 0, j = 0;

  if (num_threads < 1) num_threads = blas_num_threads;

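Note on the two new declarations: the scratch buffers grow from one per-thread array into MAX_PARALLEL_NUMBER independent rows, each guarded by a C11 atomic_bool flag so that concurrent callers of exec_blas can each claim a private row (the claim/release loop is in the last hunk below). The `#ifndef _Atomic` guard reads as a fallback that degrades `_Atomic` to `volatile` on toolchains without usable C11 atomics. A minimal stand-alone model of the layout, with hypothetical POOLS/SLOTS constants standing in for MAX_PARALLEL_NUMBER/MAX_CPU_NUMBER (a sketch, not the OpenBLAS code):

```c
#include <stdatomic.h>
#include <stdbool.h>

#define POOLS 4   /* stand-in for MAX_PARALLEL_NUMBER */
#define SLOTS 8   /* stand-in for MAX_CPU_NUMBER      */

/* One row of scratch buffers per concurrent BLAS call,
 * one slot per OpenMP worker thread inside that call. */
static void *scratch[POOLS][SLOTS];

/* inuse[i] stays true while row i is owned by a running call;
 * statics are zero-initialized, so every row starts out free. */
static atomic_bool inuse[POOLS];
```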
@@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) {
  omp_set_num_threads(blas_cpu_number);

  //adjust buffer for each thread
- for (i = 0; i < blas_cpu_number; i++){
-   if (blas_thread_buffer[i] == NULL){
-     blas_thread_buffer[i] = blas_memory_alloc(2);
+ for (i = 0; i < MAX_PARALLEL_NUMBER; i++) {
+   for (j = 0; j < blas_cpu_number; j++){
+     if (blas_thread_buffer[i][j] == NULL){
+       blas_thread_buffer[i][j] = blas_memory_alloc(2);
+     }
    }
- }
- for (; i < MAX_CPU_NUMBER; i++){
-   if (blas_thread_buffer[i] != NULL){
-     blas_memory_free(blas_thread_buffer[i]);
-     blas_thread_buffer[i] = NULL;
+   for (; j < MAX_CPU_NUMBER; j++){
+     if (blas_thread_buffer[i][j] != NULL){
+       blas_memory_free(blas_thread_buffer[i][j]);
+       blas_thread_buffer[i][j] = NULL;
+     }
    }
  }
#if defined(ARCH_MIPS64)
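The adjusted loop above now maintains every pool row when the thread count changes: buffers for the first blas_cpu_number slots are allocated lazily, and any slots beyond that are freed. A stand-alone sketch of the same grow/shrink pattern, using malloc/free and made-up POOLS/SLOTS in place of blas_memory_alloc/blas_memory_free and the OpenBLAS constants:

```c
#include <stdlib.h>

#define POOLS 4   /* stand-in for MAX_PARALLEL_NUMBER */
#define SLOTS 8   /* stand-in for MAX_CPU_NUMBER      */

static void *pool[POOLS][SLOTS];

/* Ensure the first 'active' slots of every row hold a buffer,
 * and release whatever sits in the slots past 'active'. */
void resize_pool(int active) {
    for (int i = 0; i < POOLS; i++) {
        int j = 0;
        for (; j < active; j++)
            if (pool[i][j] == NULL) pool[i][j] = malloc(1024);
        for (; j < SLOTS; j++) {
            free(pool[i][j]);     /* free(NULL) is a no-op */
            pool[i][j] = NULL;
        }
    }
}

int main(void) {
    resize_pool(2);   /* grow: rows get buffers for 2 slots  */
    resize_pool(0);   /* shrink: everything is freed again   */
    return 0;
}
```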
@@ -92,30 +100,34 @@ void openblas_set_num_threads(int num_threads) {

int blas_thread_init(void){

-   int i = 0;
+   int i = 0, j = 0;

  blas_get_cpu_number();

  blas_server_avail = 1;

- for (i = 0; i < blas_num_threads; i++){
-   blas_thread_buffer[i] = blas_memory_alloc(2);
- }
- for (; i < MAX_CPU_NUMBER; i++){
-   blas_thread_buffer[i] = NULL;
+ for (i = 0; i < MAX_PARALLEL_NUMBER; i++) {
+   for (j = 0; j < blas_num_threads; j++){
+     blas_thread_buffer[i][j] = blas_memory_alloc(2);
+   }
+   for (; j < MAX_CPU_NUMBER; j++){
+     blas_thread_buffer[i][j] = NULL;
+   }
  }

  return 0;
}

int BLASFUNC(blas_thread_shutdown)(void){
-   int i = 0;
+   int i = 0, j = 0;
  blas_server_avail = 0;

- for (i = 0; i < MAX_CPU_NUMBER; i++){
-   if (blas_thread_buffer[i] != NULL){
-     blas_memory_free(blas_thread_buffer[i]);
-     blas_thread_buffer[i] = NULL;
+ for (i = 0; i < MAX_PARALLEL_NUMBER; i++) {
+   for (j = 0; j < MAX_CPU_NUMBER; j++){
+     if (blas_thread_buffer[i][j] != NULL){
+       blas_memory_free(blas_thread_buffer[i][j]);
+       blas_thread_buffer[i][j] = NULL;
+     }
    }
  }

@@ -206,7 +218,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
    }
  }

- static void exec_threads(blas_queue_t *queue){
+ static void exec_threads(blas_queue_t *queue, int buf_index){

  void *buffer, *sa, *sb;
  int pos = 0, release_flag = 0;
@@ -223,7 +235,7 @@ static void exec_threads(blas_queue_t *queue){
  if ((sa == NULL) && (sb == NULL) && ((queue->mode & BLAS_PTHREAD) == 0)) {

    pos = omp_get_thread_num();
-   buffer = blas_thread_buffer[pos];
+   buffer = blas_thread_buffer[buf_index][pos];

    //fallback
    if (buffer == NULL) {
@@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){

int exec_blas(BLASLONG num, blas_queue_t *queue){

-   BLASLONG i;
+   BLASLONG i, buf_index;

  if ((num <= 0) || (queue == NULL)) return 0;

@@ -302,16 +314,30 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
  }
#endif

+ while (true) {
+   for (i = 0; i < MAX_PARALLEL_NUMBER; i++) {
+     _Bool inuse = false;
+     if (atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
+       buf_index = i;
+       break;
+     }
+   }
+   if (i != MAX_PARALLEL_NUMBER)
+     break;
+ }
+
#pragma omp parallel for schedule(static)
  for (i = 0; i < num; i++) {

#ifndef USE_SIMPLE_THREADED_LEVEL3
    queue[i].position = i;
#endif

-   exec_threads(&queue[i]);
+   exec_threads(&queue[i], buf_index);
  }

+ atomic_store(&blas_buffer_inuse[buf_index], false);
+
  return 0;
}

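The new loop at the top of exec_blas is the core of the change: a call spins over the in-use flags, claims the first row it can flip from false to true with a compare-and-swap, runs the OpenMP region against that row, and releases it with atomic_store once the parallel for completes. A small self-contained sketch of the same claim/release pattern (hypothetical names; an illustration of the technique, not the OpenBLAS code):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define POOLS 4                   /* stand-in for MAX_PARALLEL_NUMBER */

static atomic_bool inuse[POOLS];  /* zero-initialized: every row free */

/* Spin until some row flips false -> true; return its index. */
static int claim_row(void) {
    for (;;) {
        for (int i = 0; i < POOLS; i++) {
            bool expected = false;
            /* A weak CAS may fail spuriously; the loops simply retry. */
            if (atomic_compare_exchange_weak(&inuse[i], &expected, true))
                return i;
        }
    }
}

static void release_row(int i) {
    atomic_store(&inuse[i], false);
}

int main(void) {
    int row = claim_row();
    printf("claimed row %d\n", row);
    release_row(row);
    return 0;
}
```

Design note: the wait is a busy spin, so a caller only burns cycles when more than MAX_PARALLEL_NUMBER exec_blas calls are in flight at the same time, and the flag is given back as soon as the parallel region finishes.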