@@ -115,8 +115,6 @@ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
115
115
116
116
int blas_omp_threads_local = 1 ;
117
117
118
- static void * blas_thread_buffer [MAX_CPU_NUMBER ];
119
-
120
118
/* Local Variables */
121
119
#if defined(USE_PTHREAD_LOCK )
122
120
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER ;
@@ -192,10 +190,6 @@ static int main_status[MAX_CPU_NUMBER];
192
190
BLASLONG exit_time [MAX_CPU_NUMBER ];
193
191
#endif
194
192
195
- //Prototypes
196
- static void exec_threads (int , blas_queue_t * , int );
197
- static void adjust_thread_buffers ();
198
-
199
193
static void legacy_exec (void * func , int mode , blas_arg_t * args , void * sb ){
200
194
201
195
if (!(mode & BLAS_COMPLEX )){
@@ -381,6 +375,7 @@ static void* blas_thread_server(void *arg){
381
375
/* Thread identifier */
382
376
BLASLONG cpu = (BLASLONG )arg ;
383
377
unsigned int last_tick ;
378
+ void * buffer , * sa , * sb ;
384
379
blas_queue_t * queue ;
385
380
386
381
blas_queue_t * tscq ;
@@ -400,6 +395,8 @@ blas_queue_t *tscq;
400
395
main_status [cpu ] = MAIN_ENTER ;
401
396
#endif
402
397
398
+ buffer = blas_memory_alloc (2 );
399
+
403
400
#ifdef SMP_DEBUG
404
401
fprintf (STDERR , "Server[%2ld] Thread has just been spawned!\n" , cpu );
405
402
#endif
@@ -460,8 +457,92 @@ blas_queue_t *tscq;
460
457
#endif
461
458
462
459
if (queue ) {
460
+ int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = (int (* )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ))queue -> routine ;
463
461
464
- exec_threads (cpu , queue , 0 );
462
+ atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )1 );
463
+
464
+ sa = queue -> sa ;
465
+ sb = queue -> sb ;
466
+
467
+ #ifdef SMP_DEBUG
468
+ if (queue -> args ) {
469
+ fprintf (STDERR , "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n" ,
470
+ cpu , queue -> mode , queue -> args -> m , queue -> args -> n , queue -> args -> k );
471
+ }
472
+ #endif
473
+
474
+ #ifdef CONSISTENT_FPCSR
475
+ #ifdef __aarch64__
476
+ __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode ));
477
+ #else
478
+ __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode ));
479
+ __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode ));
480
+ #endif
481
+ #endif
482
+
483
+ #ifdef MONITOR
484
+ main_status [cpu ] = MAIN_RUNNING1 ;
485
+ #endif
486
+
487
+ if (sa == NULL ) sa = (void * )((BLASLONG )buffer + GEMM_OFFSET_A );
488
+
489
+ if (sb == NULL ) {
490
+ if (!(queue -> mode & BLAS_COMPLEX )){
491
+ #ifdef EXPRECISION
492
+ if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
493
+ sb = (void * )(((BLASLONG )sa + ((QGEMM_P * QGEMM_Q * sizeof (xdouble )
494
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
495
+ } else
496
+ #endif
497
+ if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ) {
498
+ #ifdef BUILD_DOUBLE
499
+ sb = (void * )(((BLASLONG )sa + ((DGEMM_P * DGEMM_Q * sizeof (double )
500
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
501
+ #endif
502
+ } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
503
+ #ifdef BUILD_SINGLE
504
+ sb = (void * )(((BLASLONG )sa + ((SGEMM_P * SGEMM_Q * sizeof (float )
505
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
506
+ #endif
507
+ } else {
508
+ /* Other types in future */
509
+ }
510
+ } else {
511
+ #ifdef EXPRECISION
512
+ if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
513
+ sb = (void * )(((BLASLONG )sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof (xdouble )
514
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
515
+ } else
516
+ #endif
517
+ if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ){
518
+ #ifdef BUILD_COMPLEX16
519
+ sb = (void * )(((BLASLONG )sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof (double )
520
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
521
+ #endif
522
+ } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
523
+ #ifdef BUILD_COMPLEX
524
+ sb = (void * )(((BLASLONG )sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof (float )
525
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
526
+ #endif
527
+ } else {
528
+ /* Other types in future */
529
+ }
530
+ }
531
+ queue -> sb = sb ;
532
+ }
533
+
534
+ #ifdef MONITOR
535
+ main_status [cpu ] = MAIN_RUNNING2 ;
536
+ #endif
537
+
538
+ if (queue -> mode & BLAS_LEGACY ) {
539
+ legacy_exec (routine , queue -> mode , queue -> args , sb );
540
+ } else
541
+ if (queue -> mode & BLAS_PTHREAD ) {
542
+ void (* pthreadcompat )(void * ) = (void (* )(void * ))queue -> routine ;
543
+ (pthreadcompat )(queue -> args );
544
+ } else
545
+ (routine )(queue -> args , queue -> range_m , queue -> range_n , sa , sb , queue -> position );
465
546
466
547
#ifdef SMP_DEBUG
467
548
fprintf (STDERR , "Server[%2ld] Calculation finished!\n" , cpu );
@@ -476,7 +557,7 @@ blas_queue_t *tscq;
476
557
MB ;
477
558
atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )0 );
478
559
479
-
560
+
480
561
}
481
562
482
563
#ifdef MONITOR
@@ -499,6 +580,8 @@ blas_queue_t *tscq;
499
580
fprintf (STDERR , "Server[%2ld] Shutdown!\n" , cpu );
500
581
#endif
501
582
583
+ blas_memory_free (buffer );
584
+
502
585
//pthread_exit(NULL);
503
586
504
587
return NULL ;
@@ -580,9 +663,6 @@ int blas_thread_init(void){
580
663
581
664
LOCK_COMMAND (& server_lock );
582
665
583
- // Adjust thread buffers
584
- adjust_thread_buffers ();
585
-
586
666
if (!blas_server_avail ){
587
667
588
668
thread_timeout_env = openblas_thread_timeout ();
@@ -813,18 +893,6 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
813
893
fprintf (STDERR , "Exec_blas is called. Number of executing threads : %ld\n" , num );
814
894
#endif
815
895
816
- //Redirect to caller's callback routine
817
- if (openblas_threads_callback_ ) {
818
- int buf_index = 0 ;
819
- #ifndef USE_SIMPLE_THREADED_LEVEL3
820
- for (int i = 0 ; i < num ; i ++ )
821
- queue [i ].position = i ;
822
- #endif
823
- openblas_threads_callback_ (1 , (openblas_dojob_callback ) exec_threads , num , sizeof (blas_queue_t ), (void * ) queue , buf_index );
824
- return 0 ;
825
- }
826
-
827
-
828
896
#ifdef __ELF__
829
897
if (omp_in_parallel && (num > 1 )) {
830
898
if (omp_in_parallel () > 0 ) {
@@ -998,14 +1066,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
998
1066
999
1067
LOCK_COMMAND (& server_lock );
1000
1068
1001
- //Free buffers allocated for threads
1002
- for (i = 0 ; i < MAX_CPU_NUMBER ; i ++ ){
1003
- if (blas_thread_buffer [i ]!= NULL ){
1004
- blas_memory_free (blas_thread_buffer [i ]);
1005
- blas_thread_buffer [i ]= NULL ;
1006
- }
1007
- }
1008
-
1009
1069
if (blas_server_avail ) {
1010
1070
1011
1071
for (i = 0 ; i < blas_num_threads - 1 ; i ++ ) {
@@ -1042,118 +1102,4 @@ int BLASFUNC(blas_thread_shutdown)(void){
1042
1102
return 0 ;
1043
1103
}
1044
1104
1045
- static void adjust_thread_buffers () {
1046
-
1047
- int i = 0 ;
1048
-
1049
- //adjust buffer for each thread
1050
- for (i = 0 ; i < blas_cpu_number ; i ++ ){
1051
- if (blas_thread_buffer [i ] == NULL ){
1052
- blas_thread_buffer [i ] = blas_memory_alloc (2 );
1053
- }
1054
- }
1055
- for (; i < MAX_CPU_NUMBER ; i ++ ){
1056
- if (blas_thread_buffer [i ] != NULL ){
1057
- blas_memory_free (blas_thread_buffer [i ]);
1058
- blas_thread_buffer [i ] = NULL ;
1059
- }
1060
- }
1061
- }
1062
-
1063
- static void exec_threads (int cpu , blas_queue_t * queue , int buf_index )
1064
- {
1065
-
1066
- void * buffer , * sa , * sb ;
1067
-
1068
- buffer = blas_thread_buffer [cpu ];
1069
-
1070
- int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = (int (* )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ))queue -> routine ;
1071
-
1072
- atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )1 );
1073
-
1074
- sa = queue -> sa ;
1075
- sb = queue -> sb ;
1076
-
1077
- #ifdef SMP_DEBUG
1078
- if (queue -> args ) {
1079
- fprintf (STDERR , "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n" ,
1080
- cpu , queue -> mode , queue -> args -> m , queue -> args -> n , queue -> args -> k );
1081
- }
1082
- #endif
1083
-
1084
- #ifdef CONSISTENT_FPCSR
1085
- #ifdef __aarch64__
1086
- __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode ));
1087
- #else
1088
- __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode ));
1089
- __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode ));
1090
- #endif
1091
- #endif
1092
-
1093
- #ifdef MONITOR
1094
- main_status [cpu ] = MAIN_RUNNING1 ;
1095
- #endif
1096
-
1097
- if (sa == NULL ) sa = (void * )((BLASLONG )buffer + GEMM_OFFSET_A );
1098
-
1099
- if (sb == NULL ) {
1100
- if (!(queue -> mode & BLAS_COMPLEX )){
1101
- #ifdef EXPRECISION
1102
- if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
1103
- sb = (void * )(((BLASLONG )sa + ((QGEMM_P * QGEMM_Q * sizeof (xdouble )
1104
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1105
- } else
1106
- #endif
1107
- if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ) {
1108
- #ifdef BUILD_DOUBLE
1109
- sb = (void * )(((BLASLONG )sa + ((DGEMM_P * DGEMM_Q * sizeof (double )
1110
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1111
1105
#endif
1112
- } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
1113
- #ifdef BUILD_SINGLE
1114
- sb = (void * )(((BLASLONG )sa + ((SGEMM_P * SGEMM_Q * sizeof (float )
1115
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1116
- #endif
1117
- } else {
1118
- /* Other types in future */
1119
- }
1120
- } else {
1121
- #ifdef EXPRECISION
1122
- if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
1123
- sb = (void * )(((BLASLONG )sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof (xdouble )
1124
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1125
- } else
1126
- #endif
1127
- if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ){
1128
- #ifdef BUILD_COMPLEX16
1129
- sb = (void * )(((BLASLONG )sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof (double )
1130
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1131
- #endif
1132
- } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
1133
- #ifdef BUILD_COMPLEX
1134
- sb = (void * )(((BLASLONG )sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof (float )
1135
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1136
- #endif
1137
- } else {
1138
- /* Other types in future */
1139
- }
1140
- }
1141
- queue -> sb = sb ;
1142
- }
1143
-
1144
- #ifdef MONITOR
1145
- main_status [cpu ] = MAIN_RUNNING2 ;
1146
- #endif
1147
-
1148
- if (queue -> mode & BLAS_LEGACY ) {
1149
- legacy_exec (routine , queue -> mode , queue -> args , sb );
1150
- } else
1151
- if (queue -> mode & BLAS_PTHREAD ) {
1152
- void (* pthreadcompat )(void * ) = (void (* )(void * ))queue -> routine ;
1153
- (pthreadcompat )(queue -> args );
1154
- } else
1155
- (routine )(queue -> args , queue -> range_m , queue -> range_n , sa , sb , queue -> position );
1156
-
1157
- }
1158
-
1159
- #endif
0 commit comments