@@ -115,6 +115,8 @@ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
115
115
116
116
int blas_omp_threads_local = 1 ;
117
117
118
+ static void * blas_thread_buffer [MAX_CPU_NUMBER ];
119
+
118
120
/* Local Variables */
119
121
#if defined(USE_PTHREAD_LOCK )
120
122
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER ;
@@ -190,6 +192,10 @@ static int main_status[MAX_CPU_NUMBER];
190
192
BLASLONG exit_time [MAX_CPU_NUMBER ];
191
193
#endif
192
194
195
+ //Prototypes
196
+ static void exec_threads (int , blas_queue_t * , int );
197
+ static void adjust_thread_buffers ();
198
+
193
199
static void legacy_exec (void * func , int mode , blas_arg_t * args , void * sb ){
194
200
195
201
if (!(mode & BLAS_COMPLEX )){
@@ -375,7 +381,6 @@ static void* blas_thread_server(void *arg){
375
381
/* Thread identifier */
376
382
BLASLONG cpu = (BLASLONG )arg ;
377
383
unsigned int last_tick ;
378
- void * buffer , * sa , * sb ;
379
384
blas_queue_t * queue ;
380
385
381
386
blas_queue_t * tscq ;
@@ -395,8 +400,6 @@ blas_queue_t *tscq;
395
400
main_status [cpu ] = MAIN_ENTER ;
396
401
#endif
397
402
398
- buffer = blas_memory_alloc (2 );
399
-
400
403
#ifdef SMP_DEBUG
401
404
fprintf (STDERR , "Server[%2ld] Thread has just been spawned!\n" , cpu );
402
405
#endif
@@ -457,92 +460,8 @@ blas_queue_t *tscq;
457
460
#endif
458
461
459
462
if (queue ) {
460
- int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = (int (* )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ))queue -> routine ;
461
463
462
- atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )1 );
463
-
464
- sa = queue -> sa ;
465
- sb = queue -> sb ;
466
-
467
- #ifdef SMP_DEBUG
468
- if (queue -> args ) {
469
- fprintf (STDERR , "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n" ,
470
- cpu , queue -> mode , queue -> args -> m , queue -> args -> n , queue -> args -> k );
471
- }
472
- #endif
473
-
474
- #ifdef CONSISTENT_FPCSR
475
- #ifdef __aarch64__
476
- __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode ));
477
- #else
478
- __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode ));
479
- __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode ));
480
- #endif
481
- #endif
482
-
483
- #ifdef MONITOR
484
- main_status [cpu ] = MAIN_RUNNING1 ;
485
- #endif
486
-
487
- if (sa == NULL ) sa = (void * )((BLASLONG )buffer + GEMM_OFFSET_A );
488
-
489
- if (sb == NULL ) {
490
- if (!(queue -> mode & BLAS_COMPLEX )){
491
- #ifdef EXPRECISION
492
- if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
493
- sb = (void * )(((BLASLONG )sa + ((QGEMM_P * QGEMM_Q * sizeof (xdouble )
494
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
495
- } else
496
- #endif
497
- if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ) {
498
- #ifdef BUILD_DOUBLE
499
- sb = (void * )(((BLASLONG )sa + ((DGEMM_P * DGEMM_Q * sizeof (double )
500
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
501
- #endif
502
- } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
503
- #ifdef BUILD_SINGLE
504
- sb = (void * )(((BLASLONG )sa + ((SGEMM_P * SGEMM_Q * sizeof (float )
505
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
506
- #endif
507
- } else {
508
- /* Other types in future */
509
- }
510
- } else {
511
- #ifdef EXPRECISION
512
- if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
513
- sb = (void * )(((BLASLONG )sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof (xdouble )
514
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
515
- } else
516
- #endif
517
- if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ){
518
- #ifdef BUILD_COMPLEX16
519
- sb = (void * )(((BLASLONG )sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof (double )
520
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
521
- #endif
522
- } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
523
- #ifdef BUILD_COMPLEX
524
- sb = (void * )(((BLASLONG )sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof (float )
525
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
526
- #endif
527
- } else {
528
- /* Other types in future */
529
- }
530
- }
531
- queue -> sb = sb ;
532
- }
533
-
534
- #ifdef MONITOR
535
- main_status [cpu ] = MAIN_RUNNING2 ;
536
- #endif
537
-
538
- if (queue -> mode & BLAS_LEGACY ) {
539
- legacy_exec (routine , queue -> mode , queue -> args , sb );
540
- } else
541
- if (queue -> mode & BLAS_PTHREAD ) {
542
- void (* pthreadcompat )(void * ) = (void (* )(void * ))queue -> routine ;
543
- (pthreadcompat )(queue -> args );
544
- } else
545
- (routine )(queue -> args , queue -> range_m , queue -> range_n , sa , sb , queue -> position );
464
+ exec_threads (cpu , queue , 0 );
546
465
547
466
#ifdef SMP_DEBUG
548
467
fprintf (STDERR , "Server[%2ld] Calculation finished!\n" , cpu );
@@ -557,7 +476,7 @@ blas_queue_t *tscq;
557
476
MB ;
558
477
atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )0 );
559
478
560
-
479
+
561
480
}
562
481
563
482
#ifdef MONITOR
@@ -580,8 +499,6 @@ blas_queue_t *tscq;
580
499
fprintf (STDERR , "Server[%2ld] Shutdown!\n" , cpu );
581
500
#endif
582
501
583
- blas_memory_free (buffer );
584
-
585
502
//pthread_exit(NULL);
586
503
587
504
return NULL ;
@@ -663,6 +580,9 @@ int blas_thread_init(void){
663
580
664
581
LOCK_COMMAND (& server_lock );
665
582
583
+ // Adjust thread buffers
584
+ adjust_thread_buffers ();
585
+
666
586
if (!blas_server_avail ){
667
587
668
588
thread_timeout_env = openblas_thread_timeout ();
@@ -893,6 +813,18 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
893
813
fprintf (STDERR , "Exec_blas is called. Number of executing threads : %ld\n" , num );
894
814
#endif
895
815
816
+ //Redirect to caller's callback routine
817
+ if (openblas_threads_callback_ ) {
818
+ int buf_index = 0 ;
819
+ #ifndef USE_SIMPLE_THREADED_LEVEL3
820
+ for (int i = 0 ; i < num ; i ++ )
821
+ queue [i ].position = i ;
822
+ #endif
823
+ openblas_threads_callback_ (1 , (openblas_dojob_callback ) exec_threads , num , sizeof (blas_queue_t ), (void * ) queue , buf_index );
824
+ return 0 ;
825
+ }
826
+
827
+
896
828
#ifdef __ELF__
897
829
if (omp_in_parallel && (num > 1 )) {
898
830
if (omp_in_parallel () > 0 ) {
@@ -1066,6 +998,14 @@ int BLASFUNC(blas_thread_shutdown)(void){
1066
998
1067
999
LOCK_COMMAND (& server_lock );
1068
1000
1001
+ //Free buffers allocated for threads
1002
+ for (i = 0 ; i < MAX_CPU_NUMBER ; i ++ ){
1003
+ if (blas_thread_buffer [i ]!= NULL ){
1004
+ blas_memory_free (blas_thread_buffer [i ]);
1005
+ blas_thread_buffer [i ]= NULL ;
1006
+ }
1007
+ }
1008
+
1069
1009
if (blas_server_avail ) {
1070
1010
1071
1011
for (i = 0 ; i < blas_num_threads - 1 ; i ++ ) {
@@ -1102,5 +1042,118 @@ int BLASFUNC(blas_thread_shutdown)(void){
1102
1042
return 0 ;
1103
1043
}
1104
1044
1045
+ static void adjust_thread_buffers () {
1046
+
1047
+ int i = 0 ;
1048
+
1049
+ //adjust buffer for each thread
1050
+ for (i = 0 ; i < blas_cpu_number ; i ++ ){
1051
+ if (blas_thread_buffer [i ] == NULL ){
1052
+ blas_thread_buffer [i ] = blas_memory_alloc (2 );
1053
+ }
1054
+ }
1055
+ for (; i < MAX_CPU_NUMBER ; i ++ ){
1056
+ if (blas_thread_buffer [i ] != NULL ){
1057
+ blas_memory_free (blas_thread_buffer [i ]);
1058
+ blas_thread_buffer [i ] = NULL ;
1059
+ }
1060
+ }
1061
+ }
1062
+
1063
+ static void exec_threads (int cpu , blas_queue_t * queue , int buf_index )
1064
+ {
1065
+
1066
+ void * buffer , * sa , * sb ;
1067
+
1068
+ buffer = blas_thread_buffer [cpu ];
1069
+
1070
+ int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = (int (* )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ))queue -> routine ;
1071
+
1072
+ atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )1 );
1073
+
1074
+ sa = queue -> sa ;
1075
+ sb = queue -> sb ;
1076
+
1077
+ #ifdef SMP_DEBUG
1078
+ if (queue -> args ) {
1079
+ fprintf (STDERR , "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n" ,
1080
+ cpu , queue -> mode , queue -> args -> m , queue -> args -> n , queue -> args -> k );
1081
+ }
1082
+ #endif
1083
+
1084
+ #ifdef CONSISTENT_FPCSR
1085
+ #ifdef __aarch64__
1086
+ __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode ));
1087
+ #else
1088
+ __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode ));
1089
+ __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode ));
1090
+ #endif
1091
+ #endif
1092
+
1093
+ #ifdef MONITOR
1094
+ main_status [cpu ] = MAIN_RUNNING1 ;
1095
+ #endif
1096
+
1097
+ if (sa == NULL ) sa = (void * )((BLASLONG )buffer + GEMM_OFFSET_A );
1098
+
1099
+ if (sb == NULL ) {
1100
+ if (!(queue -> mode & BLAS_COMPLEX )){
1101
+ #ifdef EXPRECISION
1102
+ if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
1103
+ sb = (void * )(((BLASLONG )sa + ((QGEMM_P * QGEMM_Q * sizeof (xdouble )
1104
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1105
+ } else
1106
+ #endif
1107
+ if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ) {
1108
+ #ifdef BUILD_DOUBLE
1109
+ sb = (void * )(((BLASLONG )sa + ((DGEMM_P * DGEMM_Q * sizeof (double )
1110
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1111
+ #endif
1112
+ } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
1113
+ #ifdef BUILD_SINGLE
1114
+ sb = (void * )(((BLASLONG )sa + ((SGEMM_P * SGEMM_Q * sizeof (float )
1115
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1116
+ #endif
1117
+ } else {
1118
+ /* Other types in future */
1119
+ }
1120
+ } else {
1121
+ #ifdef EXPRECISION
1122
+ if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
1123
+ sb = (void * )(((BLASLONG )sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof (xdouble )
1124
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1125
+ } else
1126
+ #endif
1127
+ if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ){
1128
+ #ifdef BUILD_COMPLEX16
1129
+ sb = (void * )(((BLASLONG )sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof (double )
1130
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1105
1131
#endif
1132
+ } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
1133
+ #ifdef BUILD_COMPLEX
1134
+ sb = (void * )(((BLASLONG )sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof (float )
1135
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1136
+ #endif
1137
+ } else {
1138
+ /* Other types in future */
1139
+ }
1140
+ }
1141
+ queue -> sb = sb ;
1142
+ }
1143
+
1144
+ #ifdef MONITOR
1145
+ main_status [cpu ] = MAIN_RUNNING2 ;
1146
+ #endif
1147
+
1148
+ if (queue -> mode & BLAS_LEGACY ) {
1149
+ legacy_exec (routine , queue -> mode , queue -> args , sb );
1150
+ } else
1151
+ if (queue -> mode & BLAS_PTHREAD ) {
1152
+ void (* pthreadcompat )(void * ) = (void (* )(void * ))queue -> routine ;
1153
+ (pthreadcompat )(queue -> args );
1154
+ } else
1155
+ (routine )(queue -> args , queue -> range_m , queue -> range_n , sa , sb , queue -> position );
1156
+
1157
+ }
1106
1158
1159
+ #endif
0 commit comments