@@ -115,6 +115,8 @@ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
115
115
116
116
int blas_omp_threads_local = 1 ;
117
117
118
+ static void * blas_thread_buffer [MAX_CPU_NUMBER ];
119
+
118
120
/* Local Variables */
119
121
#if defined(USE_PTHREAD_LOCK )
120
122
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER ;
@@ -190,6 +192,10 @@ static int main_status[MAX_CPU_NUMBER];
190
192
BLASLONG exit_time [MAX_CPU_NUMBER ];
191
193
#endif
192
194
195
+ //Prototypes
196
+ static void exec_threads (int , blas_queue_t * , int );
197
+ static void adjust_thread_buffers ();
198
+
193
199
static void legacy_exec (void * func , int mode , blas_arg_t * args , void * sb ){
194
200
195
201
if (!(mode & BLAS_COMPLEX )){
@@ -375,7 +381,6 @@ static void* blas_thread_server(void *arg){
375
381
/* Thread identifier */
376
382
BLASLONG cpu = (BLASLONG )arg ;
377
383
unsigned int last_tick ;
378
- void * buffer , * sa , * sb ;
379
384
blas_queue_t * queue ;
380
385
381
386
blas_queue_t * tscq ;
@@ -395,8 +400,6 @@ blas_queue_t *tscq;
395
400
main_status [cpu ] = MAIN_ENTER ;
396
401
#endif
397
402
398
- buffer = blas_memory_alloc (2 );
399
-
400
403
#ifdef SMP_DEBUG
401
404
fprintf (STDERR , "Server[%2ld] Thread has just been spawned!\n" , cpu );
402
405
#endif
@@ -456,109 +459,7 @@ blas_queue_t *tscq;
456
459
start = rpcc ();
457
460
#endif
458
461
459
- if (queue ) {
460
- int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = (int (* )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ))queue -> routine ;
461
-
462
- atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )1 );
463
-
464
- sa = queue -> sa ;
465
- sb = queue -> sb ;
466
-
467
- #ifdef SMP_DEBUG
468
- if (queue -> args ) {
469
- fprintf (STDERR , "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n" ,
470
- cpu , queue -> mode , queue -> args -> m , queue -> args -> n , queue -> args -> k );
471
- }
472
- #endif
473
-
474
- #ifdef CONSISTENT_FPCSR
475
- #ifdef __aarch64__
476
- __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode ));
477
- #else
478
- __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode ));
479
- __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode ));
480
- #endif
481
- #endif
482
-
483
- #ifdef MONITOR
484
- main_status [cpu ] = MAIN_RUNNING1 ;
485
- #endif
486
-
487
- if (sa == NULL ) sa = (void * )((BLASLONG )buffer + GEMM_OFFSET_A );
488
-
489
- if (sb == NULL ) {
490
- if (!(queue -> mode & BLAS_COMPLEX )){
491
- #ifdef EXPRECISION
492
- if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
493
- sb = (void * )(((BLASLONG )sa + ((QGEMM_P * QGEMM_Q * sizeof (xdouble )
494
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
495
- } else
496
- #endif
497
- if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ) {
498
- #ifdef BUILD_DOUBLE
499
- sb = (void * )(((BLASLONG )sa + ((DGEMM_P * DGEMM_Q * sizeof (double )
500
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
501
- #endif
502
- } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
503
- #ifdef BUILD_SINGLE
504
- sb = (void * )(((BLASLONG )sa + ((SGEMM_P * SGEMM_Q * sizeof (float )
505
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
506
- #endif
507
- } else {
508
- /* Other types in future */
509
- }
510
- } else {
511
- #ifdef EXPRECISION
512
- if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
513
- sb = (void * )(((BLASLONG )sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof (xdouble )
514
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
515
- } else
516
- #endif
517
- if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ){
518
- #ifdef BUILD_COMPLEX16
519
- sb = (void * )(((BLASLONG )sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof (double )
520
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
521
- #endif
522
- } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
523
- #ifdef BUILD_COMPLEX
524
- sb = (void * )(((BLASLONG )sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof (float )
525
- + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
526
- #endif
527
- } else {
528
- /* Other types in future */
529
- }
530
- }
531
- queue -> sb = sb ;
532
- }
533
-
534
- #ifdef MONITOR
535
- main_status [cpu ] = MAIN_RUNNING2 ;
536
- #endif
537
-
538
- if (queue -> mode & BLAS_LEGACY ) {
539
- legacy_exec (routine , queue -> mode , queue -> args , sb );
540
- } else
541
- if (queue -> mode & BLAS_PTHREAD ) {
542
- void (* pthreadcompat )(void * ) = (void (* )(void * ))queue -> routine ;
543
- (pthreadcompat )(queue -> args );
544
- } else
545
- (routine )(queue -> args , queue -> range_m , queue -> range_n , sa , sb , queue -> position );
546
-
547
- #ifdef SMP_DEBUG
548
- fprintf (STDERR , "Server[%2ld] Calculation finished!\n" , cpu );
549
- #endif
550
-
551
- #ifdef MONITOR
552
- main_status [cpu ] = MAIN_FINISH ;
553
- #endif
554
-
555
- // arm: make sure all results are written out _before_
556
- // thread is marked as done and other threads use them
557
- MB ;
558
- atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )0 );
559
-
560
-
561
- }
462
+ exec_threads (cpu , queue , 0 );
562
463
563
464
#ifdef MONITOR
564
465
main_status [cpu ] = MAIN_DONE ;
@@ -580,8 +481,6 @@ blas_queue_t *tscq;
580
481
fprintf (STDERR , "Server[%2ld] Shutdown!\n" , cpu );
581
482
#endif
582
483
583
- blas_memory_free (buffer );
584
-
585
484
//pthread_exit(NULL);
586
485
587
486
return NULL ;
@@ -663,6 +562,9 @@ int blas_thread_init(void){
663
562
664
563
LOCK_COMMAND (& server_lock );
665
564
565
+ // Adjust thread buffers
566
+ adjust_thread_buffers ();
567
+
666
568
if (!blas_server_avail ){
667
569
668
570
thread_timeout_env = openblas_thread_timeout ();
@@ -893,6 +795,18 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
893
795
fprintf (STDERR , "Exec_blas is called. Number of executing threads : %ld\n" , num );
894
796
#endif
895
797
798
+ //Redirect to caller's callback routine
799
+ if (openblas_threads_callback_ ) {
800
+ int buf_index = 0 , i = 0 ;
801
+ #ifndef USE_SIMPLE_THREADED_LEVEL3
802
+ for (i = 0 ; i < num ; i ++ )
803
+ queue [i ].position = i ;
804
+ #endif
805
+ openblas_threads_callback_ (1 , (openblas_dojob_callback ) exec_threads , num , sizeof (blas_queue_t ), (void * ) queue , buf_index );
806
+ return 0 ;
807
+ }
808
+
809
+
896
810
#ifdef __ELF__
897
811
if (omp_in_parallel && (num > 1 )) {
898
812
if (omp_in_parallel () > 0 ) {
@@ -1066,6 +980,14 @@ int BLASFUNC(blas_thread_shutdown)(void){
1066
980
1067
981
LOCK_COMMAND (& server_lock );
1068
982
983
+ //Free buffers allocated for threads
984
+ for (i = 0 ; i < MAX_CPU_NUMBER ; i ++ ){
985
+ if (blas_thread_buffer [i ]!= NULL ){
986
+ blas_memory_free (blas_thread_buffer [i ]);
987
+ blas_thread_buffer [i ]= NULL ;
988
+ }
989
+ }
990
+
1069
991
if (blas_server_avail ) {
1070
992
1071
993
for (i = 0 ; i < blas_num_threads - 1 ; i ++ ) {
@@ -1102,5 +1024,132 @@ int BLASFUNC(blas_thread_shutdown)(void){
1102
1024
return 0 ;
1103
1025
}
1104
1026
1105
- #endif
1027
+ static void adjust_thread_buffers () {
1028
+
1029
+ int i = 0 ;
1030
+
1031
+ //adjust buffer for each thread
1032
+ for (i = 0 ; i < blas_cpu_number ; i ++ ){
1033
+ if (blas_thread_buffer [i ] == NULL ){
1034
+ blas_thread_buffer [i ] = blas_memory_alloc (2 );
1035
+ }
1036
+ }
1037
+ for (; i < MAX_CPU_NUMBER ; i ++ ){
1038
+ if (blas_thread_buffer [i ] != NULL ){
1039
+ blas_memory_free (blas_thread_buffer [i ]);
1040
+ blas_thread_buffer [i ] = NULL ;
1041
+ }
1042
+ }
1043
+ }
1044
+
1045
+ static void exec_threads (int cpu , blas_queue_t * queue , int buf_index )
1046
+ {
1047
+
1048
+ if (queue ) {
1049
+ int (* routine )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ) = (int (* )(blas_arg_t * , void * , void * , void * , void * , BLASLONG ))queue -> routine ;
1050
+
1051
+ atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )1 );
1052
+
1053
+ void * buffer = blas_thread_buffer [cpu ];
1054
+ void * sa = queue -> sa ;
1055
+ void * sb = queue -> sb ;
1056
+
1057
+ #ifdef SMP_DEBUG
1058
+ if (queue -> args ) {
1059
+ fprintf (STDERR , "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n" ,
1060
+ cpu , queue -> mode , queue -> args -> m , queue -> args -> n , queue -> args -> k );
1061
+ }
1062
+ #endif
1063
+
1064
+ #ifdef CONSISTENT_FPCSR
1065
+ #ifdef __aarch64__
1066
+ __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode ));
1067
+ #else
1068
+ __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode ));
1069
+ __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode ));
1070
+ #endif
1071
+ #endif
1072
+
1073
+ #ifdef MONITOR
1074
+ main_status [cpu ] = MAIN_RUNNING1 ;
1075
+ #endif
1076
+
1077
+ if (sa == NULL ) sa = (void * )((BLASLONG )buffer + GEMM_OFFSET_A );
1078
+
1079
+ if (sb == NULL ) {
1080
+ if (!(queue -> mode & BLAS_COMPLEX )){
1081
+ #ifdef EXPRECISION
1082
+ if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
1083
+ sb = (void * )(((BLASLONG )sa + ((QGEMM_P * QGEMM_Q * sizeof (xdouble )
1084
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1085
+ } else
1086
+ #endif
1087
+ if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ) {
1088
+ #ifdef BUILD_DOUBLE
1089
+ sb = (void * )(((BLASLONG )sa + ((DGEMM_P * DGEMM_Q * sizeof (double )
1090
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1091
+ #endif
1092
+ } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
1093
+ #ifdef BUILD_SINGLE
1094
+ sb = (void * )(((BLASLONG )sa + ((SGEMM_P * SGEMM_Q * sizeof (float )
1095
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1096
+ #endif
1097
+ } else {
1098
+ /* Other types in future */
1099
+ }
1100
+ } else {
1101
+ #ifdef EXPRECISION
1102
+ if ((queue -> mode & BLAS_PREC ) == BLAS_XDOUBLE ){
1103
+ sb = (void * )(((BLASLONG )sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof (xdouble )
1104
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1105
+ } else
1106
+ #endif
1107
+ if ((queue -> mode & BLAS_PREC ) == BLAS_DOUBLE ){
1108
+ #ifdef BUILD_COMPLEX16
1109
+ sb = (void * )(((BLASLONG )sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof (double )
1110
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1111
+ #endif
1112
+ } else if ((queue -> mode & BLAS_PREC ) == BLAS_SINGLE ) {
1113
+ #ifdef BUILD_COMPLEX
1114
+ sb = (void * )(((BLASLONG )sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof (float )
1115
+ + GEMM_ALIGN ) & ~GEMM_ALIGN )) + GEMM_OFFSET_B );
1116
+ #endif
1117
+ } else {
1118
+ /* Other types in future */
1119
+ }
1120
+ }
1121
+ queue -> sb = sb ;
1122
+ }
1123
+
1124
+ #ifdef MONITOR
1125
+ main_status [cpu ] = MAIN_RUNNING2 ;
1126
+ #endif
1127
+
1128
+ if (queue -> mode & BLAS_LEGACY ) {
1129
+ legacy_exec (routine , queue -> mode , queue -> args , sb );
1130
+ } else
1131
+ if (queue -> mode & BLAS_PTHREAD ) {
1132
+ void (* pthreadcompat )(void * ) = (void (* )(void * ))queue -> routine ;
1133
+ (pthreadcompat )(queue -> args );
1134
+ } else
1135
+ (routine )(queue -> args , queue -> range_m , queue -> range_n , sa , sb , queue -> position );
1136
+
1137
+ #ifdef SMP_DEBUG
1138
+ fprintf (STDERR , "Server[%2ld] Calculation finished!\n" , cpu );
1139
+ #endif
1140
+
1141
+ #ifdef MONITOR
1142
+ main_status [cpu ] = MAIN_FINISH ;
1143
+ #endif
1144
+
1145
+ // arm: make sure all results are written out _before_
1146
+ // thread is marked as done and other threads use them
1147
+ MB ;
1148
+ atomic_store_queue (& thread_status [cpu ].queue , (blas_queue_t * )0 );
1149
+
1150
+
1151
+ }
1152
+
1153
+ }
1106
1154
1155
+ #endif
0 commit comments