@@ -344,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
344
344
div_n = (n_to - n_from + DIVIDE_RATE - 1 ) / DIVIDE_RATE ;
345
345
for (js = n_from , bufferside = 0 ; js < n_to ; js += div_n , bufferside ++ ) {
346
346
347
- /* Make sure if no one is using workspace */
348
- START_RPCC ();
349
- for (i = 0 ; i < args -> nthreads ; i ++ )
350
- while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
351
- STOP_RPCC (waiting1 );
352
-
353
347
#if defined(FUSED_GEMM ) && !defined(TIMING )
354
348
355
349
/* Fused operation to copy region of B into workspace and apply kernel */
@@ -387,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
387
381
}
388
382
#endif
389
383
390
- /* Set flag so other threads can access local region of B */
391
- for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ )
384
+ for (i = mypos_n * nthreads_m ; i < (mypos_n + 1 ) * nthreads_m ; i ++ ) {
385
+ /* Make sure no one is still using the workspace */
386
+ START_RPCC ();
387
+ while (job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ]) {YIELDING ;MB ;};
388
+ STOP_RPCC (waiting1 );
389
+ /* Set flag so other threads can access local region of B */
392
390
job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
393
- WMB ;
391
+ WMB ;
392
+ }
394
393
}
395
394
396
395
/* Get regions of B from other threads and apply kernel */
@@ -426,13 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
426
425
427
426
/* Clear synchronization flag if this thread is done with other region of B */
428
427
if (m_to - m_from == min_i ) {
429
- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
428
+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
430
429
WMB ;
431
430
}
432
431
}
433
432
} while (current != mypos );
434
433
435
- /* Iterate through steps of m
434
+ /* Iterate through steps of m
436
435
* Note: First step has already been finished */
437
436
for (is = m_from + min_i ; is < m_to ; is += min_i ){
438
437
min_i = m_to - is ;
@@ -462,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
462
461
sa , (FLOAT * )job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ],
463
462
c , ldc , is , js );
464
463
STOP_RPCC (kernel );
465
-
464
+
466
465
#ifdef TIMING
467
466
ops += 2 * min_i * MIN (range_n [current + 1 ] - js , div_n ) * min_l ;
468
467
#endif
469
-
468
+
470
469
/* Clear synchronization flag if this thread is done with region of B */
471
470
if (is + min_i >= m_to ) {
472
- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] & = 0 ;
471
+ job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] = 0 ;
473
472
WMB ;
474
473
}
475
474
}
0 commit comments