Skip to content

Commit 689199c

Browse files
committed
ompi request handling race condition fix (MT-case)
Described in #1813
1 parent dac9201 commit 689199c

File tree

1 file changed

+43
-6
lines changed

1 file changed

+43
-6
lines changed

opal/threads/wait_sync.h

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* reserved.
66
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
77
* reserved.
8+
* Copyright (c) 2016 Mellanox Technologies. All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -13,6 +14,8 @@
1314
*/
1415
#include "opal/sys/atomic.h"
1516
#include "opal/threads/condition.h"
17+
#include "opal/constants.h"
18+
#include "opal/prefetch.h"
1619
#include <pthread.h>
1720

1821
BEGIN_C_DECLS
@@ -33,17 +36,29 @@ typedef struct ompi_wait_sync_t {
3336

3437
#define WAIT_SYNC_RELEASE(sync) \
3538
if (opal_using_threads()) { \
39+
pthread_mutex_lock(&(sync)->lock); \
3640
pthread_cond_destroy(&(sync)->condition); \
41+
pthread_mutex_unlock(&(sync)->lock); \
3742
pthread_mutex_destroy(&(sync)->lock); \
3843
}
3944

40-
#define WAIT_SYNC_SIGNAL(sync) \
45+
#define WAIT_SYNC_LOCK(sync) \
4146
if (opal_using_threads()) { \
42-
pthread_mutex_lock(&(sync->lock)); \
43-
pthread_cond_signal(&sync->condition); \
44-
pthread_mutex_unlock(&(sync->lock)); \
47+
pthread_mutex_lock(&((sync)->lock)); \
4548
}
4649

50+
#define WAIT_SYNC_SIGNAL_UNLOCK(sync) \
51+
if (opal_using_threads()) { \
52+
pthread_cond_signal(&((sync)->condition)); \
53+
pthread_mutex_unlock(&((sync)->lock)); \
54+
}
55+
56+
#define WAIT_SYNC_UNLOCK(sync) \
57+
if (opal_using_threads()) { \
58+
pthread_mutex_unlock(&((sync)->lock)); \
59+
}
60+
61+
4762
OPAL_DECLSPEC int sync_wait_mt(ompi_wait_sync_t *sync);
4863
static inline int sync_wait_st (ompi_wait_sync_t *sync)
4964
{
@@ -75,16 +90,38 @@ static inline int sync_wait_st (ompi_wait_sync_t *sync)
7590
*/
7691
static inline void wait_sync_update(ompi_wait_sync_t *sync, int updates, int status)
7792
{
93+
/* Fast path: if we can decrement the sync->count without
94+
* dropping it to 0 - just return
95+
* Consider that there might be concurrent decrements
96+
*/
97+
if( OPAL_LIKELY(OPAL_SUCCESS == status) ) {
98+
/* we know that our contribution is not yet there
99+
* so we can safely check if the count will still be above 0
100+
* after the change */
101+
while( (sync->count - updates > 0) ){
102+
if( OPAL_ATOMIC_CMPSET_32(&sync->count, sync->count, sync->count - updates) ){
103+
/* fastpath succeeds */
104+
return;
105+
}
106+
}
107+
}
108+
109+
/* Slow path */
110+
WAIT_SYNC_LOCK(sync);
111+
78112
if( OPAL_LIKELY(OPAL_SUCCESS == status) ) {
79113
if( 0 != (OPAL_THREAD_ADD32(&sync->count, -updates)) ) {
80-
return;
114+
goto unlock;
81115
}
82116
} else {
83117
/* this is an error path so just use the atomic */
84118
opal_atomic_swap_32 (&sync->count, 0);
85119
sync->status = OPAL_ERROR;
86120
}
87-
WAIT_SYNC_SIGNAL(sync);
121+
WAIT_SYNC_SIGNAL_UNLOCK(sync);
122+
return;
123+
unlock:
124+
WAIT_SYNC_UNLOCK(sync);
88125
}
89126

90127
END_C_DECLS

0 commit comments

Comments
 (0)