Skip to content

Commit 45db363

Browse files
committed
osc/rdma: bug fixes
This commit fixes the following bugs: - Allow a btl to be used for communication if it can communicate with all non-self peers and it supports global atomic visibility. In this case CPU atomics can be used for self and the btl for any other peer. - It was possible to get into a state where different threads of an MPI process could issue conflicting accumulate operations to a remote peer. To eliminate this race we now update the peer flags atomically. - Queue up and re-issue put operations that failed during a BTL callback. This can occur during an accumulate operation. This was an unhandled error case. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 67e26b6 commit 45db363

File tree

5 files changed

+145
-21
lines changed

5 files changed

+145
-21
lines changed

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 100 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
3+
* Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2016-2017 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
@@ -18,6 +18,90 @@
1818

1919
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
2020

21+
enum ompi_osc_rdma_event_type_t {
22+
OMPI_OSC_RDMA_EVENT_TYPE_PUT,
23+
};
24+
25+
typedef enum ompi_osc_rdma_event_type_t ompi_osc_rdma_event_type_t;
26+
27+
struct ompi_osc_rdma_event_t {
28+
opal_event_t super;
29+
ompi_osc_rdma_module_t *module;
30+
struct mca_btl_base_endpoint_t *endpoint;
31+
void *local_address;
32+
mca_btl_base_registration_handle_t *local_handle;
33+
uint64_t remote_address;
34+
mca_btl_base_registration_handle_t *remote_handle;
35+
uint64_t length;
36+
mca_btl_base_rdma_completion_fn_t cbfunc;
37+
void *cbcontext;
38+
void *cbdata;
39+
};
40+
41+
typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t;
42+
43+
static void *ompi_osc_rdma_event_put (int fd, int flags, void *context)
44+
{
45+
ompi_osc_rdma_event_t *event = (ompi_osc_rdma_event_t *) context;
46+
int ret;
47+
48+
ret = event->module->selected_btl->btl_put (event->module->selected_btl, event->endpoint, event->local_address,
49+
event->remote_address, event->local_handle, event->remote_handle,
50+
event->length, 0, MCA_BTL_NO_ORDER, event->cbfunc, event->cbcontext,
51+
event->cbdata);
52+
if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
53+
/* done with this event */
54+
opal_event_del (&event->super);
55+
free (event);
56+
} else {
57+
/* re-activate the event */
58+
opal_event_active (&event->super, OPAL_EV_READ, 1);
59+
}
60+
61+
return NULL;
62+
}
63+
64+
static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
65+
ompi_osc_rdma_event_type_t event_type, void *local_address, mca_btl_base_registration_handle_t *local_handle,
66+
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
67+
uint64_t length, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
68+
void *cbdata)
69+
{
70+
ompi_osc_rdma_event_t *event = malloc (sizeof (*event));
71+
void *(*event_func) (int, int, void *);
72+
73+
if (OPAL_UNLIKELY(NULL == event)) {
74+
return OMPI_ERR_OUT_OF_RESOURCE;
75+
}
76+
77+
event->module = module;
78+
event->endpoint = endpoint;
79+
event->local_address = local_address;
80+
event->local_handle = local_handle;
81+
event->remote_address = remote_address;
82+
event->remote_handle = remote_handle;
83+
event->length = length;
84+
event->cbfunc = cbfunc;
85+
event->cbcontext = cbcontext;
86+
event->cbdata = cbdata;
87+
88+
switch (event_type) {
89+
case OMPI_OSC_RDMA_EVENT_TYPE_PUT:
90+
event_func = ompi_osc_rdma_event_put;
91+
break;
92+
default:
93+
opal_output(0, "osc/rdma: cannot queue unknown event type %d", event_type);
94+
abort ();
95+
}
96+
97+
opal_event_set (opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
98+
event_func, event);
99+
opal_event_active (&event->super, OPAL_EV_READ, 1);
100+
101+
return OMPI_SUCCESS;
102+
}
103+
104+
21105
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
22106
void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
23107
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
@@ -113,7 +197,7 @@ static void ompi_osc_rdma_acc_put_complete (struct mca_btl_base_module_t *btl, s
113197
}
114198

115199
ompi_osc_rdma_sync_rdma_dec (sync);
116-
peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
200+
ompi_osc_rdma_peer_clear_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING);
117201
}
118202

119203
/* completion of an accumulate get operation */
@@ -171,7 +255,12 @@ static void ompi_osc_rdma_acc_get_complete (struct mca_btl_base_module_t *btl, s
171255
(mca_btl_base_registration_handle_t *) request->ctx,
172256
request->len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete,
173257
request, NULL);
174-
/* TODO -- we can do better. probably should queue up the next step and handle it in progress */
258+
if (OPAL_SUCCESS != status) {
259+
status = ompi_osc_rdma_event_queue (module, endpoint, OMPI_OSC_RDMA_EVENT_TYPE_PUT, (void *) source, local_handle,
260+
request->target_address, (mca_btl_base_registration_handle_t *) request->ctx,
261+
request->len, ompi_osc_rdma_acc_put_complete, request, NULL);
262+
}
263+
175264
assert (OPAL_SUCCESS == status);
176265
}
177266

@@ -203,13 +292,12 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
203292

204293
OPAL_THREAD_LOCK(&module->lock);
205294
/* to ensure order wait until the previous accumulate completes */
206-
while (ompi_osc_rdma_peer_is_accumulating (peer)) {
295+
while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) {
207296
OPAL_THREAD_UNLOCK(&module->lock);
208297
ompi_osc_rdma_progress (module);
209298
OPAL_THREAD_LOCK(&module->lock);
210299
}
211300

212-
peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
213301
OPAL_THREAD_UNLOCK(&module->lock);
214302

215303
if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
@@ -847,11 +935,12 @@ static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, s
847935
ompi_osc_rdma_acc_put_complete, request, NULL);
848936
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
849937
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "could not start put to complete accumulate operation. opal return code "
850-
"%d", ret);
851-
}
938+
"%d. queuing operation...", ret);
852939

853-
/* TODO -- we can do better. probably should queue up the next step and handle it in progress */
854-
assert (OPAL_SUCCESS == ret);
940+
ret = ompi_osc_rdma_event_queue (module, peer->data_endpoint, OMPI_OSC_RDMA_EVENT_TYPE_PUT, local_address, local_handle,
941+
request->target_address, (mca_btl_base_registration_handle_t *) request->ctx, request->len,
942+
ompi_osc_rdma_acc_put_complete, request, NULL);
943+
}
855944

856945
return;
857946
}
@@ -868,7 +957,7 @@ static void ompi_osc_rdma_cas_get_complete (struct mca_btl_base_module_t *btl, s
868957
ompi_osc_rdma_request_complete (request, status);
869958

870959
ompi_osc_rdma_sync_rdma_dec (sync);
871-
peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
960+
ompi_osc_rdma_peer_clear_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING);
872961
}
873962

874963
static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, const void *compare_addr, void *result_addr,
@@ -894,12 +983,11 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
894983

895984
OPAL_THREAD_LOCK(&module->lock);
896985
/* to ensure order wait until the previous accumulate completes */
897-
while (ompi_osc_rdma_peer_is_accumulating (peer)) {
986+
while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) {
898987
OPAL_THREAD_UNLOCK(&module->lock);
899988
ompi_osc_rdma_progress (module);
900989
OPAL_THREAD_LOCK(&module->lock);
901990
}
902-
peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
903991
OPAL_THREAD_UNLOCK(&module->lock);
904992

905993
offset = target_address & btl_alignment_mask;;

ompi/mca/osc/rdma/osc_rdma_comm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc
160160

161161
subreq = NULL;
162162

163-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "scheduling rdma on non-contiguous datatype(s)");
163+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "scheduling rdma on non-contiguous datatype(s) or large region");
164164

165165
/* prepare convertors for the source and target. these convertors will be used to determine the
166166
* contiguous segments within the source and target. */

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -850,11 +850,18 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
850850
}
851851

852852
for (int i = 0 ; i < max_btls ; ++i) {
853+
int btl_count = btl_counts[i];
854+
853855
if (NULL == possible_btls[i]) {
854856
break;
855857
}
856858

857-
if (btl_counts[i] == comm_size && possible_btls[i]->btl_latency < selected_latency) {
859+
if (possible_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) {
860+
/* do not need to use the btl for self communication */
861+
btl_count++;
862+
}
863+
864+
if (btl_count >= comm_size && possible_btls[i]->btl_latency < selected_latency) {
858865
selected_btl = possible_btls[i];
859866
selected_latency = possible_btls[i]->btl_latency;
860867
}

ompi/mca/osc/rdma/osc_rdma_peer.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id,
6161
*peer_out = NULL;
6262

6363
endpoint = ompi_osc_rdma_peer_btl_endpoint (module, peer_id);
64-
if (OPAL_UNLIKELY(NULL == endpoint)) {
64+
if (OPAL_UNLIKELY(NULL == endpoint && !((module->selected_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) &&
65+
peer_id == ompi_comm_rank (module->comm)))) {
6566
return OMPI_ERR_UNREACH;
6667
}
6768

ompi/mca/osc/rdma/osc_rdma_peer.h

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
3+
* Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights
44
* reserved.
55
* $COPYRIGHT$
66
*
@@ -40,7 +40,7 @@ struct ompi_osc_rdma_peer_t {
4040
int rank;
4141

4242
/** peer flags */
43-
int flags;
43+
volatile int32_t flags;
4444

4545
/** aggregation support */
4646
ompi_osc_rdma_aggregation_t *aggregate;
@@ -188,13 +188,41 @@ static inline bool ompi_osc_rdma_peer_is_exclusive (ompi_osc_rdma_peer_t *peer)
188188
}
189189

190190
/**
191-
* @brief check if this process is currently accumulating on a peer
191+
* @brief try to set a flag on a peer object
192192
*
193-
* @param[in] peer peer object to check
193+
* @param[in] peer peer object to modify
194+
* @param[in] flag flag to set
195+
*
196+
* @returns true if the flag was not already set
197+
* @returns flase otherwise
198+
*/
199+
static inline bool ompi_osc_rdma_peer_test_set_flag (ompi_osc_rdma_peer_t *peer, int flag)
200+
{
201+
int32_t flags;
202+
203+
opal_atomic_mb ();
204+
205+
do {
206+
flags = peer->flags;
207+
if (flags & flag) {
208+
return false;
209+
}
210+
211+
} while (!OPAL_THREAD_BOOL_CMPSET_32 (&peer->flags, flags, flags | flag));
212+
213+
return true;
214+
}
215+
216+
/**
217+
* @brief clear a flag from a peer object
218+
*
219+
* @param[in] peer peer object to modify
220+
* @param[in] flag flag to set
194221
*/
195-
static inline bool ompi_osc_rdma_peer_is_accumulating (ompi_osc_rdma_peer_t *peer)
222+
static inline void ompi_osc_rdma_peer_clear_flag (ompi_osc_rdma_peer_t *peer, int flag)
196223
{
197-
return !!(peer->flags & OMPI_OSC_RDMA_PEER_ACCUMULATING);
224+
OPAL_ATOMIC_AND32(&peer->flags, ~flag);
225+
opal_atomic_mb ();
198226
}
199227

200228
/**

0 commit comments

Comments
 (0)