Skip to content

osc/rdma, btl: fix two issues with one-sided #9594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
19 changes: 16 additions & 3 deletions ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,6 @@ struct ompi_osc_rdma_module_t {
/** value of same_size info key for this window */
bool same_size;

/** CPU atomics can be used */
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The commit message where you introduce the local leader fix appears to have two different commits in it; please properly merge.

bool use_cpu_atomics;

/** passive-target synchronization will not be used in this window */
bool no_locks;

Expand Down Expand Up @@ -261,6 +258,22 @@ struct ompi_osc_rdma_module_t {
/** lock for peer hash table/array */
opal_mutex_t peer_lock;

/** flag to indicate whether all selected btl support remote completion.
*/
bool btl_support_remote_completion;

/** ordering requirement on selected btls.
* If all btls support remote completion, btl_order is MCA_BTL_NO_ORDER;
* otherwise, it will be MCA_BTL_IN_ORDER_RDMA_ATOMICS;
*/
int btl_order;

/** array of peer state. Used when local leader is NOT used */
uintptr_t *peer_state_array;

/** array of peer state header. Used when local leader is NOT used
* and btl requires memory registration */
uint8_t *peer_state_handle_array;

/** BTL(s) in use. Currently this is only used to support RDMA emulation over
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
Expand Down
2 changes: 1 addition & 1 deletion ompi/mca/osc/rdma/osc_rdma_accumulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,

do {
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
local_handle, target_handle, len, 0, module->btl_order,
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
break;
Expand Down
90 changes: 89 additions & 1 deletion ompi/mca/osc/rdma/osc_rdma_active_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,7 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t *
return ret;
}


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

int ompi_osc_rdma_start_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
Expand Down Expand Up @@ -590,6 +591,82 @@ int ompi_osc_rdma_test_atomic (ompi_win_t *win, int *flag)
return OMPI_SUCCESS;
}

/**
* This function implements a different barrier mechanism for Fence,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

different than what?

* when any of the selected btl does not support remote completion.
* This barrier is based on imposing the MCA_BTL_ORDER_RDMA_ATOMCS
* ordering requirement on seleted btls.
*/
static
int ompi_osc_rdma_fence_barrier_by_ordered_channel (ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_state_t *state = module->state;
ompi_osc_rdma_sync_t *sync = &module->all_sync;
ompi_osc_rdma_peer_t **peers;
ompi_group_t *group;
int num_peers;
int ret;

assert(module->btl_order == MCA_BTL_IN_ORDER_RDMA_ATOMICS);
OPAL_THREAD_LOCK(&module->lock);

if (ompi_comm_size(module->comm) == 1) {
OPAL_THREAD_UNLOCK(&(module->lock));
return OMPI_SUCCESS;
}

ret = ompi_comm_group(module->comm, &group);
if (OMPI_SUCCESS != ret) {
OPAL_THREAD_UNLOCK(&(module->lock));
return ret;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indenting

}

num_peers = sync->num_peers;
assert(ompi_group_size(group) == num_peers);
peers = ompi_osc_rdma_get_peers(module, group);
if (NULL == peers) {
OPAL_THREAD_UNLOCK(&(module->lock));
return OMPI_ERR_OUT_OF_RESOURCE;
}

module->state->num_fenced_peers = 0;
OPAL_THREAD_UNLOCK(&(module->lock));
ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
if (ret) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no implicit cast from int to bool

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "barrier failed!");
return ret;
}

/* for each process in the group increment their number of fenced peers */
for (int i = 0 ; i < num_peers; ++i) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

be consistent in spacing around ;

ompi_osc_rdma_peer_t *peer = peers[i];
intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_fenced_peers);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no space between offseof and argument.


/* the usage of peer local state requires selected btls to support remote completion,
* if that is the case, this function will not have been called
*/
assert (!ompi_osc_rdma_peer_local_state (peer));
ret = ompi_osc_rdma_lock_btl_op (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, true);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure that this is sufficient. BTL operations (including RDMA operations) are not implicitly ordered. An ordering flag must be set on every operation that must be ordered in order for this to be guaranteed to work. TCP doesn't take advantage of reordering (and I don't think OFI does either) so you likely won't see this in testing. but the verbs BTL did have an ordering problem and we haven't audited the other BTLs, so we should really follow the spec).

if (OMPI_SUCCESS != ret) {
return ret;
}
}

ompi_osc_rdma_release_peers (peers, num_peers);
ompi_group_free (&group);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "increased fenced_peer counter of all peers");
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "waiting for all peers to increase my counter");
while (num_peers != state->num_fenced_peers) {
ompi_osc_rdma_progress (module);
opal_atomic_mb ();
}

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "received fence message from all peers");
return OMPI_SUCCESS;
}

int ompi_osc_rdma_fence_atomic (int mpi_assert, ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
Expand Down Expand Up @@ -627,7 +704,18 @@ int ompi_osc_rdma_fence_atomic (int mpi_assert, ompi_win_t *win)
ompi_osc_rdma_sync_rdma_complete (&module->all_sync);

/* ensure all writes to my memory are complete (both local stores, and RMA operations) */
ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
if (module->btl_support_remote_completion) {
/* if all selected btls support remote completion, then all RMA operations have finished
* on remote side. A barrier is enough to complete the fence.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indenting

*/
ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
} else {
/*
* if any selected btl does not support remote completion, we will have to send a completion
* message (through the same endpoint of data transfer) to every peer, then wait for a message from every peer.
*/
ret = ompi_osc_rdma_fence_barrier_by_ordered_channel(win);
}

if (mpi_assert & MPI_MODE_NOSUCCEED) {
/* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly
Expand Down
6 changes: 3 additions & 3 deletions ompi/mca/osc/rdma/osc_rdma_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde

do {
ret = btl->btl_get (btl, endpoint, ptr, aligned_addr,
local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER,
local_handle, source_handle, aligned_len, 0, module->btl_order,
ompi_osc_get_data_complete, (void *) &read_complete, NULL);
if (!ompi_osc_rdma_oor (ret)) {
break;
Expand Down Expand Up @@ -455,7 +455,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee

do {
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER,
local_handle, target_handle, size, 0, module->btl_order,
cb, context, cbdata);
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
return OMPI_SUCCESS;
Expand Down Expand Up @@ -705,7 +705,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
do {
ret = btl->btl_get (btl, peer->data_endpoint, ptr,
aligned_source_base, local_handle, source_handle,
aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
aligned_len, 0, module->btl_order, ompi_osc_rdma_get_complete,
request, frag);
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
return OMPI_SUCCESS;
Expand Down
Loading