
Commit 465af9a

osc/rdma: fence mechanism for BTLs that do not support remote completion
Currently, fence is implemented in two steps: first, wait for the RDMA operations to complete locally; second, call coll->barrier() on the communicator. This is correct only if the selected BTLs support remote completion. Otherwise, it can happen that when coll->barrier() finishes, the remote side of an RDMA operation has not completed yet. This patch implements a different barrier mechanism, used when any of the selected BTLs does not support remote completion. In that case, each process posts an atomic operation to every peer to increment a counter on the peer through the selected BTL endpoint, then waits for its own counter to reach the number of peers. This ensures that all previous RDMA operations have completed.

Signed-off-by: Wei Zhang <[email protected]>
1 parent ded237a commit 465af9a

7 files changed: +117, -8 lines changed
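For context, the counter-based barrier described in the commit message can be sketched against the public MPI RMA API. The program below is an illustrative analogy only, not code from this patch: the window variable fence_counter is a hypothetical stand-in for num_fenced_peers, and MPI_Accumulate/MPI_Fetch_and_op stand in for the internal btl_atomic_op path. At the MPI level MPI_Win_flush_all already guarantees remote completion; the patch needs the ordered-channel mechanism precisely because the BTL-level completion used by osc/rdma gives no such guarantee when remote completion is unsupported.

/*
 * Minimal sketch of the counting barrier: increment a counter on every peer,
 * then wait for my own counter to reach the number of peers.
 * Analogy using public MPI RMA calls, not the BTL-level code in this patch.
 */
#include <mpi.h>
#include <stdio.h>

int main (int argc, char **argv)
{
    MPI_Init (&argc, &argv);

    int rank, nprocs;
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Comm_size (MPI_COMM_WORLD, &nprocs);

    /* each process exposes one counter, analogous to num_fenced_peers */
    MPI_Win win;
    long *fence_counter;
    MPI_Win_allocate (sizeof (long), sizeof (long), MPI_INFO_NULL,
                      MPI_COMM_WORLD, &fence_counter, &win);
    *fence_counter = 0;

    /* make sure every counter is reset before anyone starts incrementing,
     * mirroring the coll_barrier issued before the atomic adds in the patch */
    MPI_Barrier (MPI_COMM_WORLD);

    MPI_Win_lock_all (MPI_MODE_NOCHECK, win);

    /* ... the RMA operations of the access epoch would go here ... */

    /* post an atomic increment to every peer's counter (self included) */
    long one = 1;
    for (int peer = 0; peer < nprocs; ++peer) {
        MPI_Accumulate (&one, 1, MPI_LONG, peer, 0, 1, MPI_LONG, MPI_SUM, win);
    }
    MPI_Win_flush_all (win);

    /* wait until every peer has incremented my counter */
    long seen = 0, dummy = 0;
    do {
        MPI_Fetch_and_op (&dummy, &seen, MPI_LONG, rank, 0, MPI_NO_OP, win);
        MPI_Win_flush (rank, win);
    } while (seen != (long) nprocs);

    MPI_Win_unlock_all (win);
    printf ("rank %d: fence barrier complete (%ld peers)\n", rank, seen);

    MPI_Win_free (&win);
    MPI_Finalize ();
    return 0;
}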

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 6 additions & 0 deletions
@@ -262,6 +262,12 @@ struct ompi_osc_rdma_module_t {
      */
     bool btl_support_remote_completion;
 
+    /** ordering requirement on selected btls.
+     * If all btls support remote completion, btl_order is MCA_BTL_NO_ORDER;
+     * otherwise, it will be MCA_BTL_IN_ORDER_RDMA_ATOMICS;
+     */
+    int btl_order;
+
     /** array of peer state. Used when local leader is NOT used */
     uintptr_t *peer_state_array;

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 1 addition & 1 deletion
@@ -758,7 +758,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
 
     do {
         ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
-                            local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
+                            local_handle, target_handle, len, 0, module->btl_order,
                             ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
         if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
             break;

ompi/mca/osc/rdma/osc_rdma_active_target.c

Lines changed: 89 additions & 1 deletion
@@ -357,6 +357,7 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t *
     return ret;
 }
 
+
 int ompi_osc_rdma_start_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t *win)
 {
     ompi_osc_rdma_module_t *module = GET_MODULE(win);
@@ -590,6 +591,82 @@ int ompi_osc_rdma_test_atomic (ompi_win_t *win, int *flag)
     return OMPI_SUCCESS;
 }
 
+/**
+ * This function implements a different barrier mechanism for fence,
+ * used when any of the selected btls does not support remote completion.
+ * The barrier is based on imposing the MCA_BTL_IN_ORDER_RDMA_ATOMICS
+ * ordering requirement on the selected btls.
+ */
+static
+int ompi_osc_rdma_fence_barrier_by_ordered_channel (ompi_win_t *win)
+{
+    ompi_osc_rdma_module_t *module = GET_MODULE(win);
+    ompi_osc_rdma_state_t *state = module->state;
+    ompi_osc_rdma_sync_t *sync = &module->all_sync;
+    ompi_osc_rdma_peer_t **peers;
+    ompi_group_t *group;
+    int num_peers;
+    int ret;
+
+    assert(module->btl_order == MCA_BTL_IN_ORDER_RDMA_ATOMICS);
+    OPAL_THREAD_LOCK(&module->lock);
+
+    if (ompi_comm_size(module->comm) == 1) {
+        OPAL_THREAD_UNLOCK(&(module->lock));
+        return OMPI_SUCCESS;
+    }
+
+    ret = ompi_comm_group(module->comm, &group);
+    if (OMPI_SUCCESS != ret) {
+        OPAL_THREAD_UNLOCK(&(module->lock));
+        return ret;
+    }
+
+    num_peers = sync->num_peers;
+    assert(ompi_group_size(group) == num_peers);
+    peers = ompi_osc_rdma_get_peers(module, group);
+    if (NULL == peers) {
+        OPAL_THREAD_UNLOCK(&(module->lock));
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    module->state->num_fenced_peers = 0;
+    OPAL_THREAD_UNLOCK(&module->lock);
+    ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
+    if (ret) {
+        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "barrier failed!");
+        return ret;
+    }
+
+    /* for each process in the group, increment its number of fenced peers */
+    for (int i = 0 ; i < num_peers; ++i) {
+        ompi_osc_rdma_peer_t *peer = peers[i];
+        intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_fenced_peers);
+
+        /* the use of peer local state requires the selected btls to support remote completion;
+         * if that were the case, this function would not have been called
+         */
+        assert (!ompi_osc_rdma_peer_local_state (peer));
+        ret = ompi_osc_rdma_lock_btl_op (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, true);
+        if (OMPI_SUCCESS != ret) {
+            return ret;
+        }
+    }
+
+    ompi_osc_rdma_release_peers (peers, num_peers);
+    ompi_group_free (&group);
+
+    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "increased fenced_peer counter of all peers");
+    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "waiting for all peers to increase my counter");
+    while (num_peers != state->num_fenced_peers) {
+        ompi_osc_rdma_progress (module);
+        opal_atomic_mb ();
+    }
+
+    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "received fence message from all peers");
+    return OMPI_SUCCESS;
+}
+
 int ompi_osc_rdma_fence_atomic (int mpi_assert, ompi_win_t *win)
 {
     ompi_osc_rdma_module_t *module = GET_MODULE(win);
@@ -627,7 +704,18 @@ int ompi_osc_rdma_fence_atomic (int mpi_assert, ompi_win_t *win)
     ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
 
     /* ensure all writes to my memory are complete (both local stores, and RMA operations) */
-    ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
+    if (module->btl_support_remote_completion) {
+        /* if all selected btls support remote completion, then all RMA operations have finished
+         * on the remote side. A barrier is enough to complete the fence.
+         */
+        ret = module->comm->c_coll->coll_barrier(module->comm, module->comm->c_coll->coll_barrier_module);
+    } else {
+        /*
+         * if any selected btl does not support remote completion, we have to send a completion
+         * message (through the same endpoint used for data transfer) to every peer, then wait for a message from every peer.
+         */
+        ret = ompi_osc_rdma_fence_barrier_by_ordered_channel(win);
+    }
 
     if (mpi_assert & MPI_MODE_NOSUCCEED) {
         /* as specified in MPI-3 p 438 3-5 the fence can end an epoch. it isn't explicitly

ompi/mca/osc/rdma/osc_rdma_comm.c

Lines changed: 3 additions & 3 deletions
@@ -97,7 +97,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde
 
     do {
         ret = btl->btl_get (btl, endpoint, ptr, aligned_addr,
-                            local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER,
+                            local_handle, source_handle, aligned_len, 0, module->btl_order,
                             ompi_osc_get_data_complete, (void *) &read_complete, NULL);
         if (!ompi_osc_rdma_oor (ret)) {
             break;
@@ -455,7 +455,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee
 
     do {
         ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
-                            local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER,
+                            local_handle, target_handle, size, 0, module->btl_order,
                             cb, context, cbdata);
         if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
             return OMPI_SUCCESS;
@@ -705,7 +705,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
     do {
         ret = btl->btl_get (btl, peer->data_endpoint, ptr,
                             aligned_source_base, local_handle, source_handle,
-                            aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
+                            aligned_len, 0, module->btl_order, ompi_osc_rdma_get_complete,
                             request, frag);
         if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
             return OMPI_SUCCESS;

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 10 additions & 1 deletion
@@ -926,7 +926,7 @@ static int allocate_module_state(ompi_osc_rdma_module_t *module, void **base, si
         return ret;
 
     if (!module->btl_support_remote_completion) {
-        /* if btl does not support remote_completion, the local leader optimization
+        /* if any selected btl does not support remote_completion, the local leader optimization
         * cannot be used, which means each endpoint will communicate with the peer
         * directly to update its state (instead of through the peer's local leader).
         * Therefore each endpoint need to have the state pointer of each peer.
@@ -936,6 +936,15 @@ static int allocate_module_state(ompi_osc_rdma_module_t *module, void **base, si
         if (OPAL_UNLIKELY(OMPI_SUCCESS !=ret)) {
             return ret;
         }
+
+        /* If any selected btl does not support remote completion, fence cannot use
+         * the MPI_Barrier based implementation, but has to use an implementation
+         * based on ordered RDMA and atomic operations. For that we have to impose
+         * the ordering requirement on each RDMA and atomic operation.
+         */
+        module->btl_order = MCA_BTL_IN_ORDER_RDMA_ATOMICS;
+    } else {
+        module->btl_order = MCA_BTL_NO_ORDER;
     }
 }

ompi/mca/osc/rdma/osc_rdma_lock.h

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t
     if (NULL != pending_op->op_frag) {
         ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer,
                                             (intptr_t) address, pending_op->op_frag->handle, address_handle,
-                                            op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
+                                            op, operand, flags, module->btl_order, ompi_osc_rdma_atomic_complete,
                                             (void *) pending_op, NULL);
     }
 
@@ -148,7 +148,7 @@ static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t
     /* spin until the btl has accepted the operation */
     do {
         ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle,
-                                           op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
+                                           op, operand, flags, module->btl_order, ompi_osc_rdma_atomic_complete,
                                            (void *) pending_op, NULL);
 
         if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) {

ompi/mca/osc/rdma/osc_rdma_types.h

Lines changed: 6 additions & 0 deletions
@@ -136,6 +136,12 @@ struct ompi_osc_rdma_state_t {
     osc_rdma_atomic_counter_t num_post_msgs;
     /** counter for number of complete messages received */
     osc_rdma_counter_t num_complete_msgs;
+    /** counter for the number of fenced peers. This counter is used
+     * when any selected btl does NOT support remote completion.
+     * In that case, a process increments this counter on
+     * all of its peers, then waits for all of its peers to increment
+     * its own counter. */
+    osc_rdma_counter_t num_fenced_peers;
     /** lock for the region state to ensure consistency */
     ompi_osc_rdma_lock_t regions_lock;
     /** displacement unit for this process */
