Skip to content

Commit b95b89a

Browse files
committed
osc/rdma: do not use local leader when using btl to update peer state
On each node there is a local leader, which sets up shared memory, and each process maps its state into the leader's shared memory. Peers' states are updated through the local leader when the peer is on a different node, or when the peer is on the same node but CPU atomics cannot be used. This is incorrect for a btl like TCP, because TCP requires all communication with a peer to go through the same ordered channel. This patch changes the code so that a peer's state is updated using the same channel over which data is transferred. It introduces a new function, gather_peer_state_and_handle(), which uses allgather to gather each peer's state and memory registration handle (if memory registration is used). It then sets each peer's state and handle using the gathered information. Signed-off-by: Wei Zhang <[email protected]>
1 parent b99c1a9 commit b95b89a

File tree

3 files changed

+83
-67
lines changed

3 files changed

+83
-67
lines changed

ompi/mca/osc/rdma/osc_rdma.h

+5
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,11 @@ struct ompi_osc_rdma_module_t {
255255
/** lock for peer hash table/array */
256256
opal_mutex_t peer_lock;
257257

258+
/** array of peers' state pointers */
259+
uintptr_t *peer_state_array;
260+
261+
/** array of peers' state memory registration handles */
262+
char *peer_state_handle_array;
258263

259264
/** BTL(s) in use. Currently this is only used to support RDMA emulation over
260265
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this

ompi/mca/osc/rdma/osc_rdma_component.c

+72-27
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,60 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
446446
return OMPI_SUCCESS;
447447
}
448448

449+
/**
450+
* @brief gather information of module state and module state handle inside a shared comm
451+
*
452+
* @param module[in] ompi osc rdma module
453+
* @param peer_state_array
454+
*/
455+
static int gather_peer_state_and_handle(ompi_osc_rdma_module_t *module)
456+
{
457+
int ret, handle_size, comm_size;
458+
459+
comm_size = ompi_comm_size (module->comm);
460+
461+
module->peer_state_array = calloc(comm_size, sizeof(uintptr_t));
462+
if (NULL == module->peer_state_array) {
463+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate memory for module state array!");
464+
return OMPI_ERR_OUT_OF_RESOURCE;
465+
}
466+
467+
ret = module->comm->c_coll->coll_allgather(&module->state, sizeof(uintptr_t), MPI_BYTE,
468+
module->peer_state_array, sizeof(uintptr_t), MPI_BYTE,
469+
module->comm, module->comm->c_coll->coll_allgather_module);
470+
if (OMPI_SUCCESS != ret) {
471+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "module state allgather failed with ompi error code %d", ret);
472+
return ret;
473+
}
474+
475+
if (module->use_memory_registration) {
476+
handle_size = module->selected_btls[0]->btl_registration_handle_size;
477+
module->peer_state_handle_array = calloc(comm_size, handle_size);
478+
if (NULL == module->peer_state_handle_array) {
479+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate memory for module state handle array!");
480+
return OMPI_ERR_OUT_OF_RESOURCE;
481+
}
482+
483+
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->state, module->state_size,
484+
MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle);
485+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
486+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "module state memory registration failed with ompi error code %d", ret);
487+
return ret;
488+
}
489+
490+
ret = module->comm->c_coll->coll_allgather(module->state_handle, handle_size, MPI_BYTE,
491+
module->peer_state_handle_array, handle_size, MPI_BYTE,
492+
module->comm, module->comm->c_coll->coll_allgather_module);
493+
if (OMPI_SUCCESS != ret) {
494+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "module state allgather failed with ompi error code %d", ret);
495+
return ret;
496+
}
497+
}
498+
499+
return 0;
500+
}
501+
502+
449503
static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size)
450504
{
451505
size_t total_size, local_rank_array_size, leader_peer_data_size;
@@ -491,20 +545,19 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
491545
*base = (void *) ((intptr_t) module->node_comm_info + leader_peer_data_size);
492546
}
493547

494-
/* just go ahead and register the whole segment */
495-
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->rank_array, total_size,
496-
MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle);
497-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
498-
return ret;
499-
}
500-
501548
if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor) {
502549
ret = ompi_osc_rdma_initialize_region (module, base, size);
503550
if (OMPI_SUCCESS != ret) {
504551
return ret;
505552
}
506553
}
507554

555+
ret = gather_peer_state_and_handle(module);
556+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
557+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment");
558+
return ret;
559+
}
560+
508561
ret = ompi_osc_rdma_new_peer (module, my_rank, &my_peer);
509562
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
510563
return ret;
@@ -711,16 +764,6 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
711764
if (0 == local_rank) {
712765
/* unlink the shared memory backing file */
713766
opal_shmem_unlink (&module->seg_ds);
714-
/* just go ahead and register the whole segment */
715-
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size,
716-
MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle);
717-
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
718-
state_region->base = (intptr_t) module->segment_base;
719-
if (module->state_handle) {
720-
memcpy (state_region->btl_handle_data, module->state_handle,
721-
module->selected_btls[0]->btl_registration_handle_size);
722-
}
723-
}
724767
}
725768

726769
/* synchronization to make sure memory is registered */
@@ -749,6 +792,11 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
749792
break;
750793
}
751794

795+
ret = gather_peer_state_and_handle(module);
796+
if (OPAL_UNLIKELY(OMPI_SUCCESS !=ret)) {
797+
break;
798+
}
799+
752800
offset = data_base;
753801
ompi_osc_rdma_peer_t *local_leader;
754802
for (int i = 0 ; i < local_size ; ++i) {
@@ -777,18 +825,15 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
777825
peer->state = (osc_rdma_counter_t) peer_state;
778826
peer->state_endpoint = NULL;
779827
} else {
780-
/* use my endpoint handle to modify the peer's state */
781828
if (module->use_memory_registration) {
782-
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
783-
}
784-
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
785-
if (i==0) {
786-
peer->state_endpoint = peer->data_endpoint;
787-
peer->state_btl_index = peer->data_btl_index;
788-
} else {
789-
peer->state_endpoint = local_leader->state_endpoint;
790-
peer->state_btl_index = local_leader->state_btl_index;
829+
assert(module->peer_state_handle_array);
830+
peer->state_handle = (mca_btl_base_registration_handle_t *)(module->peer_state_handle_array + peer_rank * module->selected_btls[0]->btl_registration_handle_size);
791831
}
832+
833+
assert(NULL != module->peer_state_array);
834+
peer->state = (osc_rdma_counter_t)module->peer_state_array[peer_rank];
835+
peer->state_endpoint = peer->data_endpoint;
836+
peer->state_btl_index = peer->data_btl_index;
792837
}
793838

794839
if (my_rank == peer_rank) {

ompi/mca/osc/rdma/osc_rdma_peer.c

+6-40
Original file line numberDiff line numberDiff line change
@@ -138,50 +138,16 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd
138138
registration_handle_size = module->selected_btls[0]->btl_registration_handle_size;
139139
}
140140

141-
/* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code
142-
* calculates the node and offset the mapping can be found. once the mapping has been read the state
143-
* part of the peer structure can be initialized. */
144-
node_id = peer->rank / RANK_ARRAY_COUNT(module);
145-
array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size);
146-
147-
/* the node leader rank is stored in the length field */
148-
node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id);
149-
array_index = peer->rank % RANK_ARRAY_COUNT(module);
150-
151-
array_pointer = array_peer_data->base + array_index * sizeof (rank_data);
152-
153-
/* lookup the btl endpoint needed to retrieve the mapping */
154-
ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint);
155-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
156-
return OMPI_ERR_UNREACH;
157-
}
158-
159-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64
160-
", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data));
161-
162-
ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer,
163-
(mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data,
164-
&rank_data, sizeof (rank_data));
165-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
166-
return ret;
167-
}
168-
169-
/* initialize the state part of the peer object. NTH: for now the state data is for every node is stored on
170-
* every node. this gives a good balance of code complexity and memory usage at this time. we take advantage
171-
* of this by re-using the endpoint and pointer stored in the node_comm_info array. */
172-
node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size);
173-
174-
peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank;
141+
assert(NULL != module->peer_stat_array);
142+
peer->state = module->peer_state_array[peer->rank];
175143

176144
if (registration_handle_size) {
177-
peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data;
145+
assert(NULL != module->peer_state_array);
146+
peer->state_handle = (mca_btl_base_registration_handle_t *)(module->peer_state_handle_array + peer->rank * registration_handle_size);
178147
}
179148

180-
ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id),
181-
&peer->state_btl_index, &peer->state_endpoint);
182-
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
183-
return OPAL_ERR_UNREACH;
184-
}
149+
peer->state_btl_index = peer->data_btl_index;
150+
peer->state_endpoint = peer->data_endpoint;
185151

186152
/* nothing more to do for dynamic memory windows */
187153
if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {

0 commit comments

Comments
 (0)