From 09f8f750ad1a795c36198a2260acda90798be4d9 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Fri, 17 Sep 2021 17:27:04 +0000 Subject: [PATCH 1/9] btl/base,ofi: adjust the MCA_BTL_ATOMIC_SUPPORTS_GLOB flag The MCA_BTL_ATOMIC_SUPPORTS_GLOB flag indicates whether a btl's atomics can be mixed with CPU atomics operations. The active message RDMA does not support this capability, therefore this patch unsets the flag for active message RDMA. btl/ofi's RDMA does support this capability because it requests delivery complete from libfabric, therefore this patch sets the flag for btl/ofi. Signed-off-by: Wei Zhang --- opal/mca/btl/base/btl_base_am_rdma.c | 2 +- opal/mca/btl/ofi/btl_ofi_module.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/opal/mca/btl/base/btl_base_am_rdma.c b/opal/mca/btl/base/btl_base_am_rdma.c index 4feeff87c24..474dd1803c3 100644 --- a/opal/mca/btl/base/btl_base_am_rdma.c +++ b/opal/mca/btl/base/btl_base_am_rdma.c @@ -1114,7 +1114,7 @@ int mca_btl_base_am_rdma_init(mca_btl_base_module_t *btl) /* emulated RDMA atomics can support the full range of atomics. for * now only a handful are supported. 
*/ - btl->btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_GLOB | MCA_BTL_ATOMIC_SUPPORTS_CSWAP + btl->btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_CSWAP + | MCA_BTL_ATOMIC_SUPPORTS_32BIT | MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR | MCA_BTL_ATOMIC_SUPPORTS_SWAP diff --git a/opal/mca/btl/ofi/btl_ofi_module.c b/opal/mca/btl/ofi/btl_ofi_module.c index cffa0c27317..bd0b492bddb 100644 --- a/opal/mca/btl/ofi/btl_ofi_module.c +++ b/opal/mca/btl/ofi/btl_ofi_module.c @@ -393,7 +393,8 @@ mca_btl_ofi_module_t *mca_btl_ofi_module_alloc(int mode) module->super.btl_flags |= MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA; - module->super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_SWAP + module->super.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_GLOB + | MCA_BTL_ATOMIC_SUPPORTS_ADD | MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_CSWAP | MCA_BTL_ATOMIC_SUPPORTS_32BIT; From c8495bdda7cc7f19c770a6308f02832eefac4ee6 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 20 Sep 2021 21:59:09 +0000 Subject: [PATCH 2/9] osc/rdma: fix bugs in evaluating using_cpu_atomics The flag "using_cpu_atomics" indicates whether CPU atomics can be used to update counters for osc/rdma peers on the same instance. This patch fixes two bugs in the way "using_cpu_atomics" is evaluated in allocate_state_shared(): First, the current code used "btl->btl_flags", which is a typo. The right flag to use is "btl->btl_atomic_flags". Second, the current code always set "using_cpu_atomics" to true on a single node, which is not right because even on a single node, the selected btl does not necessarily support mixed use of atomics. 
Signed-off-by: Wei Zhang --- ompi/mca/osc/rdma/osc_rdma_component.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index a2e8dca6fa7..f8dfb6a33de 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -594,10 +594,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s module->single_node = local_size == global_size; module->use_cpu_atomics = module->single_node; - if (!module->single_node) { - for (int i = 0 ; i < module->btls_in_use ; ++i) { - module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); - } + for (int i = 0 ; i < module->btls_in_use ; ++i) { + module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); } if (1 == local_size) { From 131bb6769b4233ee194193232aa8b348e069514d Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Tue, 21 Sep 2021 01:03:36 +0000 Subject: [PATCH 3/9] osc/rdma: fix the fail condition in osc_rdma_new_peer() osc_rdma_new_peer() creates a new peer. It should fail when the following 3 conditions are all met: 1. cannot find an endpoint 2. the selected btl does not support atomic global 3. the peer is not self However, the current code fails when either 2 or 3 is met. This patch fixed it. 
Signed-off-by: Wei Zhang --- ompi/mca/osc/rdma/osc_rdma_peer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index c6689d78812..38d605de85a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -84,8 +84,9 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, /* find a btl/endpoint to use for this peer */ int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &module_btl_index, &endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret && !((module->selected_btls[0]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) && - peer_id == ompi_comm_rank (module->comm)))) { + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret && + !(module->selected_btls[0]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) && + (peer_id != ompi_comm_rank (module->comm)))) { return ret; } From 4a845a94507c65a6e1223ed2eb3cc574a082ed68 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Tue, 21 Sep 2021 01:12:03 +0000 Subject: [PATCH 4/9] osc/rdma: do not use local leader optimization for active message RDMA The local leader optimization means that: on each node, a process was designated as the local leader, who set up shared memory, and other processes on the same node would map their states to the local leader's shared memory. When a process tries to update a peer process's state, the process will do that through atomic actions on the local leader's memory. The peer's state is then updated through shared memory. Essentially, using the local leader optimization means two different channels are used to transfer data and to update a peer's state. This optimization is incorrect for BTLs that use active message RDMA. Active message RDMA uses send/receive to emulate put and atomics, and its put implementation does not guarantee delivery completion, e.g. when the initiator gets completion for a put action, it only means data has been sent. 
it does not mean the data has been delivered to the target buffer. Therefore, if peer's state is updated through a different communication channel, it can happen that peer's state is updated before the put action is completed on the peer, which will cause data corruption. This patch made the change that: for active message RDMA, peer's state is updated using the same channel data is transferred (e.g diabling the local leader optimization). To achieve that, each process need to have the pointer to each peer's state, for which this patch introduced a function gather_peer_state(). Note because active message RDMA does not use memory registration, the state_handle is not gathered. This patch then sets peer's state pointer using gathered information, and use the same endpoint to update data and transfer data. Signed-off-by: Wei Zhang --- ompi/mca/osc/rdma/osc_rdma.h | 15 +++++ ompi/mca/osc/rdma/osc_rdma_component.c | 88 ++++++++++++++++++++++--- ompi/mca/osc/rdma/osc_rdma_peer.c | 89 +++++++++++++++----------- 3 files changed, 145 insertions(+), 47 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 6a58c19f86c..19a2f577d19 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -255,6 +255,21 @@ struct ompi_osc_rdma_module_t { /** lock for peer hash table/array */ opal_mutex_t peer_lock; + /** flag to indicate wether to use the local leader optimization, + * in which on each node a process was designated as local leader. + * local leader setup a shared memory region, and all other processes + * on the same node map their state to that region. When a process + * want to update a peer's state, the process uses atomics on the peer's + * local leader to update peer's state through shared memory region. 
+ * BTLs that uses active message RDMA cannot support such optimization, + * because active message RDMA uses send/receive to emulate put and + * atomics, so the atomcis and RMA operation must be through the same + * ordered channel. + */ + bool use_local_leader; + + /** array of peer state. Used when local leader is NOT used */ + uintptr_t *peer_state_array; /** BTL(s) in use. Currently this is only used to support RDMA emulation over * non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index f8dfb6a33de..f074c65895d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -446,6 +446,47 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void return OMPI_SUCCESS; } +/** + * @brief gather pointer of module->state and inside the world comm + * + * This function is used when local leader optimization is NOT used. + * In which case, each process communicate with its peer directly + * to update peer's state counters (instead of communicating with the peer's + * local leader), therefore it need the pointer of peer's state counters, so + * it can use atomics to update the counters. + * + * Note state_handle is not gathered because local leader optimization + * is NOT used only when active message RDMA is used, and active message + * RDMA does not need memory registration. 
+ * + * @param module[in,out] ompi osc rdma module + */ +static int gather_peer_state(ompi_osc_rdma_module_t *module) +{ + int ret, comm_size; + + comm_size = ompi_comm_size (module->comm); + + module->peer_state_array = calloc(comm_size, sizeof(uintptr_t)); + if (NULL == module->peer_state_array) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate memory for module state array!"); + return OMPI_ERR_OUT_OF_RESOURCE; + } + + ret = module->comm->c_coll->coll_allgather(&module->state, sizeof(uintptr_t), MPI_BYTE, + module->peer_state_array, sizeof(uintptr_t), MPI_BYTE, + module->comm, module->comm->c_coll->coll_allgather_module); + if (OMPI_SUCCESS != ret) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "module state allgather failed with ompi error code %d", ret); + return ret; + } + + assert (!module->use_memory_registration); + + return 0; +} + + static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size) { size_t total_size, local_rank_array_size, leader_peer_data_size; @@ -505,6 +546,13 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s } } + if (!module->use_local_leader) { + ret = gather_peer_state(module); + if (OPAL_UNLIKELY(OMPI_SUCCESS !=ret)) { + return ret; + } + } + ret = ompi_osc_rdma_new_peer (module, my_rank, &my_peer); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; @@ -593,9 +641,14 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */ module->single_node = local_size == global_size; module->use_cpu_atomics = module->single_node; - + module->use_local_leader = true; for (int i = 0 ; i < module->btls_in_use ; ++i) { module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + /* the usage of local leader means to use different channels to send data to peer and 
update peer's state. + * When different channels are used, active message RDMA cannot guarantee that put and atomics are completed + * in the same order. + */ + module->use_local_leader = module->use_local_leader && ! (module->selected_btls[i]->btl_flags &(MCA_BTL_FLAGS_PUT_AM | MCA_BTL_FLAGS_ATOMIC_AM_FOP)); } if (1 == local_size) { @@ -749,6 +802,13 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s break; } + if (!module->use_local_leader) { + ret = gather_peer_state(module); + if (OPAL_UNLIKELY(OMPI_SUCCESS !=ret)) { + break; + } + } + offset = data_base; ompi_osc_rdma_peer_t *local_leader; for (int i = 0 ; i < local_size ; ++i) { @@ -777,17 +837,25 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s peer->state = (osc_rdma_counter_t) peer_state; peer->state_endpoint = NULL; } else { - /* use my endpoint handle to modify the peer's state */ - if (module->use_memory_registration) { - peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data; - } - peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i); - if (i==0) { + + if (module->use_local_leader) { + if (module->use_memory_registration) { + peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data; + } + peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i); + if (i==0) { + peer->state_endpoint = peer->data_endpoint; + peer->state_btl_index = peer->data_btl_index; + } else { + peer->state_endpoint = local_leader->state_endpoint; + peer->state_btl_index = local_leader->state_btl_index; + } + } else { + assert (!module->use_memory_registration); + assert (NULL != module->peer_state_array); + peer->state = (osc_rdma_counter_t)module->peer_state_array[peer_rank]; peer->state_endpoint = peer->data_endpoint; peer->state_btl_index = peer->data_btl_index; - } else { - peer->state_endpoint 
= local_leader->state_endpoint; - peer->state_btl_index = local_leader->state_btl_index; } } diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index 38d605de85a..41e90072c0c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -138,49 +138,64 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd registration_handle_size = module->selected_btls[0]->btl_registration_handle_size; } - /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code - * calculates the node and offset the mapping can be found. once the mapping has been read the state - * part of the peer structure can be initialized. */ - node_id = peer->rank / RANK_ARRAY_COUNT(module); - array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size); - - /* the node leader rank is stored in the length field */ - node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id); - array_index = peer->rank % RANK_ARRAY_COUNT(module); - - array_pointer = array_peer_data->base + array_index * sizeof (rank_data); - - /* lookup the btl endpoint needed to retrieve the mapping */ - ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return OMPI_ERR_UNREACH; - } + if (module->use_local_leader) { + /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code + * calculates the node and offset the mapping can be found. once the mapping has been read the state + * part of the peer structure can be initialized. 
*/ + node_id = peer->rank / RANK_ARRAY_COUNT(module); + array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size); + + /* the node leader rank is stored in the length field */ + node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id); + array_index = peer->rank % RANK_ARRAY_COUNT(module); + + array_pointer = array_peer_data->base + array_index * sizeof (rank_data); + + /* lookup the btl endpoint needed to retrieve the mapping */ + ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return OMPI_ERR_UNREACH; + } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64 - ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data)); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64 + ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data)); - ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer, - (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data, - &rank_data, sizeof (rank_data)); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } + ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer, + (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data, + &rank_data, sizeof (rank_data)); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } - /* initialize the state part of the peer object. NTH: for now the state data is for every node is stored on - * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage - * of this by re-using the endpoint and pointer stored in the node_comm_info array. 
*/ - node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size); + /* initialize the state part of the peer object. NTH: for now the state data is for every node is stored on + * every node. this gives a good balance of code complexity and memory usage at this time. we take advantage + * of this by re-using the endpoint and pointer stored in the node_comm_info array. */ + node_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + rank_data.node_id * module->region_size); - peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank; + peer->state = node_peer_data->base + module->state_offset + module->state_size * rank_data.rank; - if (registration_handle_size) { - peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data; - } + if (registration_handle_size) { + peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data; + } - ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id), - &peer->state_btl_index, &peer->state_endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return OPAL_ERR_UNREACH; + ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id), + &peer->state_btl_index, &peer->state_endpoint); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return OPAL_ERR_UNREACH; + } + } else { + assert(NULL != module->peer_state_array); + peer->state = module->peer_state_array[peer->rank]; + + assert(!module->use_memory_registration); + peer->state_handle = NULL; + + /* when local leader optimization is not used, + * same endpoint were used to transfer data and + * update state + */ + peer->state_btl_index = peer->data_btl_index; + peer->state_endpoint = peer->data_endpoint; } /* nothing more to do for dynamic memory windows */ From 3c0ae03e48dd543ca941ac29e26953e4c89b2158 Mon Sep 17 
00:00:00 2001 From: Wei Zhang Date: Sat, 25 Sep 2021 14:32:36 +0000 Subject: [PATCH 5/9] osc/rdma: add pid to shm file name Current shm file name used by osc/rdma is not unique when multiple communicator are on the same instance. This patch add the local leader's pid to the file name to address the issue. Signed-off-by: Wei Zhang --- ompi/mca/osc/rdma/osc_rdma_component.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index f074c65895d..10da47f3053 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -701,9 +701,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (0 == local_rank) { /* allocate the shared memory segment */ - ret = opal_asprintf (&data_file, "%s" OPAL_PATH_SEP "osc_rdma.%s.%x.%d", + ret = opal_asprintf (&data_file, "%s" OPAL_PATH_SEP "osc_rdma.%s.%x.%d.%d", mca_osc_rdma_component.backing_directory, ompi_process_info.nodename, - OMPI_PROC_MY_NAME->jobid, ompi_comm_get_cid(module->comm)); + OMPI_PROC_MY_NAME->jobid, ompi_comm_get_cid(module->comm), getpid()); if (0 > ret) { ret = OMPI_ERR_OUT_OF_RESOURCE; } else { From 6c4cb758da7b2d4ddaa2e9b241988bd0773eaa80 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Sun, 26 Sep 2021 03:31:27 +0000 Subject: [PATCH 6/9] osc/rdma: use pointer to btl to replace btl_index in peer Currently, each peer keep a state_btl_index and a data_btl_index. The btl_index is used to retrive btl used for data transfer and state updating, by calling function mpi_osc_rdma_selected_btl(). This patch simplify the code by directly storing the pointer to the btl inside peer, thus bypassing the call to mpi_osc_rdma_selected_btl(). The function mpi_osc_rdma_selected_btl() is then removed. 
Signed-off-by: Wei Zhang --- ompi/mca/osc/rdma/osc_rdma.h | 5 ---- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 22 ++++++++-------- ompi/mca/osc/rdma/osc_rdma_comm.c | 14 +++++----- ompi/mca/osc/rdma/osc_rdma_comm.h | 4 ++- ompi/mca/osc/rdma/osc_rdma_component.c | 10 ++++---- ompi/mca/osc/rdma/osc_rdma_dynamic.c | 4 +-- ompi/mca/osc/rdma/osc_rdma_lock.h | 20 +++++++-------- ompi/mca/osc/rdma/osc_rdma_peer.c | 34 +++++++++++++------------ ompi/mca/osc/rdma/osc_rdma_peer.h | 13 ++++++---- 9 files changed, 64 insertions(+), 62 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 19a2f577d19..f2243180073 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -645,11 +645,6 @@ static inline bool ompi_osc_rdma_oor (int rc) return (OPAL_SUCCESS != rc && (OPAL_ERR_OUT_OF_RESOURCE == rc || OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc)); } -__opal_attribute_always_inline__ -static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) { - return module->selected_btls[btl_index]; -} - __opal_attribute_always_inline__ static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) { if(btl_index == module->selected_btls_size) { diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 15f0a80714e..6824668beb1 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -156,7 +156,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *selected_btl = peer->data_btl; int32_t atomic_flags = selected_btl->btl_atomic_flags; int 
btl_op, flags; int64_t origin; @@ -176,7 +176,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0]; - return ompi_osc_rdma_btl_fop (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags, + return ompi_osc_rdma_btl_fop (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags, result_addr, true, NULL, NULL, NULL); } @@ -198,7 +198,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap"); - ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, address, target_handle, &old_value, 8); + ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint, address, target_handle, &old_value, 8); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } @@ -213,7 +213,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi ompi_op_reduce (op, (void *) ((intptr_t) origin_addr + dt->super.true_lb), (void*)((intptr_t) &new_value + offset), 1, dt); } - ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, address, target_handle, + ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl, peer->data_endpoint, address, target_handle, old_value, new_value, 0, (int64_t*)&new_value); if (OPAL_SUCCESS != ret || new_value == old_value) { break; @@ -234,7 +234,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *selected_btl = peer->data_btl; int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t 
origin; @@ -262,7 +262,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo *((int64_t *) origin_addr)); /* if we locked the peer its best to wait for completion before returning */ - return ompi_osc_rdma_btl_op (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin, + return ompi_osc_rdma_btl_op (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags, true, NULL, NULL, NULL); } @@ -375,7 +375,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v /* set up the request */ request->to_free = ptr; - ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, + ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, ptr, len); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; @@ -661,7 +661,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; int32_t atomic_flags = btl->btl_atomic_flags; const size_t size = datatype->super.size; int64_t compare, source; @@ -679,7 +679,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. 
compare: 0x%" PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr)); - ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, + ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, compare, source, flags, result_addr); if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired); @@ -715,7 +715,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; unsigned long len = datatype->super.size; mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -728,7 +728,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, ", sync %p", len, target_address, (void *) sync); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl get..."); - ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, target_address, + ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, result_addr, len); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 449bbea0641..46a012da5dc 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -54,11 +54,11 @@ static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struc ((bool *) context)[0] = true; } -int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_index, +int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t 
*module, + struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index); const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -444,7 +444,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee mca_btl_base_registration_handle_t *local_handle, size_t size, mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync " @@ -481,7 +481,7 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; mca_btl_base_registration_handle_t *local_handle = NULL; mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -600,7 +600,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -736,7 +736,7 @@ static inline int ompi_osc_rdma_put_w_req 
(ompi_osc_rdma_sync_t *sync, const voi ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; int ret; @@ -779,7 +779,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + mca_btl_base_module_t *btl = peer->data_btl; mca_btl_base_registration_handle_t *source_handle; uint64_t source_address; ptrdiff_t source_span, source_lb; diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index efb305a571e..6427272a450 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -103,6 +103,7 @@ int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *or * @brief read data from a remote memory region (blocking) * * @param[in] module osc rdma module + * @param[in] btl btl module * @param[in] endpoint btl endpoint * @param[in] source_address remote address to read from * @param[in] source_handle btl registration handle for remote region (must be valid for the entire region) @@ -113,7 +114,8 @@ int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *or * data that is stored on the remote peer. The peer object does not have to be fully initialized to * work. Only the btl endpoint is needed. 
*/ -int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_index, +int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, + struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len); diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 10da47f3053..80a8c7da23d 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -575,7 +575,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s } else { /* use my endpoint handle to modify the peer's state */ my_peer->state_handle = module->state_handle; - my_peer->state_btl_index = my_peer->data_btl_index; + my_peer->state_btl = my_peer->data_btl; my_peer->state_endpoint = my_peer->data_endpoint; } @@ -845,17 +845,17 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i); if (i==0) { peer->state_endpoint = peer->data_endpoint; - peer->state_btl_index = peer->data_btl_index; + peer->state_btl = peer->data_btl; } else { peer->state_endpoint = local_leader->state_endpoint; - peer->state_btl_index = local_leader->state_btl_index; + peer->state_btl = local_leader->state_btl; } } else { assert (!module->use_memory_registration); assert (NULL != module->peer_state_array); peer->state = (osc_rdma_counter_t)module->peer_state_array[peer_rank]; peer->state_endpoint = peer->data_endpoint; - peer->state_btl_index = peer->data_btl_index; + peer->state_btl = peer->data_btl; } } @@ -867,7 +867,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s !module->use_cpu_atomics && temp[i].size && i > 0) { /* use the local leader's endpoint */ peer->data_endpoint = local_leader->data_endpoint; - peer->data_btl_index = 
local_leader->data_btl_index; + peer->data_btl = local_leader->data_btl; } ompi_osc_module_add_peer (module, peer); diff --git a/ompi/mca/osc/rdma/osc_rdma_dynamic.c b/ompi/mca/osc/rdma/osc_rdma_dynamic.c index 8adfa7f8159..73e80eb8dc7 100644 --- a/ompi/mca/osc/rdma/osc_rdma_dynamic.c +++ b/ompi/mca/osc/rdma/osc_rdma_dynamic.c @@ -392,7 +392,7 @@ static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module, osc_rdma_counter_t remote_value; source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, region_count); - ret = ompi_osc_get_data_blocking (module, peer->super.state_btl_index, peer->super.state_endpoint, + ret = ompi_osc_get_data_blocking (module, peer->super.state_btl, peer->super.state_endpoint, source_address, peer->super.state_handle, &remote_value, sizeof (remote_value)); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -433,7 +433,7 @@ static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module, OMPI_OSC_RDMA_LOCK_EXCLUSIVE); source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, regions); - ret = ompi_osc_get_data_blocking (module, peer->super.state_btl_index, peer->super.state_endpoint, + ret = ompi_osc_get_data_blocking (module, peer->super.state_btl, peer->super.state_endpoint, source_address, peer->super.state_handle, peer->regions, region_len); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_THREAD_UNLOCK(&module->lock); diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 36a30a1cc0b..0ffdcde190f 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -37,14 +37,14 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b void *context, void *data, int status); __opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t btl_index, +static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t 
*module, + struct mca_btl_base_module_t *selected_btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t address, mca_btl_base_registration_handle_t *address_handle, int op, int64_t operand, int flags, int64_t *result, const bool wait_for_completion, ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret = OPAL_ERROR; pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); @@ -110,23 +110,23 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result, const bool wait_for_completion) { - return ompi_osc_rdma_btl_fop (module, peer->state_btl_index, peer->state_endpoint, address, peer->state_handle, op, + return ompi_osc_rdma_btl_fop (module, peer->state_btl, peer->state_endpoint, address, peer->state_handle, op, operand, 0, result, wait_for_completion, NULL, NULL, NULL); } __opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t btl_index, +static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, + struct mca_btl_base_module_t *selected_btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t address, mca_btl_base_registration_handle_t *address_handle, int op, int64_t operand, int flags, const bool wait_for_completion, ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, + return ompi_osc_rdma_btl_fop (module, selected_btl, endpoint, address, address_handle, op, operand, flags, NULL, wait_for_completion, cbfunc, cbdata, cbcontext); } @@ 
-181,18 +181,18 @@ __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion) { - return ompi_osc_rdma_btl_op (module, peer->state_btl_index, peer->state_endpoint, address, peer->state_handle, op, + return ompi_osc_rdma_btl_op (module, peer->state_btl, peer->state_endpoint, address, peer->state_handle, op, operand, 0, wait_for_completion, NULL, NULL, NULL); } __opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8_t btl_index, +static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, + struct mca_btl_base_module_t *selected_btl, struct mca_btl_base_endpoint_t *endpoint, uint64_t address, mca_btl_base_registration_handle_t *address_handle, int64_t compare, int64_t value, int flags, int64_t *result) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); @@ -244,7 +244,7 @@ __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t value, ompi_osc_rdma_lock_t *result) { - return ompi_osc_rdma_btl_cswap (module, peer->state_btl_index, peer->state_endpoint, address, peer->state_handle, compare, value, + return ompi_osc_rdma_btl_cswap (module, peer->state_btl, peer->state_endpoint, address, peer->state_handle, compare, value, 0, result); } diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index 41e90072c0c..30592d873d2 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -30,13 +30,15 @@ * * @param[in] module osc rdma module * @param[in] peer_id process rank in the module 
communicator - * @param[in] module_btl_index btl index to use + * @param[in] btl_out btl to be used + * @param[in] endpoint endpoint to be used * - * @returns NULL on error - * @returns btl endpoint on success + * @returns OMPI_SUCCESS on success + * @returns ompi error code on error */ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *module, - int peer_id, uint8_t *btl_index_out, + int peer_id, + struct mca_btl_base_module_t **btl_out, struct mca_btl_base_endpoint_t **endpoint) { ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, peer_id); @@ -51,7 +53,7 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; + *btl_out = bml_endpoint->btl_rdma.bml_btls[btl_index].btl; *endpoint = bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; } @@ -64,7 +66,7 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { if (bml_endpoint->btl_eager.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; + *btl_out = bml_endpoint->btl_eager.bml_btls[btl_index].btl; *endpoint = bml_endpoint->btl_eager.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; } @@ -77,13 +79,13 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out) { struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_module_t *btl; ompi_osc_rdma_peer_t *peer; - 
uint8_t module_btl_index = UINT8_MAX; *peer_out = NULL; /* find a btl/endpoint to use for this peer */ - int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &module_btl_index, &endpoint); + int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &btl, &endpoint); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret && !(module->selected_btls[0]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) && (peer_id != ompi_comm_rank (module->comm)))) { @@ -100,7 +102,7 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, } peer->data_endpoint = endpoint; - peer->data_btl_index = module_btl_index; + peer->data_btl = btl; peer->rank = peer_id; *peer_out = peer; @@ -128,7 +130,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd ompi_osc_rdma_rank_data_t rank_data; int registration_handle_size = 0; int node_id, node_rank, array_index; - uint8_t array_btl_index; + struct mca_btl_base_module_t *array_btl; int ret, disp_unit; char *peer_data; @@ -152,7 +154,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd array_pointer = array_peer_data->base + array_index * sizeof (rank_data); /* lookup the btl endpoint needed to retrieve the mapping */ - ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl_index, &array_endpoint); + ret = ompi_osc_rdma_peer_btl_endpoint (module, node_rank, &array_btl, &array_endpoint); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return OMPI_ERR_UNREACH; } @@ -160,7 +162,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64 ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data)); - ret = ompi_osc_get_data_blocking (module, array_btl_index, array_endpoint, array_pointer, + ret = ompi_osc_get_data_blocking (module, array_btl, array_endpoint, array_pointer, 
(mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data, &rank_data, sizeof (rank_data)); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -179,7 +181,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd } ret = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id), - &peer->state_btl_index, &peer->state_endpoint); + &peer->state_btl, &peer->state_endpoint); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return OPAL_ERR_UNREACH; } @@ -194,7 +196,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd * same endpoint were used to transfer data and * update state */ - peer->state_btl_index = peer->data_btl_index; + peer->state_btl = peer->data_btl; peer->state_endpoint = peer->data_endpoint; } @@ -215,7 +217,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd peer_data = alloca (peer_data_size); /* read window data from the end of the target's state structure */ - ret = ompi_osc_get_data_blocking (module, peer->state_btl_index, peer->state_endpoint, + ret = ompi_osc_get_data_blocking (module, peer->state_btl, peer->state_endpoint, peer->state + peer_data_offset, peer->state_handle, peer_data, peer_data_size); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { @@ -264,7 +266,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { ex_peer->super.super.data_endpoint = ex_peer->super.super.state_endpoint; - ex_peer->super.super.data_btl_index = ex_peer->super.super.state_btl_index; + ex_peer->super.super.data_btl = ex_peer->super.super.state_btl; } } diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.h b/ompi/mca/osc/rdma/osc_rdma_peer.h index ef8f8e0605c..86a717babf6 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.h +++ b/ompi/mca/osc/rdma/osc_rdma_peer.h @@ -45,11 +45,14 @@ struct ompi_osc_rdma_peer_t { /** peer flags */ opal_atomic_int32_t flags; - /** index 
into BTL array */ - uint8_t data_btl_index; - - /** index into BTL array */ - uint8_t state_btl_index; + /** btl used for rdma */ + struct mca_btl_base_module_t *data_btl; + + /** btl used for reading/modifying peer state. + * When the local leader optimization is used, + * peer state is read/modified through a different + * btl than the one used for rdma data */ + struct mca_btl_base_module_t *state_btl; }; typedef struct ompi_osc_rdma_peer_t ompi_osc_rdma_peer_t; From b0197d91d828401ba0b7b8593e026f2215cb70eb Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Sun, 26 Sep 2021 03:37:20 +0000 Subject: [PATCH 7/9] osc/rdma: use btl/self for self communication as last resort ompi_osc_rdma_peer_btl_endpoint() is used to select btl and endpoint to communicate with a peer. This patch added a change to ompi_osc_rdma_peer_btl_endpoint() that: if no btl/endpoint has been selected for self communication, and if bml has the btl/self, then use btl/self for self communication. It also made a change to ompi_osc_rdma_new_peer(): Currently if no btl can be found and the peer is self, the function still continues. This patch made the function fail in this case, because the ability to do self communication is essential for osc/rdma.
Signed-off-by: Wei Zhang --- ompi/mca/osc/rdma/osc_rdma_peer.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index 30592d873d2..2543ef4eecb 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -73,7 +73,19 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul } } - /* unlikely but can happen when creating a peer for self */ + if (peer_id == ompi_comm_rank (module->comm)) { + for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { + struct mca_btl_base_module_t *btl; + + btl = bml_endpoint->btl_rdma.bml_btls[btl_index].btl; + if (strcmp(btl->btl_component->btl_version.mca_component_name, "self")==0) { + *btl_out = btl; + *endpoint = bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint; + return OMPI_SUCCESS; + } + } + } + return OMPI_ERR_UNREACH; } @@ -86,9 +98,7 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, /* find a btl/endpoint to use for this peer */ int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &btl, &endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret && - !(module->selected_btls[0]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) && - (peer_id != ompi_comm_rank (module->comm)))) { + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } From ef501d88a0b5a320ec2b54b5ee3ee944c55212a0 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 27 Sep 2021 17:10:38 +0000 Subject: [PATCH 8/9] btl/self: claim support of MCA_BTL_ATOMIC_SUPPORTS_GLOB The flag MCA_BTL_ATOMIC_SUPPORTS_GLOB indicates that a btl supports mixed use of CPU atomics with its own atomics. The btl/self supports this because for btl/self the two are the same thing. Therefore this patch added the flag to btl_atomic_flags of btl/self.
Signed-off-by: Wei Zhang --- opal/mca/btl/self/btl_self_component.c | 1 + 1 file changed, 1 insertion(+) diff --git a/opal/mca/btl/self/btl_self_component.c b/opal/mca/btl/self/btl_self_component.c index 576716e4d73..c9fc5bdca0b 100644 --- a/opal/mca/btl/self/btl_self_component.c +++ b/opal/mca/btl/self/btl_self_component.c @@ -107,6 +107,7 @@ static int mca_btl_self_component_register(void) mca_btl_self.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_self.btl_min_rdma_pipeline_size = 0; mca_btl_self.btl_flags = MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_SEND; + mca_btl_self.btl_atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_GLOB; mca_btl_self.btl_bandwidth = 100; mca_btl_self.btl_latency = 0; mca_btl_base_param_register(&mca_btl_self_component.super.btl_version, &mca_btl_self); From 5d9e5a2c18daf3ce8d461a21b6775eeee7ead41b Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Mon, 27 Sep 2021 16:34:38 +0000 Subject: [PATCH 9/9] osc/rdma: adjust the way use_cpu_atomics is evaluated Currently, use_cpu_atomics is a variable in osc_rdma_module. However, whether CPU atomics can be used is a peer-level decision. So this patch removes use_cpu_atomics from osc_rdma_module, and evaluates it for each peer.
Signed-off-by: Wei Zhang --- ompi/mca/osc/rdma/osc_rdma.h | 3 --- ompi/mca/osc/rdma/osc_rdma_component.c | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index f2243180073..85132768e8c 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -148,9 +148,6 @@ struct ompi_osc_rdma_module_t { /** value of same_size info key for this window */ bool same_size; - /** CPU atomics can be used */ - bool use_cpu_atomics; - /** passive-target synchronization will not be used in this window */ bool no_locks; diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 80a8c7da23d..5c347996c1a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -492,6 +492,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s size_t total_size, local_rank_array_size, leader_peer_data_size; ompi_osc_rdma_peer_t *my_peer; int ret, my_rank; + bool use_cpu_atomics; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state"); @@ -569,7 +570,9 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; my_peer->state = (uint64_t) (uintptr_t) module->state; - if (module->use_cpu_atomics) { + assert(NULL != my_peer->data_btl); + use_cpu_atomics = (my_peer->data_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + if (use_cpu_atomics) { /* all peers are local or it is safe to mix cpu and nic atomics */ my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; } else { @@ -588,7 +591,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = size; } - if (!module->use_cpu_atomics) { + if (!use_cpu_atomics) { if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { /* base is local and cpu atomics are available */ ex_peer->super.base_handle = 
module->state_handle; @@ -622,6 +625,7 @@ static int synchronize_errorcode(int errorcode, ompi_communicator_t *comm) static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, size_t size) { ompi_communicator_t *shared_comm; + bool use_cpu_atomics_on_peer; unsigned long offset, total_size; unsigned long state_base, data_base; int local_rank, local_size, ret; @@ -640,10 +644,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */ module->single_node = local_size == global_size; - module->use_cpu_atomics = module->single_node; module->use_local_leader = true; for (int i = 0 ; i < module->btls_in_use ; ++i) { - module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); /* the usage of local leader means to use different channels to send data to peer and update peer's state. * When different channels are used, active message RDMA cannot guarantee that put and atomics are completed * in the same order. 
@@ -830,8 +832,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; + use_cpu_atomics_on_peer = (peer->data_btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); /* set up peer state */ - if (module->use_cpu_atomics) { + if (use_cpu_atomics_on_peer) { /* all peers are local or it is safe to mix cpu and nic atomics */ peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; peer->state = (osc_rdma_counter_t) peer_state; @@ -864,7 +867,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor && MPI_WIN_FLAVOR_CREATE != module->flavor && - !module->use_cpu_atomics && temp[i].size && i > 0) { + !use_cpu_atomics_on_peer && temp[i].size && i > 0) { /* use the local leader's endpoint */ peer->data_endpoint = local_leader->data_endpoint; peer->data_btl = local_leader->data_btl; @@ -873,7 +876,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ompi_osc_module_add_peer (module, peer); if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) { - if (module->use_cpu_atomics && peer_rank == my_rank) { + if (use_cpu_atomics_on_peer && peer_rank == my_rank) { peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; } /* nothing more to do */ @@ -889,7 +892,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = temp[i].size; } - if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { + if (use_cpu_atomics_on_peer && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { /* base is local and cpu atomics are available */ if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { ex_peer->super.base = (uintptr_t) module->segment_base + offset;