From dff2464e4ee99582af00351ab521a48e27cc43fe Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 9 Feb 2022 17:33:42 +0000 Subject: [PATCH 01/17] btl/am-rdma: Pass the correct pointer to get The put over get implementation passed the wrong pointer to the cbdata of the get call, resulting in all kinds of badness when the get completed. This patch passes through the expected pointer. Signed-off-by: Brian Barrett (cherry picked from commit 8fedcc0a7cc7a24231587c6d4ca39a5bf72d4d05) --- opal/mca/btl/base/btl_base_am_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/btl/base/btl_base_am_rdma.c b/opal/mca/btl/base/btl_base_am_rdma.c index 2b1e3400195..ae9683936e6 100644 --- a/opal/mca/btl/base/btl_base_am_rdma.c +++ b/opal/mca/btl/base/btl_base_am_rdma.c @@ -701,7 +701,7 @@ static int am_rdma_target_put(mca_btl_base_module_t *btl, (struct mca_btl_base_registration_handle_t *) (*operation)->local_handle_data, (struct mca_btl_base_registration_handle_t *) (*operation)->remote_handle_data, hdr->data.rdma.size, /*flags=*/0, MCA_BTL_NO_ORDER, am_rdma_rdma_complete, - operation, NULL); + *operation, NULL); if (OPAL_SUCCESS != ret) { OBJ_RELEASE(*operation); } From 65ae652e1b8c3ea49ee119bbaa8bfe9dc7822947 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 7 Jan 2022 17:35:56 +0000 Subject: [PATCH 02/17] osc/rdma: Clean up initialization debugging Clean up language around btl selection phases to match rest of the code (accelerated/alternate). Also switch component initialization code to use opal_output_verbose rather than OSC_RDMA_VERBOSE, so that the debugging prints can be enabled on a non-debug build. Signed-off-by: Brian Barrett (cherry picked from commit b6b16a6cb769306d763859236e232d8a7e0e9fce) --- ompi/mca/osc/rdma/osc_rdma_component.c | 73 +++++++++++++++++--------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 42e93287225..3d4931e272b 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -442,7 +442,8 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s int ret, my_rank; size_t memory_alignment = module->memory_alignment; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "allocating private internal state"); my_rank = ompi_comm_rank (module->comm); @@ -598,7 +599,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s return allocate_state_single (module, base, size); } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating shared internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "allocating shared internal state"); local_rank_array_size = sizeof (ompi_osc_rdma_rank_data_t) * RANK_ARRAY_COUNT (module); leader_peer_data_size = module->region_size * module->node_count; @@ -654,7 +656,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size); free (data_file); if (OPAL_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create shared memory segment"); } } } @@ -672,7 +675,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s module->segment_base = opal_shmem_segment_attach (&module->seg_ds); if (NULL == module->segment_base) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to attach to the shared memory segment"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to attach to the shared memory segment"); ret = OPAL_ERROR; } @@ -898,7 +902,8 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); if (NULL == btls_to_use) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); return OMPI_ERR_UNREACH; } @@ -908,20 +913,24 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o /* rdma and atomics are only supported with BTLs at the moment */ for (int i = 0 ; btls_to_use[i] ; ++i) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "checking for btl %s", btls_to_use[i]); OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { if (NULL != item->btl_module->btl_register_mem) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping RDMA btl when searching for alternate BTL"); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "skipping RDMA btl when searching for alternate BTL"); continue; } if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping btl %s", - item->btl_module->btl_component->btl_version.mca_component_name); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "skipping btl %s", + item->btl_module->btl_component->btl_version.mca_component_name); continue; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", btls_to_use[i]); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "found alternate btl %s", btls_to_use[i]); ++btls_found; if (module) { @@ -1089,7 +1098,8 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi } if (NULL == selected_btl) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no suitable btls found"); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "accelerated_query: no suitable btls found"); return OMPI_ERR_NOT_AVAILABLE; } @@ -1100,8 +1110,9 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi module->use_memory_registration = selected_btl->btl_register_mem != NULL; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s", - selected_btl->btl_component->btl_version.mca_component_name); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "accelerated_query: selected btl: %s", + selected_btl->btl_component->btl_version.mca_component_name); return OMPI_SUCCESS; } @@ -1150,7 +1161,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module) module->region_size, MPI_BYTE, module->local_leaders, module->local_leaders->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "leader allgather failed with ompi error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "leader allgather failed with ompi error code %d", ret); break; } } @@ -1193,7 +1205,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) /* create a shared communicator to handle communication about the local segment */ ret = ompi_comm_split_type (module->comm, MPI_COMM_TYPE_SHARED, 0, NULL, &module->shared_comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create a shared memory communicator. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create a shared memory communicator. error code %d", ret); return ret; } @@ -1204,7 +1217,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) ret = ompi_comm_split (module->comm, (0 == local_rank) ? 0 : MPI_UNDEFINED, comm_rank, &module->local_leaders, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create local leaders communicator. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create local leaders communicator. error code %d", ret); return ret; } @@ -1217,7 +1231,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) ret = module->shared_comm->c_coll->coll_bcast (values, 2, MPI_INT, 0, module->shared_comm, module->shared_comm->c_coll->coll_bcast_module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to broadcast local data. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to broadcast local data. error code %d", ret); return ret; } } @@ -1350,8 +1365,9 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, return ret; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "creating osc/rdma window of flavor %d with id %d", - flavor, ompi_comm_get_cid(module->comm)); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "creating osc/rdma window of flavor %d with id %d", + flavor, ompi_comm_get_cid(module->comm)); /* peer data */ if (world_size > init_limit) { @@ -1372,11 +1388,13 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* find rdma capable endpoints */ ret = ompi_osc_rdma_query_accelerated_btls (module->comm, module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "could not find a suitable btl. falling back on " - "active-message BTLs"); + opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output, + "could not find an accelerated btl. falling back on " + "active-message BTLs"); ret = ompi_osc_rdma_query_alternate_btls (module->comm, module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "no BTL available for RMA window"); + opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output, + "no BTL available for RMA window"); ompi_osc_rdma_free (win); return ret; } @@ -1428,7 +1446,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* notify all others if something went wrong */ ret = synchronize_errorcode(ret, module->comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to allocate internal state"); ompi_osc_rdma_free (win); return ret; } @@ -1479,14 +1498,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, ret = ompi_osc_rdma_share_data (module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to share window data with peers"); ompi_osc_rdma_free (win); } else { /* for now the leader is always rank 0 in the communicator */ module->leader = ompi_osc_rdma_module_peer (module, 0); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %d", - ompi_comm_get_cid(module->comm)); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "finished creating osc/rdma window with id %d", + ompi_comm_get_cid(module->comm)); } return ret; From 2a155e9e74b3b8394eb778b88d218e13ebeb617c Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 4 Jan 2022 17:06:54 +0000 Subject: [PATCH 03/17] osc/rdma: Load all btls as alternate btls See lengthy comment in the change, but this patch removes the ability of users to specify a subset of available btls for use by the osc rdma component. The BTL interface was never designed for such usage (which is why there is no similar option for the OB1 PML) and it had clear places where it broke, so remove it. Signed-off-by: Brian Barrett (cherry picked from commit 61aca9e2de7d35c9016f1dc2cf1750c7ab2b559f) --- ompi/mca/osc/rdma/osc_rdma_component.c | 128 ++++++++++++++----------- 1 file changed, 70 insertions(+), 58 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 3d4931e272b..3560c51e05c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -35,6 +35,7 @@ #include "ompi_config.h" #include +#include #include "osc_rdma.h" #include "osc_rdma_frag.h" @@ -84,7 +85,6 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, const char *value); static char *ompi_osc_rdma_full_connectivity_btls; -static char *ompi_osc_rdma_btl_alternate_names; static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = { {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"}, @@ -257,14 +257,6 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_full_connectivity_btls); free(description_str); - ompi_osc_rdma_btl_alternate_names = "sm,tcp"; - opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying " - "connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names); - free(description_str); - if (0 == access ("/dev/shm", W_OK)) { mca_osc_rdma_component.backing_directory = "/dev/shm"; } else { @@ -875,77 +867,97 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) free(procs); } + +/* + * qsort() sorting function for ompi_osc_rdma_query_alternate_btls(), + * using latency as the sorting metric. + */ +static int btl_latency_sort_fn(const void *a, const void *b) +{ + const struct mca_btl_base_module_t *btl_a = a; + const struct mca_btl_base_module_t *btl_b = b; + + if (btl_a->btl_latency < btl_b->btl_latency) { + return -1; + } else if (btl_a->btl_latency == btl_b->btl_latency) { + return 0; + } else { + return 1; + } +} + + /** * @brief query for alternate BTLs * * @in comm Communicator to query - * @out module OSC module to store BTLs/count to (optional) - * @out + * @inout module OSC module to store BTLs/count to (optional) * * @return OMPI_SUCCESS if BTLs can be found * @return OMPI_ERR_UNREACH if no BTLs can be found that match * - * In this case an "alternate" BTL is a BTL does not meet the - * requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls(). - * Either it does not provide connectivity to all peers, provide - * remote completion, or natively support put/get/atomic.. Since more - * than one BTL may be needed for this support the OSC component will - * disable the use of registration-based RDMA (these BTLs will not be - * used) and will use any remaining BTL. By default the BTLs used will - * be tcp and sm but any single (or pair) of BTLs may be used. + * We directly use the active message rdma wrappers for alternate + * BTLs, in all cases. This greatly simplifies the alternate BTL + * impementation, at the expense of some performance. With the + * AM wrappers, we can always enforce remote completion and the lack + * of memory registration, at some performance cost. But we can use + * as many BTLs as we like. The module's btl list is sorted by + * latency, so that ompi_osc_rdma_peer_btl_endpoint() picks the lowest + * available latency btl to communicate with the peer. Unlike the OB1 + * PML, we only use one BTL per peer. + * + * Like the OB1 PML, there is no verification that there is at least + * one BTL that can communicate with every other peer in the window. */ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { mca_btl_base_selected_module_t *item; - char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); - int btls_found = 0; - - btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); - if (NULL == btls_to_use) { - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); - return OMPI_ERR_UNREACH; - } + int ret; - if (module) { - module->btls_in_use = 0; + /* shortcut the trivial query case */ + if (NULL == module) { + if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { + return OMPI_ERR_UNREACH; + } + return OMPI_SUCCESS; } - /* rdma and atomics are only supported with BTLs at the moment */ - for (int i = 0 ; btls_to_use[i] ; ++i) { + module->btls_in_use = 0; + + /* add all alternate btls to the selected_btls list, not worrying + about ordering yet. We have to add all btls unless we want to + iterate over all endpoints to build the minimum set of btls + needed to communicate with all peers. An MCA parameter just + for osc rdma also wouldn't work, as the BML can decide not to + add an endpoint for a btl given the priority of another btl. + For example, it is not uncommon that the only endpoint created + to a peer on the same host is the sm btl's endpoint. If we + had an osc rdma specific parameter list, and the user + specified a combination not including sm, that would result in + an eventual failure, as no btl would be found to talk to ranks + on the same host.*/ + OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "checking for btl %s", btls_to_use[i]); - OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { - if (NULL != item->btl_module->btl_register_mem) { - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "skipping RDMA btl when searching for alternate BTL"); - continue; - } - - if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "skipping btl %s", - item->btl_module->btl_component->btl_version.mca_component_name); - continue; - } - - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "found alternate btl %s", btls_to_use[i]); - - ++btls_found; - if (module) { - mca_btl_base_am_rdma_init(item->btl_module); - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); - } - + "found alternate btl %s", + item->btl_module->btl_component->btl_version.mca_component_name); + ret = mca_btl_base_am_rdma_init(item->btl_module); + if (OMPI_SUCCESS != ret) { + return ret; } + ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); } - opal_argv_free (btls_to_use); + /* sort based on latency, lowest first */ + qsort(module->selected_btls, module->btls_in_use, + sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); - return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; + /* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not require explicit memory registration */ + module->use_memory_registration = false; + + return module->btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; } + /* Check for BTL requirements: * 1) RDMA (put/get) and ATOMIC operations. We only require cswap * and fetch and add and will emulate other opterations with those From 336aab4e717f7dd745ce35cd70cc9ec1c51c95b1 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 10 Jan 2022 22:51:03 +0000 Subject: [PATCH 04/17] osc/rdma: Simplify query logic With the change to load all btls in the alternate case and the removal of the logic to change priority based on alternate or accelerated btls, there's no point in running the full btl query in osc_rdma_component_query(). Instead, just verify that there is some number of BTLs available, which is the extent of testing in the alternate case. Signed-off-by: Brian Barrett (cherry picked from commit e93f041aa291d19c2c482023ae88ffc149c01395) --- ompi/mca/osc/rdma/osc_rdma_component.c | 42 +++++++++----------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 3560c51e05c..51e13b9d283 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -387,15 +387,14 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s } #endif /* OPAL_CUDA_SUPPORT */ - if (OMPI_SUCCESS == ompi_osc_rdma_query_accelerated_btls (comm, NULL)) { - return mca_osc_rdma_component.priority; - } - - if (OMPI_SUCCESS == ompi_osc_rdma_query_alternate_btls (comm, NULL)) { - return mca_osc_rdma_component.priority; + /* verify if we have any btls available. Since we do not verify + * connectivity across all btls in the alternate case, this is as + * good a test as we are going to have for success. */ + if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { + return -1; } - return OMPI_ERROR; + return OMPI_SUCCESS;; } static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void **base, size_t size) { @@ -914,13 +913,7 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o mca_btl_base_selected_module_t *item; int ret; - /* shortcut the trivial query case */ - if (NULL == module) { - if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { - return OMPI_ERR_UNREACH; - } - return OMPI_SUCCESS; - } + assert(NULL != module); module->btls_in_use = 0; @@ -988,9 +981,6 @@ static bool ompi_osc_rdma_check_accelerated_btl(struct mca_btl_base_module_t *bt * Testing (1) is expensive, so as an optimization, the * ompi_osc_rdma_full_connectivity_btls list contains the list of BTL * components we know can achieve (1) in almost all usage scenarios. - * - * If module is NULL, the code acts as a query mechanism to find any - * potential BTLs, and is used to implement osc_rdma_query(). */ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { @@ -999,11 +989,11 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi mca_bml_base_endpoint_t *base_endpoint; char **btls_to_use; - if (module) { - ompi_osc_rdma_selected_btl_insert(module, NULL, 0); - module->btls_in_use = 0; - module->use_memory_registration = false; - } + assert(NULL != module); + + ompi_osc_rdma_selected_btl_insert(module, NULL, 0); + module->btls_in_use = 0; + module->use_memory_registration = false; /* Check for BTLs in the list of BTLs we know can reach all peers in general usage. */ @@ -1116,11 +1106,9 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi } btl_selection_complete: - if (module) { - ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); - module->btls_in_use = 1; - module->use_memory_registration = selected_btl->btl_register_mem != NULL; - } + ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); + module->btls_in_use = 1; + module->use_memory_registration = selected_btl->btl_register_mem != NULL; opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, "accelerated_query: selected btl: %s", From a160510b9b8e65a2f99bd2a4be7c3062181ae4bc Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 4 Jan 2022 17:07:26 +0000 Subject: [PATCH 05/17] osc/rdma: Refactor btl wrapper code Refactor wrapper calls around the btl atomic code into their own header, removing them from the lock interface header. Clean up some resulting header dependency changes. Signed-off-by: Brian Barrett (cherry picked from commit 0394543786a55e90354829d80a5ba6eafa2c61b7) --- ompi/mca/osc/rdma/Makefile.am | 4 + ompi/mca/osc/rdma/osc_rdma_accumulate.c | 4 + ompi/mca/osc/rdma/osc_rdma_active_target.c | 27 --- ompi/mca/osc/rdma/osc_rdma_btl_comm.c | 61 ++++++ ompi/mca/osc/rdma/osc_rdma_btl_comm.h | 218 +++++++++++++++++++++ ompi/mca/osc/rdma/osc_rdma_comm.c | 3 + ompi/mca/osc/rdma/osc_rdma_comm.h | 3 +- ompi/mca/osc/rdma/osc_rdma_lock.h | 197 +------------------ 8 files changed, 295 insertions(+), 222 deletions(-) create mode 100644 ompi/mca/osc/rdma/osc_rdma_btl_comm.c create mode 100644 ompi/mca/osc/rdma/osc_rdma_btl_comm.h diff --git a/ompi/mca/osc/rdma/Makefile.am b/ompi/mca/osc/rdma/Makefile.am index e52d0087743..4757ce6aa93 100644 --- a/ompi/mca/osc/rdma/Makefile.am +++ b/ompi/mca/osc/rdma/Makefile.am @@ -11,6 +11,8 @@ # Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. +# All Rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -21,6 +23,8 @@ rdma_sources = \ osc_rdma.h \ osc_rdma_module.c \ + osc_rdma_btl_comm.h \ + osc_rdma_btl_comm.c \ osc_rdma_comm.h \ osc_rdma_comm.c \ osc_rdma_accumulate.c \ diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index ab0b21e539a..faa71692250 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -10,6 +10,8 @@ * Copyright (c) 2019-2021 Google, LLC. All rights reserved. * Copyright (c) 2021 IBM Corporation. All rights reserved. * Copyright (c) 2022 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -20,6 +22,8 @@ #include "osc_rdma_accumulate.h" #include "osc_rdma_request.h" #include "osc_rdma_comm.h" +#include "osc_rdma_lock.h" +#include "osc_rdma_btl_comm.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" diff --git a/ompi/mca/osc/rdma/osc_rdma_active_target.c b/ompi/mca/osc/rdma/osc_rdma_active_target.c index fdd3dd5c832..3706a098f77 100644 --- a/ompi/mca/osc/rdma/osc_rdma_active_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c @@ -77,33 +77,6 @@ OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t, ompi_osc_rdma_pending_op_construct, ompi_osc_rdma_pending_op_destruct); -/** - * Dummy completion function for atomic operations - */ -void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status) -{ - ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status); - - if (pending_op->op_result) { - memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); - } - - if (NULL != pending_op->cbfunc) { - pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status); - } - - if (NULL != pending_op->op_frag) { - ompi_osc_rdma_frag_complete (pending_op->op_frag); - pending_op->op_frag = NULL; - } - - pending_op->op_complete = true; - OBJ_RELEASE(pending_op); -} /** * compare_ranks: diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.c b/ompi/mca/osc/rdma/osc_rdma_btl_comm.c new file mode 100644 index 00000000000..cbc8a761593 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.c @@ -0,0 +1,61 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2017-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "osc_rdma.h" +#include "osc_rdma_frag.h" +#include "osc_rdma_btl_comm.h" + +#include "opal/mca/btl/base/base.h" + +void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status); + + if (pending_op->op_result) { + memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); + } + + if (NULL != pending_op->cbfunc) { + pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status); + } + + if (NULL != pending_op->op_frag) { + ompi_osc_rdma_frag_complete (pending_op->op_frag); + pending_op->op_frag = NULL; + } + + pending_op->op_complete = true; + OBJ_RELEASE(pending_op); +} diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h new file mode 100644 index 00000000000..3f175a20fb6 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2019 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSC_RDMA_BTL_COMM_H +#define OSC_RDMA_BTL_COMM_H + +#include "osc_rdma_frag.h" + +#include "opal/mca/btl/btl.h" + + +void ompi_osc_rdma_atomic_complete(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status); + + +static inline int +ompi_osc_rdma_btl_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, int op, + int64_t operand, int flags, int64_t *result, const bool wait_for_completion, + ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) +{ + ompi_osc_rdma_pending_op_t *pending_op; + mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + int ret = OPAL_ERROR; + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + + if (!wait_for_completion) { + /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ + pending_op->module = module; + (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); + } + + pending_op->op_result = (void *) result; + pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + OBJ_RETAIN(pending_op); + if (cbfunc) { + pending_op->cbfunc = cbfunc; + pending_op->cbdata = cbdata; + pending_op->cbcontext = cbcontext; + } + + /* spin until the btl has accepted the operation */ + do { + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); + } + + if (NULL != pending_op->op_frag) { + ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer, + (intptr_t) address, pending_op->op_frag->handle, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); + } + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + if (OPAL_LIKELY(1 == ret)) { + *result = ((int64_t *) pending_op->op_buffer)[0]; + ret = OMPI_SUCCESS; + ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer, + pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS); + } else { + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + } + } else if (wait_for_completion) { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + + +static inline int +ompi_osc_rdma_btl_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, + int op, int64_t operand, int flags, const bool wait_for_completion, + ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) +{ + ompi_osc_rdma_pending_op_t *pending_op; + mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + int ret; + + if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, + NULL, wait_for_completion, cbfunc, cbdata, cbcontext); + } + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + OBJ_RETAIN(pending_op); + if (cbfunc) { + pending_op->cbfunc = cbfunc; + pending_op->cbdata = cbdata; + pending_op->cbcontext = cbcontext; + } + + if (!wait_for_completion) { + /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ + pending_op->module = module; + (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); + } + + /* spin until the btl has accepted the operation */ + do { + ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + if (OPAL_LIKELY(1 == ret)) { + if (cbfunc) { + cbfunc (cbdata, cbcontext, OMPI_SUCCESS); + } + ret = OMPI_SUCCESS; + } + } else if (wait_for_completion) { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + + +static inline int +ompi_osc_rdma_btl_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, + int64_t compare, int64_t value, int flags, int64_t *result) +{ + ompi_osc_rdma_pending_op_t *pending_op; + mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + int ret; + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + + OBJ_RETAIN(pending_op); + + pending_op->op_result = (void *) result; + pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + + /* spin until the btl has accepted the operation */ + do { + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); + } + if (NULL != pending_op->op_frag) { + ret = selected_btl->btl_atomic_cswap (selected_btl, endpoint, pending_op->op_buffer, + address, pending_op->op_frag->handle, address_handle, compare, + value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, + NULL); + } + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + if (OPAL_LIKELY(1 == ret)) { + *result = ((int64_t *) pending_op->op_buffer)[0]; + ret = OMPI_SUCCESS; + } + + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + } else { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + +#endif /* OSC_RDMA_BTL_COMM_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 449bbea0641..0a7bf3285b2 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -6,6 +6,8 @@ * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -14,6 +16,7 @@ */ #include "osc_rdma_comm.h" +#include "osc_rdma_frag.h" #include "osc_rdma_sync.h" #include "osc_rdma_request.h" #include "osc_rdma_dynamic.h" diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index efb305a571e..7b722c4fe69 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -4,6 +4,8 @@ * reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,7 +19,6 @@ #include "osc_rdma_dynamic.h" #include "osc_rdma_request.h" #include "osc_rdma_sync.h" -#include "osc_rdma_lock.h" #define OMPI_OSC_RDMA_DECODE_MAX 64 diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 36a30a1cc0b..19a3249bdbe 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -5,6 +5,8 @@ * Copyright (c) 2019 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,6 +19,7 @@ #include "osc_rdma_types.h" #include "osc_rdma_frag.h" +#include "osc_rdma_btl_comm.h" static inline int ompi_osc_rdma_trylock_local (ompi_osc_rdma_atomic_lock_t *lock) { @@ -29,82 +32,6 @@ static inline void ompi_osc_rdma_unlock_local (ompi_osc_rdma_atomic_lock_t *lock (void) ompi_osc_rdma_lock_add (lock, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE); } -/** - * Dummy completion function for atomic operations - */ -void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status); - -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, int op, - int64_t operand, int flags, int64_t *result, const bool wait_for_completion, - ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret = OPAL_ERROR; - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - - if (!wait_for_completion) { - /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ - pending_op->module = module; - (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); - } - - pending_op->op_result = (void *) result; - pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; - OBJ_RETAIN(pending_op); - if (cbfunc) { - pending_op->cbfunc = cbfunc; - pending_op->cbdata = cbdata; - pending_op->cbcontext = cbcontext; - } - - /* spin until the btl has accepted the operation */ - do { - if (NULL == pending_op->op_frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); - } - - if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer, - (intptr_t) address, pending_op->op_frag->handle, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); - } - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - if (OPAL_LIKELY(1 == ret)) { - *result = ((int64_t *) pending_op->op_buffer)[0]; - ret = OMPI_SUCCESS; - ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer, - pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS); - } else { - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - } - } else if (wait_for_completion) { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result, @@ -114,69 +41,6 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om operand, 0, result, wait_for_completion, NULL, NULL, NULL); } -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, - int op, int64_t operand, int flags, const bool wait_for_completion, - ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret; - - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, - NULL, wait_for_completion, cbfunc, cbdata, cbcontext); - } - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - OBJ_RETAIN(pending_op); - if (cbfunc) { - pending_op->cbfunc = cbfunc; - pending_op->cbdata = cbdata; - pending_op->cbcontext = cbcontext; - } - - if (!wait_for_completion) { - /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ - pending_op->module = module; - (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); - } - - /* spin until the btl has accepted the operation */ - do { - ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - if (OPAL_LIKELY(1 == ret)) { - if (cbfunc) { - cbfunc (cbdata, cbcontext, OMPI_SUCCESS); - } - ret = OMPI_SUCCESS; - } - } else if (wait_for_completion) { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion) @@ -185,61 +49,6 @@ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, omp operand, 0, wait_for_completion, NULL, NULL, NULL); } -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, - int64_t compare, int64_t value, int flags, int64_t *result) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret; - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - - OBJ_RETAIN(pending_op); - - pending_op->op_result = (void *) result; - pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; - - /* spin until the btl has accepted the operation */ - do { - if (NULL == pending_op->op_frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); - } - if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_cswap (selected_btl, endpoint, pending_op->op_buffer, - address, pending_op->op_frag->handle, address_handle, compare, - value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, - NULL); - } - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - if (OPAL_LIKELY(1 == ret)) { - *result = ((int64_t *) pending_op->op_buffer)[0]; - ret = OMPI_SUCCESS; - } - - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - } else { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t value, ompi_osc_rdma_lock_t *result) From 86c31efa28ed4c45df31b63d427d11e35306678e Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 4 Jan 2022 18:25:44 +0000 Subject: [PATCH 06/17] osc/rdma: Wrap calls to BTL RDMA functiions Add wrappers to call all BTL RDMA functions. This commit just adds another layer of indirection to the communication calls. However, a follow-on patch will add logic to either call the BTL RDMA functions directly (for accelerated mode) or call the BTL AM RDMA compatibility layer for alternate mode. Signed-off-by: Brian Barrett (cherry picked from commit 6a15883e372242205a1434a0a35b2c931dc3933d) --- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 7 +- ompi/mca/osc/rdma/osc_rdma_btl_comm.h | 105 ++++++++++++++++++++++-- ompi/mca/osc/rdma/osc_rdma_comm.c | 22 ++--- 3 files changed, 111 insertions(+), 23 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index faa71692250..404546f2f08 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -762,9 +762,10 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put..."); do { - ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address, - local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL); + ret = ompi_osc_rdma_btl_put(module, peer->data_btl_index, peer->data_endpoint, + ptr, target_address, local_handle, target_handle, + len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL); if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { break; } diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h index 3f175a20fb6..a666d77710b 100644 --- a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h @@ -26,6 +26,94 @@ void ompi_osc_rdma_atomic_complete(mca_btl_base_module_t *btl, struct mca_btl_ba void *context, void *data, int status); +static inline int +ompi_osc_rdma_btl_put(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_put(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_get(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_get(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_atomic_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_atomic_op(btl, endpoint, remote_address, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_atomic_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) + +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_atomic_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + uint64_t compare, uint64_t value, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); +} + + static inline int ompi_osc_rdma_btl_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, struct mca_btl_base_endpoint_t *endpoint, uint64_t address, @@ -62,10 +150,10 @@ ompi_osc_rdma_btl_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, } if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer, - (intptr_t) address, pending_op->op_frag->handle, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); + ret = ompi_osc_rdma_btl_atomic_fop(module, btl_index, endpoint, pending_op->op_buffer, + (intptr_t) address, pending_op->op_frag->handle, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); } if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { @@ -129,9 +217,9 @@ ompi_osc_rdma_btl_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, /* spin until the btl has accepted the operation */ do { - ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); + ret = ompi_osc_rdma_btl_atomic_op(module, btl_index, endpoint, (intptr_t) address, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { break; @@ -167,7 +255,6 @@ ompi_osc_rdma_btl_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, int64_t compare, int64_t value, int flags, int64_t *result) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); @@ -184,7 +271,7 @@ ompi_osc_rdma_btl_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); } if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_cswap (selected_btl, endpoint, pending_op->op_buffer, + ret = ompi_osc_rdma_btl_atomic_cswap(module, btl_index, endpoint, pending_op->op_buffer, address, pending_op->op_frag->handle, address_handle, compare, value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL); diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 0a7bf3285b2..ea8665c465f 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -20,6 +20,7 @@ #include "osc_rdma_sync.h" #include "osc_rdma_request.h" #include "osc_rdma_dynamic.h" +#include "osc_rdma_btl_comm.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" #include "opal/align.h" @@ -99,9 +100,9 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde assert (!(source_address & ALIGNMENT_MASK(btl->btl_get_alignment))); do { - ret = btl->btl_get (btl, endpoint, ptr, aligned_addr, - local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, - ompi_osc_get_data_complete, (void *) &read_complete, NULL); + ret = ompi_osc_rdma_btl_get(module, btl_index, endpoint, ptr, aligned_addr, + local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, + ompi_osc_get_data_complete, (void *) &read_complete, NULL); if (!ompi_osc_rdma_oor (ret)) { break; } @@ -447,7 +448,6 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee mca_btl_base_registration_handle_t *local_handle, size_t size, mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync " @@ -457,9 +457,9 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee ompi_osc_rdma_sync_rdma_inc (sync); do { - ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address, - local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER, - cb, context, cbdata); + ret = ompi_osc_rdma_btl_put(module, peer->data_btl_index, peer->data_endpoint, + ptr, target_address, local_handle, target_handle, + size, 0, MCA_BTL_NO_ORDER, cb, context, cbdata); if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } @@ -706,10 +706,10 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } do { - ret = btl->btl_get (btl, peer->data_endpoint, ptr, - aligned_source_base, local_handle, source_handle, - aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete, - request, frag); + ret = ompi_osc_rdma_btl_get(module, peer->data_btl_index, peer->data_endpoint, + ptr, aligned_source_base, local_handle, source_handle, + aligned_len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_get_complete, request, frag); if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } From 9d3d8339bc52b6efb9016d98e3e4777adbdb6c69 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 10 Jan 2022 23:58:48 +0000 Subject: [PATCH 07/17] osc/rdma: Split btl storage based on mode Split the storage of btls based on whether the window is using accelerated or alternate btls. This makes it more obvious when the code has made assumptions about mode that may not be true (such as the memory registration calls throughout the code that assumed selected_btl[0] was the one true BTL). Signed-off-by: Brian Barrett (cherry picked from commit 75e36e5e4dbfe0f211f1321224e9a1df92caf970) --- ompi/mca/osc/rdma/osc_rdma.h | 63 ++++++++++--------- ompi/mca/osc/rdma/osc_rdma_component.c | 74 +++++++++++++--------- ompi/mca/osc/rdma/osc_rdma_dynamic.c | 3 +- ompi/mca/osc/rdma/osc_rdma_module.c | 4 +- ompi/mca/osc/rdma/osc_rdma_peer.c | 85 ++++++++++++++++++-------- 5 files changed, 144 insertions(+), 85 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 2a8aeae156d..c052ce11a23 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -57,8 +57,6 @@ #define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count) -#define MCA_OSC_RDMA_BTLS_SIZE_INIT 4 - enum { OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, OMPI_OSC_RDMA_LOCKING_ON_DEMAND, @@ -260,14 +258,23 @@ struct ompi_osc_rdma_module_t { /** lock for peer hash table/array */ opal_mutex_t peer_lock; - - /** BTL(s) in use. Currently this is only used to support RDMA emulation over - * non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this - * could be used to support multiple RDMA-capable BTLs but the memory registration - * paths will need to be updated to pack/unpack multiple registration handles. */ - struct mca_btl_base_module_t **selected_btls; - uint8_t selected_btls_size; - uint8_t btls_in_use; + /* we currently support two modes of operation, a single + * accelerated btl (which can use memory registration and can use + * btl_flush() and one or more alternate btls, which cannot use + * flush() or rely on memory registration. Since it is an + * either/or situation, we use a union to simplify the code. + */ + bool use_accelerated_btl; + + union { + struct { + struct mca_btl_base_module_t *accelerated_btl; + }; + struct { + struct mca_btl_base_module_t **alternate_btls; + uint8_t alternate_btl_count; + }; + }; /** Only true if one BTL is in use. Memory registration is only supported when * using a single BTL. */ @@ -383,10 +390,11 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file) { if (module->use_memory_registration) { + assert(module->use_accelerated_btl); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)", ptr, (void*)((char *) ptr + size), size); - *handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags); + *handle = module->accelerated_btl->btl_register_mem(module->accelerated_btl, endpoint, ptr, size, flags); if (OPAL_UNLIKELY(NULL == *handle)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, " "size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line); @@ -404,7 +412,9 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file) { if (handle) { - module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle); + assert(module->use_memory_registration); + assert(module->use_accelerated_btl); + module->accelerated_btl->btl_deregister_mem(module->accelerated_btl, handle); } } @@ -536,10 +546,11 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module) { #if defined(BTL_VERSION) && (BTL_VERSION >= 310) - return !!(module->selected_btls[0]->btl_flush); -#else - return false; + if (module->use_accelerated_btl) { + return (NULL != module->accelerated_btl->btl_flush); + } #endif + return false; } /** @@ -601,13 +612,13 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync) opal_progress (); } while (ompi_osc_rdma_sync_get_count (sync)); #else - mca_btl_base_module_t *btl_module = sync->module->selected_btls[0]; - do { if (!ompi_osc_rdma_use_btl_flush (sync->module)) { opal_progress (); } else { - btl_module->btl_flush (btl_module, NULL); + assert(sync->module->use_accelerated_btl); + mca_btl_base_module_t *btl_module = sync->module->accelerated_btl; + btl_module->btl_flush(btl_module, NULL); } } while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1))); #endif @@ -637,17 +648,13 @@ static inline bool ompi_osc_rdma_oor (int rc) __opal_attribute_always_inline__ static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) { - return module->selected_btls[btl_index]; -} - -__opal_attribute_always_inline__ -static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) { - if(btl_index == module->selected_btls_size) { - module->selected_btls_size *= 2; - module->selected_btls = realloc(module->selected_btls, module->selected_btls_size * sizeof(struct mca_btl_base_module_t *)); - assert(NULL != module->selected_btls); + if (module->use_accelerated_btl) { + assert(0 == btl_index); + return module->accelerated_btl; + } else { + assert(btl_index < module->alternate_btl_count); + return module->alternate_btls[btl_index]; } - module->selected_btls[btl_index] = btl; } #endif /* OMPI_OSC_RDMA_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 51e13b9d283..38d0c41d834 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -410,6 +410,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void region->len = size; if (module->use_memory_registration && size) { + assert(module->use_accelerated_btl); if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) { ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY, &module->base_handle); @@ -417,9 +418,9 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void return OMPI_ERR_OUT_OF_RESOURCE; } - memcpy (region->btl_handle_data, module->base_handle, module->selected_btls[0]->btl_registration_handle_size); + memcpy (region->btl_handle_data, module->base_handle, module->accelerated_btl->btl_registration_handle_size); } else { - memcpy (region->btl_handle_data, module->state_handle, module->selected_btls[0]->btl_registration_handle_size); + memcpy (region->btl_handle_data, module->state_handle, module->accelerated_btl->btl_registration_handle_size); } } @@ -580,8 +581,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s module->use_cpu_atomics = module->single_node; if (!module->single_node) { - for (int i = 0 ; i < module->btls_in_use ; ++i) { - module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + if (module->use_accelerated_btl) { + module->use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + } else { + for (int i = 0 ; i < module->alternate_btl_count ; ++i) { + module->use_cpu_atomics &= !!(module->alternate_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + } } } @@ -703,14 +708,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (0 == local_rank) { /* unlink the shared memory backing file */ opal_shmem_unlink (&module->seg_ds); - /* just go ahead and register the whole segment */ - ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, - MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle); - if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { - state_region->base = (intptr_t) module->segment_base; - if (module->state_handle) { - memcpy (state_region->btl_handle_data, module->state_handle, - module->selected_btls[0]->btl_registration_handle_size); + if (module->use_accelerated_btl) { + /* just go ahead and register the whole segment */ + ret = ompi_osc_rdma_register(module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, + MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle); + if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { + state_region->base = (intptr_t) module->segment_base; + if (module->state_handle) { + memcpy(state_region->btl_handle_data, module->state_handle, + module->accelerated_btl->btl_registration_handle_size); + } } } } @@ -730,8 +737,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s region->base = state_region->base + my_base_offset; region->len = size; if (module->use_memory_registration) { - memcpy (region->btl_handle_data, state_region->btl_handle_data, - module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy(region->btl_handle_data, state_region->btl_handle_data, + module->accelerated_btl->btl_registration_handle_size); } } @@ -910,12 +918,23 @@ static int btl_latency_sort_fn(const void *a, const void *b) */ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { + size_t btl_count; + size_t index = 0; mca_btl_base_selected_module_t *item; int ret; assert(NULL != module); - module->btls_in_use = 0; + btl_count = opal_list_get_size(&mca_btl_base_modules_initialized); + if (btl_count > UINT8_MAX) { + return OMPI_ERROR; + } + + module->alternate_btl_count = btl_count; + module->alternate_btls = malloc(sizeof(struct mca_btl_base_module_t *) * btl_count); + if (NULL == module->alternate_btls) { + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } /* add all alternate btls to the selected_btls list, not worrying about ordering yet. We have to add all btls unless we want to @@ -937,17 +956,17 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o if (OMPI_SUCCESS != ret) { return ret; } - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); + module->alternate_btls[index++] = item->btl_module; } + assert(index == btl_count); /* sort based on latency, lowest first */ - qsort(module->selected_btls, module->btls_in_use, + qsort(module->alternate_btls, module->alternate_btl_count, sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); - /* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not require explicit memory registration */ module->use_memory_registration = false; - return module->btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; + return OMPI_SUCCESS; } @@ -991,8 +1010,7 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi assert(NULL != module); - ompi_osc_rdma_selected_btl_insert(module, NULL, 0); - module->btls_in_use = 0; + module->use_accelerated_btl = false; module->use_memory_registration = false; /* Check for BTLs in the list of BTLs we know can reach all peers @@ -1106,8 +1124,8 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi } btl_selection_complete: - ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); - module->btls_in_use = 1; + module->use_accelerated_btl = true; + module->accelerated_btl = selected_btl; module->use_memory_registration = selected_btl->btl_register_mem != NULL; opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, @@ -1152,7 +1170,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module) my_data->len = (osc_rdma_size_t) my_rank; if (module->use_memory_registration && module->state_handle) { - memcpy (my_data->btl_handle_data, module->state_handle, module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy (my_data->btl_handle_data, module->state_handle, module->accelerated_btl->btl_registration_handle_size); } /* gather state data at each node leader */ @@ -1326,9 +1345,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->acc_use_amo = mca_osc_rdma_component.acc_use_amo; module->network_amo_max_count = mca_osc_rdma_component.network_amo_max_count; - module->selected_btls_size = MCA_OSC_RDMA_BTLS_SIZE_INIT; - module->selected_btls = calloc(module->selected_btls_size, sizeof(struct mca_btl_base_module_t *)); - module->all_sync.module = module; module->flavor = flavor; @@ -1386,6 +1402,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } /* find rdma capable endpoints */ + module->use_accelerated_btl = false; ret = ompi_osc_rdma_query_accelerated_btls (module->comm, module); if (OMPI_SUCCESS != ret) { opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output, @@ -1404,7 +1421,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->region_size = sizeof (ompi_osc_rdma_region_t); if (module->use_memory_registration) { - module->region_size += module->selected_btls[0]->btl_registration_handle_size; + assert(module->use_accelerated_btl); + module->region_size += module->accelerated_btl->btl_registration_handle_size; } module->state_size = sizeof (ompi_osc_rdma_state_t); diff --git a/ompi/mca/osc/rdma/osc_rdma_dynamic.c b/ompi/mca/osc/rdma/osc_rdma_dynamic.c index 8adfa7f8159..61e14fea56c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_dynamic.c +++ b/ompi/mca/osc/rdma/osc_rdma_dynamic.c @@ -252,7 +252,8 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len) return OMPI_ERR_RMA_ATTACH; } - memcpy (region->btl_handle_data, handle, module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy(region->btl_handle_data, handle, module->accelerated_btl->btl_registration_handle_size); rdma_region_handle->btl_handle = handle; } else { rdma_region_handle->btl_handle = NULL; diff --git a/ompi/mca/osc/rdma/osc_rdma_module.c b/ompi/mca/osc/rdma/osc_rdma_module.c index d04e418a9e2..5247c604dd0 100644 --- a/ompi/mca/osc/rdma/osc_rdma_module.c +++ b/ompi/mca/osc/rdma/osc_rdma_module.c @@ -144,7 +144,9 @@ int ompi_osc_rdma_free(ompi_win_t *win) free (module->outstanding_lock_array); mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module, module->free_after); - free (module->selected_btls); + if (!module->use_accelerated_btl) { + free(module->alternate_btls); + } free (module); return OMPI_SUCCESS; diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index c6689d78812..6286fedeb69 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -40,43 +40,74 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul struct mca_btl_base_endpoint_t **endpoint) { ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, peer_id); - mca_bml_base_endpoint_t *bml_endpoint; - int num_btls; - - /* for now just use the bml to get the btl endpoint */ - bml_endpoint = mca_bml_base_get_endpoint (proc); - - num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma); - - for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { - for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { - if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; - *endpoint = bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint; - return OMPI_SUCCESS; - } + mca_bml_base_endpoint_t *bml_endpoint = mca_bml_base_get_endpoint(proc); + + if (module->use_accelerated_btl) { + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d: accelerated btl search for peer %d", + ompi_comm_rank(module->comm), peer_id); + mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, + module->accelerated_btl); + if (NULL != bml_btl) { + *btl_index_out = 0; + *endpoint = bml_btl->btl_endpoint; + + return OMPI_SUCCESS; } - } + } else { + mca_bml_base_btl_t *bml_btl; + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d: alternate btl search for peer %d", + ompi_comm_rank(module->comm), peer_id); + + /* the non accelerated case is a bit difficult compared to the + * accelerated case. The right BTL could be in either the + * rdma or eager endpoint list, because we're using the am + * rdma interface to provide RDMA semantics. The important + * part is that we search the alternate_btls list in order, + * since it is sorted by latency. + */ + for (int osc_btl_idx = 0 ; osc_btl_idx < module->alternate_btl_count ; ++osc_btl_idx) { + mca_btl_base_module_t *search_btl = ompi_osc_rdma_selected_btl(module, osc_btl_idx); + const char *source = NULL; + + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d comparing with btl %s, %d", + ompi_comm_rank(module->comm), + search_btl->btl_component->btl_version.mca_component_name, + osc_btl_idx); + + source = "rdma"; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, search_btl); + if (NULL == bml_btl) { + source = "eager"; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, search_btl); + } + if (NULL != bml_btl) { + *btl_index_out = osc_btl_idx; + *endpoint = bml_btl->btl_endpoint; - /* if this is a non-RDMA btl then the endpoint may be listed under eager */ - num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d found btl for peer %d (%s, %d, %s)", + ompi_comm_rank(module->comm), peer_id, + bml_btl->btl->btl_component->btl_version.mca_component_name, + osc_btl_idx, source); - for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { - for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { - if (bml_endpoint->btl_eager.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; - *endpoint = bml_endpoint->btl_eager.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; } } } + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "rank %d: failed peer search for peer %d", + ompi_comm_rank(module->comm), peer_id); + /* unlikely but can happen when creating a peer for self */ return OMPI_ERR_UNREACH; } int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t *endpoint = NULL; ompi_osc_rdma_peer_t *peer; uint8_t module_btl_index = UINT8_MAX; @@ -84,8 +115,7 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, /* find a btl/endpoint to use for this peer */ int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &module_btl_index, &endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret && !((module->selected_btls[0]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) && - peer_id == ompi_comm_rank (module->comm)))) { + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } @@ -134,7 +164,8 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "configuring peer for rank %d", peer->rank); if (module->use_memory_registration) { - registration_handle_size = module->selected_btls[0]->btl_registration_handle_size; + assert(module->use_accelerated_btl); + registration_handle_size = module->accelerated_btl->btl_registration_handle_size; } /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code From 6020c87ede6674fdda3af47abc189d7412989a80 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 20:09:36 +0000 Subject: [PATCH 08/17] osc/rdma: Remove duplicate op emulation check ompi_osc_rdma_btl_op() already includes a check to emulate a remote op with a remote fetch-and-op, so there's no need for a second check in ompi_osc_rdma_acc_single_atomic(). Signed-off-by: Brian Barrett (cherry picked from commit 222b7f3f23202597ff925173f6140c1260e720c2) --- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 404546f2f08..120ffbf42d3 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -244,12 +244,6 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo int btl_op, flags; int64_t origin; - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - /* btl put atomics not supported or disabled. fall back on fetch-and-op */ - return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, - op, req); - } - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { From 8593ed37eb3310be31a7d01d956df234c217fd65 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 20:25:41 +0000 Subject: [PATCH 09/17] osc/rdma: Remove usage of custom min() function Use the opal_min() function instead of a custom macro. Signed-off-by: Brian Barrett (cherry picked from commit 6e2af1887ed899eabb60f23a6b8fa31666cc47fc) --- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 7 +++++-- ompi/mca/osc/rdma/osc_rdma_comm.c | 7 +++++-- ompi/mca/osc/rdma/osc_rdma_comm.h | 1 - 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 120ffbf42d3..41d204ed059 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -19,12 +19,15 @@ * $HEADER$ */ +#include "ompi_config.h" + #include "osc_rdma_accumulate.h" #include "osc_rdma_request.h" #include "osc_rdma_comm.h" #include "osc_rdma_lock.h" #include "osc_rdma_btl_comm.h" +#include "opal/util/minmax.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" @@ -583,9 +586,9 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v /* determine how much to put in this operation */ if (source_count) { - acc_len = min(min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit); + acc_len = opal_min(opal_min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit); } else { - acc_len = min(target_iovec[target_iov_index].iov_len, acc_limit); + acc_len = opal_min(target_iovec[target_iov_index].iov_len, acc_limit); } if (0 != acc_len) { diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index ea8665c465f..17448892870 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -15,6 +15,8 @@ * $HEADER$ */ +#include "ompi_config.h" + #include "osc_rdma_comm.h" #include "osc_rdma_frag.h" #include "osc_rdma_sync.h" @@ -22,8 +24,9 @@ #include "osc_rdma_dynamic.h" #include "osc_rdma_btl_comm.h" -#include "ompi/mca/osc/base/osc_base_obj_convert.h" #include "opal/align.h" +#include "opal/util/minmax.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" /* helper functions */ static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, bool dec_always, ompi_osc_rdma_frag_t *frag, @@ -246,7 +249,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc assert (0 != local_iov_count); /* determine how much to transfer in this operation */ - rdma_len = min(min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len); + rdma_len = opal_min(opal_min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len); /* execute the get */ if (!subreq && alloc_reqs) { diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index 7b722c4fe69..82c1e873263 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -22,7 +22,6 @@ #define OMPI_OSC_RDMA_DECODE_MAX 64 -#define min(a,b) ((a) < (b) ? (a) : (b)) #define ALIGNMENT_MASK(x) ((x) ? (x) - 1 : 0) /** From 6f92663dd76ad5f12d7cb5f57755d48947afe408 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 20:39:01 +0000 Subject: [PATCH 10/17] osc/rdma: Clean up use_cpu_atomics behaivor The use_cpu_atomics member of the rdma module was only used during initialization, so there was no reason for it to be a module member. Also clean up the initialization logic for the field (and therefore, whether or not the shared state optimization is used). Previously, it appeared to be possible to use with alternate btls across multiple nodes (ie, we tested the GLOB value on the btls), but the use_cpu_atomics field was initialized to false in the multi-node case and so they would never actually be enabled. Make that more obvious, rather than try to enable the optimization, to reduce the risk of introducing new datapath bugs. Signed-off-by: Brian Barrett (cherry picked from commit 94fed3b47278b5b540058f88a62931d3ac3f7d3b) --- ompi/mca/osc/rdma/osc_rdma.h | 3 -- ompi/mca/osc/rdma/osc_rdma_component.c | 40 +++++++++++++++----------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index c052ce11a23..c70dacd4b0d 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -148,9 +148,6 @@ struct ompi_osc_rdma_module_t { /** value of same_size info key for this window */ bool same_size; - /** CPU atomics can be used */ - bool use_cpu_atomics; - /** passive-target synchronization will not be used in this window */ bool no_locks; diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 38d0c41d834..dc7e727a930 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -427,7 +427,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void return OMPI_SUCCESS; } -static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size) +static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size, bool use_cpu_atomics) { size_t total_size, local_rank_array_size, leader_peer_data_size, base_data_size; ompi_osc_rdma_peer_t *my_peer; @@ -507,7 +507,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; my_peer->state = (uint64_t) (uintptr_t) module->state; - if (module->use_cpu_atomics) { + if (use_cpu_atomics) { /* all peers are local or it is safe to mix cpu and nic atomics */ my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; } else { @@ -526,7 +526,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = size; } - if (!module->use_cpu_atomics) { + if (!use_cpu_atomics) { if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { /* base is local and cpu atomics are available */ ex_peer->super.base_handle = module->state_handle; @@ -570,6 +570,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s struct _local_data *temp; char *data_file; size_t memory_alignment = module->memory_alignment; + bool use_cpu_atomics; shared_comm = module->shared_comm; @@ -578,21 +579,26 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */ module->single_node = local_size == global_size; - module->use_cpu_atomics = module->single_node; - if (!module->single_node) { - if (module->use_accelerated_btl) { - module->use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); - } else { - for (int i = 0 ; i < module->alternate_btl_count ; ++i) { - module->use_cpu_atomics &= !!(module->alternate_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); - } - } + if (module->single_node) { + use_cpu_atomics = true; + } else if (module->use_accelerated_btl) { + use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + } else { + /* using the shared state optimization that is enabled by + * being able to use cpu atomics was never enabled for + * alternate btls, due to a previous bug in the enablement + * logic when alternate btls were first supported. It is + * likely that this optimization could work with sufficient + * testing, but for now, always disable to not introduce new + * correctness risks. + */ + use_cpu_atomics = false; } if (1 == local_size) { /* no point using a shared segment if there are no other processes on this node */ - return allocate_state_single (module, base, size); + return allocate_state_single (module, base, size, use_cpu_atomics); } opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, @@ -771,7 +777,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; /* set up peer state */ - if (module->use_cpu_atomics) { + if (use_cpu_atomics) { /* all peers are local or it is safe to mix cpu and nic atomics */ peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; peer->state = (osc_rdma_counter_t) peer_state; @@ -796,7 +802,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor && MPI_WIN_FLAVOR_CREATE != module->flavor && - !module->use_cpu_atomics && temp[i].size && i > 0) { + !use_cpu_atomics && temp[i].size && i > 0) { /* use the local leader's endpoint */ peer->data_endpoint = local_leader->data_endpoint; peer->data_btl_index = local_leader->data_btl_index; @@ -805,7 +811,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ompi_osc_module_add_peer (module, peer); if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) { - if (module->use_cpu_atomics && peer_rank == my_rank) { + if (use_cpu_atomics && peer_rank == my_rank) { peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; } /* nothing more to do */ @@ -821,7 +827,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = temp[i].size; } - if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { + if (use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { /* base is local and cpu atomics are available */ if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { ex_peer->super.base = (uintptr_t) module->segment_base + offset; From a1180cc7eac6a1bdc3001772a4f999de6caecebd Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 00:06:11 +0000 Subject: [PATCH 11/17] osc/rdma: Use BTL am-rdma explicit interface Switch from using the implicit BTL interface (where the am-rdma interface just extends missing functionality in the BTL) to the new explicit interface (where the OSC RDMA interface is the only maintainer of the BTL list. With this change, alternate BTLs do not have to support REMOTE_COMPLETION to be selected (because the AM RDMA interface always provides remote completion when we request it, as this patch does). Any BTL that supports Active Messages (ie, all of them) should be able to support the OSC RDMA required semantics, eliminating the problem of creating windows with no servicable BTLs. Signed-off-by: Brian Barrett (cherry picked from commit 4080d5d29ff4e9ee225f7b4937db89bac6aa1f8f) --- ompi/mca/osc/rdma/osc_rdma.h | 29 +++- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 40 +++--- ompi/mca/osc/rdma/osc_rdma_btl_comm.h | 77 ++++++++--- ompi/mca/osc/rdma/osc_rdma_comm.c | 174 ++++++++++++------------ ompi/mca/osc/rdma/osc_rdma_component.c | 61 +++++++-- ompi/mca/osc/rdma/osc_rdma_module.c | 5 +- 6 files changed, 239 insertions(+), 147 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index c70dacd4b0d..94b6cb641c8 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -44,6 +44,7 @@ #include "ompi/mca/osc/osc.h" #include "ompi/mca/osc/base/base.h" #include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/btl_base_am_rdma.h" #include "ompi/memchecker.h" #include "ompi/op/op.h" #include "opal/align.h" @@ -255,6 +256,8 @@ struct ompi_osc_rdma_module_t { /** lock for peer hash table/array */ opal_mutex_t peer_lock; + /* ******************* communication *********************** */ + /* we currently support two modes of operation, a single * accelerated btl (which can use memory registration and can use * btl_flush() and one or more alternate btls, which cannot use @@ -265,18 +268,27 @@ struct ompi_osc_rdma_module_t { union { struct { - struct mca_btl_base_module_t *accelerated_btl; + mca_btl_base_module_t *accelerated_btl; }; struct { - struct mca_btl_base_module_t **alternate_btls; + mca_btl_base_am_rdma_module_t **alternate_am_rdmas; uint8_t alternate_btl_count; }; }; - /** Only true if one BTL is in use. Memory registration is only supported when - * using a single BTL. */ + /** Does the selected BTL require memory registration? This field + will be false when alternate BTLs are used, and the value + when an accelerated BTL is used depends on the registration + requirements of the underlying BTL. */ bool use_memory_registration; + size_t put_alignment; + size_t get_alignment; + size_t put_limit; + size_t get_limit; + + uint32_t atomic_flags; + /** registered fragment used for locally buffered RDMA transfers */ struct ompi_osc_rdma_frag_t *rdma_frag; @@ -650,8 +662,15 @@ static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_m return module->accelerated_btl; } else { assert(btl_index < module->alternate_btl_count); - return module->alternate_btls[btl_index]; + return module->alternate_am_rdmas[btl_index]->btl; } } + +static inline mca_btl_base_am_rdma_module_t *ompi_osc_rdma_selected_am_rdma(ompi_osc_rdma_module_t *module, uint8_t btl_index) { + assert(!module->use_accelerated_btl); + assert(btl_index < module->alternate_btl_count); + return module->alternate_am_rdmas[btl_index]; +} + #endif /* OMPI_OSC_RDMA_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 41d204ed059..0cec49a1d80 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -164,13 +164,11 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || - (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { return OMPI_ERR_NOT_SUPPORTED; } @@ -242,13 +240,11 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || - (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { return OMPI_ERR_NOT_SUPPORTED; } @@ -663,13 +659,11 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = btl->btl_atomic_flags; const size_t size = datatype->super.size; int64_t compare, source; int flags, ret; - if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags))) { + if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags))) { return OMPI_ERR_NOT_SUPPORTED; } @@ -717,7 +711,6 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); unsigned long len = datatype->super.size; mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -742,18 +735,21 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, return OMPI_SUCCESS; } - if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) { - do { - ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { - break; - } + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + if (len > btl->btl_put_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc(module, len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { + break; + } - ompi_osc_rdma_progress (module); - } while (1); + ompi_osc_rdma_progress (module); + } while (1); - memcpy (ptr, source_addr, len); - local_handle = frag->handle; + memcpy(ptr, source_addr, len); + local_handle = frag->handle; + } } OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put..."); diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h index a666d77710b..718036d0a76 100644 --- a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h @@ -36,11 +36,17 @@ ompi_osc_rdma_btl_put(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_put(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, size, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_put(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_put(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -54,11 +60,18 @@ ompi_osc_rdma_btl_get(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - return btl->btl_get(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, size, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_get(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_get(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -71,6 +84,9 @@ ompi_osc_rdma_btl_atomic_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, { mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + /* the AM BTL interface does not currently support op calls */ + assert(module->use_accelerated_btl); + return btl->btl_atomic_op(btl, endpoint, remote_address, remote_handle, op, operand, flags, order, cbfunc, cbcontext, cbdata); @@ -87,12 +103,19 @@ ompi_osc_rdma_btl_atomic_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, - op, operand, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_atomic_fop(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -105,12 +128,19 @@ ompi_osc_rdma_btl_atomic_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index uint64_t compare, uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, - compare, value, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_atomic_cswap(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -195,7 +225,10 @@ ompi_osc_rdma_btl_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + /* if using the AM RDMA interface with alternate BTLs or if the + accelerated BTL does not support atomic ops, emulate the atomic + op over a fetch and atomic op */ + if (!module->use_accelerated_btl || !(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, NULL, wait_for_completion, cbfunc, cbdata, cbcontext); } diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 17448892870..62f35d28d64 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -65,8 +65,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index); - const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; volatile bool read_complete = false; @@ -82,25 +81,28 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde "), len: %lu (aligned: %lu)", (void *) endpoint, source_address, aligned_addr, (unsigned long) len, (unsigned long) aligned_len); - if (btl->btl_register_mem && len >= btl->btl_get_local_registration_threshold) { - do { - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { - ompi_osc_rdma_progress (module); + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + if (len >= btl->btl_get_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc(module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { + ompi_osc_rdma_progress(module); + } + } while (OMPI_ERR_OUT_OF_RESOURCE == ret); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); + return ret; } - } while (OMPI_ERR_OUT_OF_RESOURCE == ret); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); - return ret; + local_handle = frag->handle; + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr, + (void *) frag); } - - local_handle = frag->handle; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr, - (void *) frag); } - assert (!(source_address & ALIGNMENT_MASK(btl->btl_get_alignment))); + assert (!(source_address & ALIGNMENT_MASK(module->get_alignment))); do { ret = ompi_osc_rdma_btl_get(module, btl_index, endpoint, ptr, aligned_addr, @@ -487,7 +489,6 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *local_handle = NULL; mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -495,16 +496,19 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * void *cbcontext; int ret; - if (btl->btl_register_mem && size > btl->btl_put_local_registration_threshold) { - ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ret = ompi_osc_rdma_register (module, peer->data_endpoint, source_buffer, size, 0, &local_handle); + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, peer->data_btl_index); + if (size > btl->btl_put_local_registration_threshold) { + ret = ompi_osc_rdma_frag_alloc(module, size, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; + ret = ompi_osc_rdma_register(module, peer->data_endpoint, source_buffer, size, 0, &local_handle); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } else { + memcpy(ptr, source_buffer, size); + local_handle = frag->handle; } - } else { - memcpy (ptr, source_buffer, size); - local_handle = frag->handle; } } @@ -606,8 +610,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; osc_rdma_size_t aligned_len; @@ -623,70 +626,73 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p", size, source_address, target_buffer); - if ((btl->btl_register_mem && size > btl->btl_get_local_registration_threshold) || - (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { - /* region is too large for a buffered read */ - size_t subsize; + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, peer->data_btl_index); + if (size > btl->btl_get_local_registration_threshold || + (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { - /* remote region has the same alignment but the base is not aligned. perform a small - * buffered get of the beginning of the remote region */ - aligned_source_base = OPAL_ALIGN(source_address, btl->btl_get_alignment, osc_rdma_base_t); - subsize = (size_t) (aligned_source_base - source_address); - - ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; + ret = ompi_osc_rdma_frag_alloc(module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { + /* region is too large for a buffered read */ + size_t subsize; + + if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { + /* remote region has the same alignment but the base is not aligned. perform a small + * buffered get of the beginning of the remote region */ + aligned_source_base = OPAL_ALIGN(source_address, btl->btl_get_alignment, osc_rdma_base_t); + subsize = (size_t) (aligned_source_base - source_address); + + ret = ompi_osc_rdma_get_partial(sync, peer, source_address, source_handle, target_buffer, subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + source_address += subsize; + target_buffer = (void *) ((intptr_t) target_buffer + subsize); + size -= subsize; + + aligned_len = aligned_source_bound - aligned_source_base; } - source_address += subsize; - target_buffer = (void *) ((intptr_t) target_buffer + subsize); - size -= subsize; + if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && + (size & btl_alignment_mask)) { + /* remote region bases are aligned but the bounds are not. perform a + * small buffered get of the end of the remote region */ + aligned_len = size & ~btl_alignment_mask; + subsize = size - aligned_len; + size = aligned_len; + ret = ompi_osc_rdma_get_partial(sync, peer, source_address + aligned_len, source_handle, + (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + /* (remaining) user request is now correctly aligned */ + } - aligned_len = aligned_source_bound - aligned_source_base; + if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + /* local and remote alignments differ */ + request->buffer = ptr = malloc(aligned_len); + } else { + ptr = target_buffer; } - if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && - (size & btl_alignment_mask)) { - /* remote region bases are aligned but the bounds are not. perform a - * small buffered get of the end of the remote region */ - aligned_len = size & ~btl_alignment_mask; - subsize = size - aligned_len; - size = aligned_len; - ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle, - (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } + if (NULL != ptr) { + (void)ompi_osc_rdma_register(module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, + &local_handle); } - /* (remaining) user request is now correctly aligned */ - } - if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - /* local and remote alignments differ */ - request->buffer = ptr = malloc (aligned_len); + if (OPAL_UNLIKELY(NULL == local_handle)) { + free(request->buffer); + request->buffer = NULL; + return ret; + } } else { - ptr = target_buffer; - } - - if (NULL != ptr) { - (void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, - &local_handle); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", + (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); + local_handle = frag->handle; } - - if (OPAL_UNLIKELY(NULL == local_handle)) { - free (request->buffer); - request->buffer = NULL; - return ret; - } - } else { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", - (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); - local_handle = frag->handle; } } @@ -742,7 +748,6 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; int ret; @@ -777,7 +782,7 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi return ompi_osc_rdma_master (sync, (void *) origin_addr, origin_count, origin_datatype, peer, target_address, target_handle, target_count, target_datatype, request, - btl->btl_put_limit, ompi_osc_rdma_put_contig, false); + module->put_limit, ompi_osc_rdma_put_contig, false); } static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, @@ -785,7 +790,6 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *source_handle; uint64_t source_address; ptrdiff_t source_span, source_lb; @@ -818,7 +822,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori return ompi_osc_rdma_master (sync, origin_addr, origin_count, origin_datatype, peer, source_address, source_handle, source_count, source_datatype, request, - btl->btl_get_limit, ompi_osc_rdma_get_contig, true); + module->get_limit, ompi_osc_rdma_get_contig, true); } int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, ptrdiff_t target_disp, int target_count, diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index dc7e727a930..0e3be91f997 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -51,6 +51,7 @@ #include "opal/util/argv.h" #include "opal/util/printf.h" #include "opal/util/sys_limits.h" +#include "opal/util/minmax.h" #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ @@ -887,12 +888,14 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) */ static int btl_latency_sort_fn(const void *a, const void *b) { - const struct mca_btl_base_module_t *btl_a = a; - const struct mca_btl_base_module_t *btl_b = b; + const mca_btl_base_am_rdma_module_t * const *am_rdma_a_p = a; + const mca_btl_base_am_rdma_module_t * const *am_rdma_b_p = b; + const mca_btl_base_am_rdma_module_t *am_rdma_a = *am_rdma_a_p; + const mca_btl_base_am_rdma_module_t *am_rdma_b = *am_rdma_b_p; - if (btl_a->btl_latency < btl_b->btl_latency) { + if (am_rdma_a->btl->btl_latency < am_rdma_b->btl->btl_latency) { return -1; - } else if (btl_a->btl_latency == btl_b->btl_latency) { + } else if (am_rdma_a->btl->btl_latency == am_rdma_b->btl->btl_latency) { return 0; } else { return 1; @@ -931,14 +934,19 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o assert(NULL != module); + module->put_alignment = 1; + module->get_alignment = 1; + module->put_limit = SIZE_MAX; + module->get_limit = SIZE_MAX; + btl_count = opal_list_get_size(&mca_btl_base_modules_initialized); if (btl_count > UINT8_MAX) { return OMPI_ERROR; } module->alternate_btl_count = btl_count; - module->alternate_btls = malloc(sizeof(struct mca_btl_base_module_t *) * btl_count); - if (NULL == module->alternate_btls) { + module->alternate_am_rdmas = malloc(sizeof(struct mca_btl_base_am_rdma_module_t *) * module->alternate_btl_count); + if (NULL == module->alternate_am_rdmas) { return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } @@ -958,19 +966,43 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, "found alternate btl %s", item->btl_module->btl_component->btl_version.mca_component_name); - ret = mca_btl_base_am_rdma_init(item->btl_module); + + ret = opal_btl_base_am_rdma_create(item->btl_module, + MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION, + true /* no_memory_registration */, + &(module->alternate_am_rdmas[index])); if (OMPI_SUCCESS != ret) { return ret; } - module->alternate_btls[index++] = item->btl_module; + + module->put_alignment = opal_max(module->put_alignment, + module->alternate_am_rdmas[index]->am_btl_put_alignment); + module->get_alignment = opal_max(module->get_alignment, + module->alternate_am_rdmas[index]->am_btl_get_alignment); + module->put_limit = opal_min(module->put_limit, + module->alternate_am_rdmas[index]->am_btl_put_limit); + module->get_limit = opal_min(module->get_limit, + module->alternate_am_rdmas[index]->am_btl_get_limit); + + index++; } - assert(index == btl_count); + assert(index == module->alternate_btl_count); /* sort based on latency, lowest first */ - qsort(module->alternate_btls, module->alternate_btl_count, - sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); + qsort(module->alternate_am_rdmas, module->alternate_btl_count, + sizeof(module->alternate_am_rdmas[0]), btl_latency_sort_fn); module->use_memory_registration = false; + module->atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | + MCA_BTL_ATOMIC_SUPPORTS_AND | + MCA_BTL_ATOMIC_SUPPORTS_OR | + MCA_BTL_ATOMIC_SUPPORTS_XOR | + MCA_BTL_ATOMIC_SUPPORTS_SWAP | + MCA_BTL_ATOMIC_SUPPORTS_MIN | + MCA_BTL_ATOMIC_SUPPORTS_MAX | + MCA_BTL_ATOMIC_SUPPORTS_32BIT | + MCA_BTL_ATOMIC_SUPPORTS_CSWAP | + MCA_BTL_ATOMIC_SUPPORTS_GLOB; return OMPI_SUCCESS; } @@ -1132,7 +1164,12 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi btl_selection_complete: module->use_accelerated_btl = true; module->accelerated_btl = selected_btl; - module->use_memory_registration = selected_btl->btl_register_mem != NULL; + module->use_memory_registration = (selected_btl->btl_register_mem != NULL); + module->put_alignment = selected_btl->btl_put_alignment; + module->get_alignment = selected_btl->btl_get_alignment; + module->put_limit = selected_btl->btl_put_limit; + module->get_limit = selected_btl->btl_get_limit; + module->atomic_flags = selected_btl->btl_atomic_flags; opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, "accelerated_query: selected btl: %s", diff --git a/ompi/mca/osc/rdma/osc_rdma_module.c b/ompi/mca/osc/rdma/osc_rdma_module.c index 5247c604dd0..64ad248b206 100644 --- a/ompi/mca/osc/rdma/osc_rdma_module.c +++ b/ompi/mca/osc/rdma/osc_rdma_module.c @@ -145,7 +145,10 @@ int ompi_osc_rdma_free(ompi_win_t *win) mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module, module->free_after); if (!module->use_accelerated_btl) { - free(module->alternate_btls); + for (int i = 0 ; i < module->alternate_btl_count ; ++i) { + OBJ_RELEASE(module->alternate_am_rdmas[i]); + } + free(module->alternate_am_rdmas); } free (module); From 2eaa8b67e88de3d70997d2c5ddee1239f18933f6 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 8 Feb 2022 16:27:06 +0000 Subject: [PATCH 12/17] osc/rdma: Lower priority to 20 The RDMA OSC component is the "general" component, similar to OB1. The OSC subsystem has gone through some priority inflation over the years, so try to clean that up. This patch lowers the RDMA OSC component priority from 101 to 20, leaving the priorities as: sm 100 ucx 60 (if API version > 1.5.0) portals 20 rdma 20 Signed-off-by: Brian Barrett (cherry picked from commit 5246b669c4debe424606fc6890fd6e37a6a2b96c) --- ompi/mca/osc/rdma/osc_rdma_component.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 0e3be91f997..6f4fcabe693 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -231,7 +231,7 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach); free(description_str); - mca_osc_rdma_component.priority = 101; + mca_osc_rdma_component.priority = 20; opal_asprintf(&description_str, "Priority of the osc/rdma component (default: %d)", mca_osc_rdma_component.priority); (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority", description_str, From 38dc8beeb1cf04fc5294de57ae4819479a802367 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 9 Feb 2022 18:04:08 +0000 Subject: [PATCH 13/17] osc/rdma: Fix double assignment typo Fix an assignment of request onto itself, which was a harmless typo, but was responsible for Coverity issues CID 1429865 and CID 1429864. Signed-off-by: Brian Barrett (cherry picked from commit ad9fae9b171da54f984275b5b9781dd4c3212501) --- ompi/mca/osc/rdma/osc_rdma_comm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 62f35d28d64..569cf669cae 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -398,7 +398,7 @@ static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struc /* the lowest bit is used as a flag indicating this put operation has a request */ if ((intptr_t) context & 0x1) { - ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); + ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); sync = request->sync; if (0 == OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1)) { @@ -429,7 +429,7 @@ static void ompi_osc_rdma_put_complete_flush (struct mca_btl_base_module_t *btl, /* the lowest bit is used as a flag indicating this put operation has a request */ if ((intptr_t) context & 0x1) { - ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); + ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); module = request->module; if (0 == OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1)) { From 84041c685954f81b1ef3a5935d27945e7844a075 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 8 Feb 2022 04:01:20 +0000 Subject: [PATCH 14/17] osc/sm: Add MCA parameter to set priority For some reason, the SM OSC component was one of the few components which did not have an MCA parameter to set priority. This patch fixes that. Signed-off-by: Brian Barrett (cherry picked from commit 11127441c35fcb1dd09da418986806c89589573d) --- ompi/mca/osc/sm/osc_sm.h | 3 +++ ompi/mca/osc/sm/osc_sm_component.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/ompi/mca/osc/sm/osc_sm.h b/ompi/mca/osc/sm/osc_sm.h index 4ea903d03e2..b97329bfc63 100644 --- a/ompi/mca/osc/sm/osc_sm.h +++ b/ompi/mca/osc/sm/osc_sm.h @@ -53,6 +53,9 @@ typedef struct ompi_osc_sm_node_state_t ompi_osc_sm_node_state_t; struct ompi_osc_sm_component_t { ompi_osc_base_component_t super; + /** Priority of the osc/sm component */ + unsigned int priority; + char *backing_directory; }; typedef struct ompi_osc_sm_component_t ompi_osc_sm_component_t; diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c index c05a6a2d554..c78dcca3f30 100644 --- a/ompi/mca/osc/sm/osc_sm_component.c +++ b/ompi/mca/osc/sm/osc_sm_component.c @@ -115,6 +115,8 @@ ompi_osc_sm_module_t ompi_osc_sm_module_template = { static int component_register (void) { + char *description_str; + if (0 == access ("/dev/shm", W_OK)) { mca_osc_sm_component.backing_directory = "/dev/shm"; } else { @@ -128,6 +130,16 @@ static int component_register (void) MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_READONLY, &mca_osc_sm_component.backing_directory); + mca_osc_sm_component.priority = 100; + opal_asprintf(&description_str, "Priority of the osc/sm component (default: %d)", + mca_osc_sm_component.priority); + (void)mca_base_component_var_register(&mca_osc_sm_component.super.osc_version, + "priority", description_str, + MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, + OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, + &mca_osc_sm_component.priority); + free(description_str); + return OPAL_SUCCESS; } @@ -183,7 +195,7 @@ component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, return ret; } - return 100; + return mca_osc_sm_component.priority; } From 89af45c00cd5e67b7a75036fa12adf81826cc0ec Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 8 Feb 2022 00:49:20 +0000 Subject: [PATCH 15/17] osc: Fix WIN_SHARED initialization Fix WIN_SHARED initialization and error reporting by pushing the actual generation of MPI_ERR_RMA_SHARED errors until component selection (from component query). Components that don't do FLAVOR_SHARED should just return -1 for query. SM, since it should always work for FLAVOR_SHARED, will return a priority for component_query() for any FLAVOR_SHARED window, then will generate the MPI_ERR_RMA_SHARED error during component_select. Signed-off-by: Brian Barrett (cherry picked from commit efec44c7e25d72436b170a3e5501e4c4d59f981c) --- ompi/mca/osc/base/osc_base_init.c | 4 -- .../osc/monitoring/osc_monitoring_component.c | 4 -- ompi/mca/osc/rdma/osc_rdma_component.c | 2 +- ompi/mca/osc/sm/osc_sm_component.c | 40 +++++++++---------- 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/ompi/mca/osc/base/osc_base_init.c b/ompi/mca/osc/base/osc_base_init.c index bd949b6d242..41d8cb03afb 100644 --- a/ompi/mca/osc/base/osc_base_init.c +++ b/ompi/mca/osc/base/osc_base_init.c @@ -57,10 +57,6 @@ ompi_osc_base_select(ompi_win_t *win, priority = component->osc_query(win, base, size, disp_unit, comm, win->super.s_info, flavor); if (priority < 0) { - if (MPI_WIN_FLAVOR_SHARED == flavor && OMPI_ERR_RMA_SHARED == priority) { - /* NTH: quick fix to return OMPI_ERR_RMA_SHARED */ - return OMPI_ERR_RMA_SHARED; - } continue; } diff --git a/ompi/mca/osc/monitoring/osc_monitoring_component.c b/ompi/mca/osc/monitoring/osc_monitoring_component.c index f91cd8f1160..26b49ea865b 100644 --- a/ompi/mca/osc/monitoring/osc_monitoring_component.c +++ b/ompi/mca/osc/monitoring/osc_monitoring_component.c @@ -90,10 +90,6 @@ static int mca_osc_monitoring_component_select(struct ompi_win_t *win, void **ba priority = component->osc_query(win, base, size, disp_unit, comm, info, flavor); if (priority < 0) { - if (MPI_WIN_FLAVOR_SHARED == flavor && OMPI_ERR_RMA_SHARED == priority) { - /* NTH: quick fix to return OMPI_ERR_RMA_SHARED */ - return OMPI_ERR_RMA_SHARED; - } continue; } diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 6f4fcabe693..02cf8767a71 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -376,7 +376,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s { if (MPI_WIN_FLAVOR_SHARED == flavor) { - return OMPI_ERR_RMA_SHARED; + return -1; } #if OPAL_CUDA_SUPPORT diff --git a/ompi/mca/osc/sm/osc_sm_component.c b/ompi/mca/osc/sm/osc_sm_component.c index c78dcca3f30..06f3aaf477d 100644 --- a/ompi/mca/osc/sm/osc_sm_component.c +++ b/ompi/mca/osc/sm/osc_sm_component.c @@ -166,33 +166,29 @@ component_finalize(void) } -static int -check_win_ok(ompi_communicator_t *comm, int flavor) -{ - if (! (MPI_WIN_FLAVOR_SHARED == flavor - || MPI_WIN_FLAVOR_ALLOCATE == flavor) ) { - return OMPI_ERR_NOT_SUPPORTED; - } - - if (ompi_group_have_remote_peers (comm->c_local_group)) { - return OMPI_ERR_RMA_SHARED; - } - - return OMPI_SUCCESS; -} - - static int component_query(struct ompi_win_t *win, void **base, size_t size, int disp_unit, struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor) { int ret; - if (OMPI_SUCCESS != (ret = check_win_ok(comm, flavor))) { - if (OMPI_ERR_NOT_SUPPORTED == ret) { + + /* component only supports shared or allocate flavors */ + if (! (MPI_WIN_FLAVOR_SHARED == flavor || + MPI_WIN_FLAVOR_ALLOCATE == flavor)) { + return -1; + } + + /* If flavor is win_allocate, we can't run if there are remote + * peers in the group. The same check for flavor_shared happens + * in select(), so that we can return an error to the user (since + * we should be able to run for all flavor_shared use cases. + * There's no way to return an error from component_query to the + * user, hence the delayed check. */ + if (MPI_WIN_FLAVOR_ALLOCATE == flavor) { + if (ompi_group_have_remote_peers(comm->c_local_group)) { return -1; } - return ret; } return mca_osc_sm_component.priority; @@ -210,8 +206,10 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit int ret = OMPI_ERROR; size_t memory_alignment = OPAL_ALIGN_MIN; - if (OMPI_SUCCESS != (ret = check_win_ok(comm, flavor))) { - return ret; + assert(MPI_WIN_FLAVOR_SHARED == flavor || MPI_WIN_FLAVOR_ALLOCATE == flavor); + + if (ompi_group_have_remote_peers(comm->c_local_group)) { + return OMPI_ERR_RMA_SHARED; } /* create module structure */ From 7356be9e082bdd82a56f2cc337371a8ffee8bd98 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 7 Feb 2022 20:09:15 +0000 Subject: [PATCH 16/17] class/cstring: Fix handling of corner cases Fix the bool interpretation, which had a bug that made it impossible to interpret the strings "false" or "no" properly. Fix issues uncovered with handling overflow and underflow of the int type when writing the test. Add a test for the opal_cstring interface. Signed-off-by: Brian Barrett (cherry picked from commit 4f422044c8936f51ba95ea1097c494fb4db51c85) --- .gitignore | 1 + opal/class/opal_cstring.c | 46 ++++--- opal/class/opal_cstring.h | 1 - test/class/Makefile.am | 9 +- test/class/opal_cstring.c | 247 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 285 insertions(+), 19 deletions(-) create mode 100644 test/class/opal_cstring.c diff --git a/.gitignore b/.gitignore index 089d60d9614..63e6fbe444b 100644 --- a/.gitignore +++ b/.gitignore @@ -633,6 +633,7 @@ test/class/ompi_rb_tree test/class/ompi_bitmap test/class/opal_bitmap test/class/opal_fifo +test/class/opal_cstring test/class/opal_hash_table test/class/opal_lifo test/class/opal_list diff --git a/opal/class/opal_cstring.c b/opal/class/opal_cstring.c index c5d230d19b1..674c3a6741b 100644 --- a/opal/class/opal_cstring.c +++ b/opal/class/opal_cstring.c @@ -33,7 +33,7 @@ static void opal_cstring_ctor(opal_cstring_t *obj) { *(size_t *) &(obj->length) = 0; /* make sure the string is null-terminated */ - ((char *) obj->string)[0] = '\n'; + ((char *) obj->string)[0] = '\0'; } static inline size_t opal_cstring_alloc_size(size_t len) @@ -90,10 +90,18 @@ int opal_cstring_to_int(opal_cstring_t *string, int *interp) if (*endp != '\0') { return OPAL_ERR_BAD_PARAM; } - /* underflow */ + /* base errors */ if (tmp == 0 && errno == EINVAL) { return OPAL_ERR_BAD_PARAM; } + /* overflow/underflow of long int (return of strtol) */ + if (errno == ERANGE && (tmp == LONG_MIN || tmp == LONG_MAX)) { + return OPAL_ERR_BAD_PARAM; + } + /* overflow/underflow of int (for cases where long int is larger) */ + if (tmp < INT_MIN || tmp > INT_MAX) { + return OPAL_ERR_BAD_PARAM; + } *interp = (int) tmp; @@ -104,24 +112,28 @@ static int opal_str_to_bool_impl(const char *string, bool *interp) { const char *ptr = string; - /* Trim leading whitespace */ - while (isspace(*ptr)) { - ++ptr; - } + if (NULL != string) { + /* Trim leading whitespace */ + while (isspace(*ptr)) { + ++ptr; + } - if ('\0' != *ptr) { - if (isdigit(*ptr)) { - *interp = (bool) atoi(ptr); - } else if (0 == strncasecmp(ptr, "yes", 3) || 0 == strncasecmp(ptr, "true", 4)) { - *interp = true; - } else if (0 == strncasecmp(ptr, "no", 2) && 0 == strncasecmp(ptr, "false", 5)) { - *interp = false; - } else { - *interp = false; - return OPAL_ERR_BAD_PARAM; + if ('\0' != *ptr) { + if (isdigit(*ptr)) { + *interp = (bool) atoi(ptr); + return OPAL_SUCCESS; + } else if (0 == strncasecmp(ptr, "yes", 3) || 0 == strncasecmp(ptr, "true", 4)) { + *interp = true; + return OPAL_SUCCESS; + } else if (0 == strncasecmp(ptr, "no", 2) || 0 == strncasecmp(ptr, "false", 5)) { + *interp = false; + return OPAL_SUCCESS; + } } } - return OPAL_SUCCESS; + + *interp = false; + return OPAL_ERR_BAD_PARAM; } int opal_cstring_to_bool(opal_cstring_t *string, bool *interp) diff --git a/opal/class/opal_cstring.h b/opal/class/opal_cstring.h index a5def2e075f..c92bac70400 100644 --- a/opal/class/opal_cstring.h +++ b/opal/class/opal_cstring.h @@ -43,7 +43,6 @@ #ifndef OPAL_STRING_H #define OPAL_STRING_H -#include "opal_config.h" #include "opal/class/opal_object.h" #include "opal/mca/base/mca_base_var_enum.h" diff --git a/test/class/Makefile.am b/test/class/Makefile.am index ee37245f909..1e39017584b 100644 --- a/test/class/Makefile.am +++ b/test/class/Makefile.am @@ -35,7 +35,8 @@ check_PROGRAMS = \ opal_value_array \ opal_pointer_array \ opal_lifo \ - opal_fifo + opal_fifo \ + opal_cstring TESTS = $(check_PROGRAMS) @@ -94,6 +95,12 @@ opal_fifo_LDADD = \ $(top_builddir)/test/support/libsupport.a opal_fifo_DEPENDENCIES = $(opal_fifo_LDADD) +opal_cstring_SOURCES = opal_cstring.c +opal_cstring_LDADD = \ + $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la \ + $(top_builddir)/test/support/libsupport.a +opal_cstring_DEPENDENCIES = $(opal_cstring_LDADD) + clean-local: rm -f opal_bitmap_test_out.txt opal_hash_table_test_out.txt opal_proc_table_test_out.txt diff --git a/test/class/opal_cstring.c b/test/class/opal_cstring.c new file mode 100644 index 00000000000..d59c3573f48 --- /dev/null +++ b/test/class/opal_cstring.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include + +#include "opal/class/opal_cstring.h" +#include +#include + +#define CHECK_INT(actual, expected) \ + if (expected != actual) { \ + printf("%s:%d: expected %d, got %d\n", \ + __FILE__, __LINE__, expected, actual); \ + exit(1); \ + } + +#define CHECK_BOOL(actual, expected) \ + if (expected != actual) { \ + printf("%s:%d: expected %s, got %s\n", \ + __FILE__, __LINE__, \ + expected ? "true" : "false", \ + actual ? "true" : "false"); \ + exit(1); \ + } + +int main(int argc, char *argv[]) +{ + int ret, int_val; + bool bool_val; + opal_cstring_t *cstr; + + /* + * bool tests + */ + cstr = opal_cstring_create("true"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_BOOL(bool_val, true); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("yes"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_BOOL(bool_val, true); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("false"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_BOOL(bool_val, false); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("no"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_BOOL(bool_val, false); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("1"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_BOOL(bool_val, true); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("0"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_BOOL(bool_val, false); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("1.0"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_BOOL(bool_val, true); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("0.0"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_BOOL(bool_val, false); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("foobar"); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create(NULL); + ret = opal_cstring_to_bool(cstr, &bool_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + + /* + * bool string tests + */ + bool_val = opal_str_to_bool("true"); + CHECK_BOOL(bool_val, true); + + bool_val = opal_str_to_bool("yes"); + CHECK_BOOL(bool_val, true); + + bool_val = opal_str_to_bool("false"); + CHECK_BOOL(bool_val, false); + + bool_val = opal_str_to_bool("no"); + CHECK_BOOL(bool_val, false); + + bool_val = opal_str_to_bool("1"); + CHECK_BOOL(bool_val, true); + + bool_val = opal_str_to_bool("0"); + CHECK_BOOL(bool_val, false); + + bool_val = opal_str_to_bool("1.0"); + CHECK_BOOL(bool_val, true); + + bool_val = opal_str_to_bool("0.0"); + CHECK_BOOL(bool_val, false); + + bool_val = opal_str_to_bool("foobar"); + CHECK_BOOL(bool_val, false); + + bool_val = opal_str_to_bool(NULL); + CHECK_BOOL(bool_val, false); + + /* + * integer tests + */ + cstr = opal_cstring_create("1"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_INT(int_val, 1); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("0"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_SUCCESS); + CHECK_INT(int_val, 0); + OBJ_RELEASE(cstr); + + /* test intentional overflow cases */ + if (sizeof(long) > sizeof(int)) { + long input = (long)INT_MAX + 1L; + char *input_str; + + ret = asprintf(&input_str, "%ld", input); + if (ret < 0) { + printf("%s:%d: asprintf()", __FILE__, __LINE__); + exit(1); + } + + cstr = opal_cstring_create(input_str); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + free(input_str); + } + + /* yes, this may be redundant from above, but always try both + cases for simplicity */ + if (sizeof(long long) > sizeof(int)) { + long long input = (long long)INT_MAX + 1L; + char *input_str; + + ret = asprintf(&input_str, "%lld", input); + if (ret < 0) { + printf("%s:%d: asprintf()", __FILE__, __LINE__); + exit(1); + } + + cstr = opal_cstring_create(input_str); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + free(input_str); + } + + { + long input = (long)LONG_MAX; + char *input_str; + + ret = asprintf(&input_str, "%ld0", input); + if (ret < 0) { + printf("%s:%d: asprintf()", __FILE__, __LINE__); + exit(1); + } + + cstr = opal_cstring_create(input_str); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + free(input_str); + } + + + cstr = opal_cstring_create("1.0"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("0.0"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("foobar"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create(NULL); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("true"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("yes"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("false"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + cstr = opal_cstring_create("no"); + ret = opal_cstring_to_int(cstr, &int_val); + CHECK_INT(ret, OPAL_ERR_BAD_PARAM); + OBJ_RELEASE(cstr); + + return 0; +} From 3347767a8555e5f6a88e84a8a1b2d0ac508bb8df Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 1 Mar 2022 17:09:25 +0000 Subject: [PATCH 17/17] osc/rdma: Fix priority setting bug In a previous commit, I accidently set the priority to always be 0. Which is wrong, but had no practical impact, given the priorities and availabilities of other OSC components. However, it was wrong and this patch fixes the issue. Signed-off-by: Brian Barrett (cherry picked from commit cf253bc4293b76fdbf1bc180fec8e7edde2d269d) --- ompi/mca/osc/rdma/osc_rdma_component.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 02cf8767a71..c98eb549a2b 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -374,7 +374,6 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor) { - if (MPI_WIN_FLAVOR_SHARED == flavor) { return -1; } @@ -395,7 +394,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s return -1; } - return OMPI_SUCCESS;; + return mca_osc_rdma_component.priority; } static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void **base, size_t size) {