diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index f194eb2a197..2a8aeae156d 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -16,6 +16,8 @@ * Copyright (c) 2019 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2020-2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -108,9 +110,6 @@ struct ompi_osc_rdma_component_t { /** Priority of the osc/rdma component */ unsigned int priority; - /** Priority of the osc/rdma component when using non-RDMA BTLs */ - unsigned int alternate_priority; - /** directory where to place backing files */ char *backing_directory; diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 12937ffc644..42e93287225 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -18,7 +18,8 @@ * Copyright (c) 2015-2017 Intel, Inc. All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020-2021 Google, LLC. All rights reserved. 
@@ -77,14 +78,12 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, size_t size, int disp_unit, struct ompi_communicator_t *comm, struct opal_info_t *info, int flavor, int *model); -static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module); +static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module); static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module); -static int ompi_osc_rdma_query_mtls (void); static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, const char *value); -static char *ompi_osc_rdma_btl_names; -static char *ompi_osc_rdma_mtl_names; +static char *ompi_osc_rdma_full_connectivity_btls; static char *ompi_osc_rdma_btl_alternate_names; static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = { @@ -239,14 +238,6 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.priority); free(description_str); - mca_osc_rdma_component.alternate_priority = 37; - opal_asprintf(&description_str, "Priority of the osc/rdma component when using non-RDMA btls (default: %d)", - mca_osc_rdma_component.alternate_priority); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_priority", description_str, - MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.alternate_priority); - free(description_str); - (void) mca_base_var_enum_create ("osc_rdma_locking_mode", ompi_osc_rdma_locking_modes, &new_enum); mca_osc_rdma_component.locking_mode = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL; @@ -256,14 +247,14 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.locking_mode); 
OBJ_RELEASE(new_enum); - ompi_osc_rdma_btl_names = "ugni,uct"; + ompi_osc_rdma_full_connectivity_btls = "ugni,uct,ofi"; opal_asprintf(&description_str, "Comma-delimited list of BTL component names to allow without verifying " "connectivity. Do not add a BTL to to this list unless it can reach all " "processes in any communicator used with an MPI window (default: %s)", - ompi_osc_rdma_btl_names); + ompi_osc_rdma_full_connectivity_btls); (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "btls", description_str, MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names); + MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_full_connectivity_btls); free(description_str); ompi_osc_rdma_btl_alternate_names = "sm,tcp"; @@ -274,14 +265,6 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names); free(description_str); - ompi_osc_rdma_mtl_names = "psm2"; - opal_asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma " - "osc component (default: %s)", ompi_osc_rdma_mtl_names); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "mtls", description_str, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_mtl_names); - free(description_str); - if (0 == access ("/dev/shm", W_OK)) { mca_osc_rdma_component.backing_directory = "/dev/shm"; } else { @@ -412,19 +395,15 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s } #endif /* OPAL_CUDA_SUPPORT */ - if (OMPI_SUCCESS == ompi_osc_rdma_query_mtls ()) { - return 5; - } - - if (OMPI_SUCCESS == ompi_osc_rdma_query_btls (comm, NULL)) { + if (OMPI_SUCCESS == ompi_osc_rdma_query_accelerated_btls (comm, NULL)) { return mca_osc_rdma_component.priority; } if (OMPI_SUCCESS == ompi_osc_rdma_query_alternate_btls (comm, NULL)) { - return 
mca_osc_rdma_component.alternate_priority; + return mca_osc_rdma_component.priority; } - return mca_osc_rdma_component.priority; + return OMPI_ERROR; } static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void **base, size_t size) { @@ -864,23 +843,6 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s return ret; } -static int ompi_osc_rdma_query_mtls (void) -{ - char **mtls_to_use; - - mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names, ','); - if (mtls_to_use && ompi_mtl_base_selected_component) { - for (int i = 0 ; mtls_to_use[i] ; ++i) { - if (0 == strcmp (mtls_to_use[i], ompi_mtl_base_selected_component->mtl_version.mca_component_name)) { - opal_argv_free(mtls_to_use); - return OMPI_SUCCESS; - } - } - } - opal_argv_free(mtls_to_use); - return -1; -} - /** * @brief ensure that all local procs are added to the bml * * @@ -919,12 +881,14 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) * @return OMPI_SUCCESS if BTLs can be found * @return OMPI_ERR_UNREACH if no BTLs can be found that match * - * In this case an "alternate" BTL is a BTL that does not provide true RDMA but - * can use active messages using the BTL base AM RDMA/atomics. Since more than - * one BTL may be needed for this support the OSC component will disable the - * use of registration-based RDMA (these BTLs will not be used) and will use - * any remaining BTL. By default the BTLs used will be tcp and sm but any single - * (or pair) of BTLs may be used. + * In this case an "alternate" BTL is a BTL that does not meet the + * requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls(). + * Either it does not provide connectivity to all peers, provide + * remote completion, or natively support put/get/atomic. Since more + * than one BTL may be needed for this support the OSC component will + * disable the use of registration-based RDMA (these BTLs will not be + * used) and will use any remaining BTL.
By default the BTLs used will + * be tcp and sm but any single (or pair) of BTLs may be used. */ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { @@ -973,20 +937,46 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; } -static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) +/* Check for BTL requirements: + * 1) RDMA (put/get) and ATOMIC operations. We only require cswap + * and fetch and add and will emulate other operations with those + * two as necessary. + * 2) Remote Completion + */ +static bool ompi_osc_rdma_check_accelerated_btl(struct mca_btl_base_module_t *btl) +{ + return ((btl->btl_flags & MCA_BTL_FLAGS_RDMA) && + (btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_FOPS) && + (btl->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION) && + (btl->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_ADD)); +} + +/* + * Attempt to find a BTL that can be used for native RDMA + * + * Attempt to find an "accelerated" BTL that can be used directly, as + * opposed to emulating rdma semantics with the alternate BTLs. To be + * an accelerated BTL, four conditions must be true: + * + * 1) The BTL must be able to communicate with all peers in the + * Window + * 2) The BTL must provide remote completion + * 3) The BTL must be able to register the entire target window + * 4) The BTL must natively support put/get/atomic operations + * + * Testing (1) is expensive, so as an optimization, the + * ompi_osc_rdma_full_connectivity_btls list contains the list of BTL + * components we know can achieve (1) in almost all usage scenarios. + * + * If module is NULL, the code acts as a query mechanism to find any + * potential BTLs, and is used to implement osc_rdma_query().
+ */ +static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { - struct mca_btl_base_module_t **possible_btls = NULL; int comm_size = ompi_comm_size (comm); - int comm_rank = ompi_comm_rank (comm); - int rc = OMPI_SUCCESS, max_btls = 0; - unsigned int selected_latency = INT_MAX; - struct mca_btl_base_module_t *selected_btl = NULL; - mca_btl_base_selected_module_t *item; - int *btl_counts = NULL; + struct mca_btl_base_module_t *selected_btl; + mca_bml_base_endpoint_t *base_endpoint; char **btls_to_use; - void *tmp; - - btls_to_use = opal_argv_split (ompi_osc_rdma_btl_names, ','); if (module) { ompi_osc_rdma_selected_btl_insert(module, NULL, 0); @@ -994,7 +984,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo module->use_memory_registration = false; } + /* Check for BTLs in the list of BTLs we know can reach all peers + in general usage. */ + btls_to_use = opal_argv_split (ompi_osc_rdma_full_connectivity_btls, ','); if (btls_to_use) { + mca_btl_base_selected_module_t *item; + + selected_btl = NULL; + /* rdma and atomics are only supported with BTLs at the moment */ OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { for (int i = 0 ; btls_to_use[i] ; ++i) { @@ -1002,9 +999,8 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo continue; } - if ((item->btl_module->btl_flags & (MCA_BTL_FLAGS_RDMA)) == MCA_BTL_FLAGS_RDMA && - (item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS))) { - if (!selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) { + if (ompi_osc_rdma_check_accelerated_btl(item->btl_module)) { + if (NULL == selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) { selected_btl = item->btl_module; } } @@ -1012,126 +1008,92 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo } opal_argv_free 
(btls_to_use); - } - if (NULL != selected_btl) { - if (module) { - ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); - module->btls_in_use = 1; - module->use_memory_registration = selected_btl->btl_register_mem != NULL; + if (NULL != selected_btl) { + goto btl_selection_complete; } - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s", - selected_btl->btl_component->btl_version.mca_component_name); - return OMPI_SUCCESS; } /* if osc/rdma gets selected we need to ensure that all local procs have been added */ ompi_osc_rdma_ensure_local_add_procs (); - - for (int rank = 0 ; rank < comm_size ; ++rank) { - ompi_proc_t *proc = ompi_comm_peer_lookup (comm, rank); - mca_bml_base_endpoint_t *endpoint; - int num_btls, prev_max; - bool found_btl = false; - - endpoint = mca_bml_base_get_endpoint (proc); - if (NULL == endpoint) { - /* can't continue if some peer is unreachable */ - rc = OMPI_ERR_UNREACH; - break; - } - - num_btls = mca_bml_base_btl_array_get_size (&endpoint->btl_rdma); - if (0 == num_btls) { - rc = OMPI_ERR_NOT_AVAILABLE; - /* at least one rank doesn't have an RDMA capable btl */ - break; - } - prev_max = max_btls; + /* + * A BTL in the list known to reach all peers that met our + * other requirements was not found. Look for BTLs that may be + * able to talk to all peers. This is obviously more expensive + * than the check above. + * + * This algorithm skips a potential use case: it requires + * reachability to self, which is not strictly needed if BTL and + * CPU atomics are atomic with each other. However, the set of + * BTLs which can not send to self, which have RDMA semantics, and + * which have the required atomicity is currently the null set and + * almost certain to remain the null set, so we keep it simple. + * + * We only want BTLs that can reach all peers, so use rank 0's BTL + * list as the list of all available BTLs.
Any BTL that cannot + * be used to communicate with rank 0 necessarily is not in the + * list of all available BTLs for this algorithm. + */ + base_endpoint = mca_bml_base_get_endpoint(ompi_comm_peer_lookup(comm, 0)); + if (NULL == base_endpoint) { + return OMPI_ERR_UNREACH; + } - max_btls = (max_btls > num_btls) ? max_btls : num_btls; + selected_btl = NULL; + for (size_t i_btl = 0 ; + i_btl < mca_bml_base_btl_array_get_size(&base_endpoint->btl_rdma); + ++i_btl) { + bool have_connectivity = true; + struct mca_bml_base_btl_t *examine_bml_btl; + struct mca_btl_base_module_t *examine_btl; - tmp = realloc (possible_btls, sizeof (void *) * max_btls); - if (NULL == tmp) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - break; + examine_bml_btl = mca_bml_base_btl_array_get_index(&base_endpoint->btl_rdma, i_btl); + if (NULL == examine_bml_btl) { + return OMPI_ERR_NOT_FOUND; } - possible_btls = tmp; + examine_btl = examine_bml_btl->btl; - for (int j = prev_max ; j < max_btls ; ++j) { - possible_btls[j] = NULL; + /* skip any BTL which doesn't meet our requirements */ + if (!ompi_osc_rdma_check_accelerated_btl(examine_btl)) { + continue; } - tmp = realloc (btl_counts, sizeof (int) * max_btls); - if (NULL == tmp) { - rc = OMPI_ERR_OUT_OF_RESOURCE; - break; - } - btl_counts = tmp; - - for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) { - /* for this implementation we need only compare-and-swap and fetch-and-add */ - if ((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) == - (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS) && (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags & - MCA_BTL_ATOMIC_SUPPORTS_ADD)) { - for (int j = 0 ; j < max_btls ; ++j) { - if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) { - ++btl_counts[j]; - found_btl = true; - break; - } else if (NULL == possible_btls[j]) { - possible_btls[j] = endpoint->btl_rdma.bml_btls[i_btl].btl; - btl_counts[j] = 1; - found_btl = true; - break; - } - } - } - } + 
/* check connectivity across all ranks */ + for (int rank = 0 ; rank < comm_size ; ++rank) { + ompi_proc_t *proc = ompi_comm_peer_lookup(comm, rank); + mca_bml_base_endpoint_t *endpoint; - /* any non-local rank must have a usable btl */ - if (!found_btl && comm_rank != rank) { - /* no btl = no rdma/atomics */ - rc = OMPI_ERR_UNREACH; - break; - } - } - - if (OMPI_SUCCESS != rc) { - free (possible_btls); - free (btl_counts); - return rc; - } - - for (int i = 0 ; i < max_btls ; ++i) { - int btl_count = btl_counts[i]; - - if (NULL == possible_btls[i]) { - break; - } + endpoint = mca_bml_base_get_endpoint(proc); + if (NULL == endpoint) { + have_connectivity = false; + break; + } - if (possible_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) { - /* do not need to use the btl for self communication */ - btl_count++; + if (NULL == mca_bml_base_btl_array_find(&endpoint->btl_rdma, + examine_btl)) { + have_connectivity = false; + break; + } } - if (btl_count >= comm_size && possible_btls[i]->btl_latency < selected_latency) { - selected_btl = possible_btls[i]; - selected_latency = possible_btls[i]->btl_latency; + /* if we have connectivity, displace currently selected btl if + * this one has lower latency; we prioritize latency over all + * other parameters + */ + if (have_connectivity) { + if (NULL == selected_btl || examine_btl->btl_latency < selected_btl->btl_latency) { + selected_btl = examine_btl; + } } } - free (possible_btls); - free (btl_counts); - if (NULL == selected_btl) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no suitable btls found"); - /* no btl = no rdma/atomics */ return OMPI_ERR_NOT_AVAILABLE; } +btl_selection_complete: if (module) { ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); module->btls_in_use = 1; @@ -1408,7 +1370,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } /* find rdma capable endpoints */ - ret = ompi_osc_rdma_query_btls (module->comm, module); + ret = 
ompi_osc_rdma_query_accelerated_btls (module->comm, module); if (OMPI_SUCCESS != ret) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "could not find a suitable btl. falling back on " "active-message BTLs"); diff --git a/opal/mca/btl/base/btl_base_am_rdma.c b/opal/mca/btl/base/btl_base_am_rdma.c index e014eb05a82..2b1e3400195 100644 --- a/opal/mca/btl/base/btl_base_am_rdma.c +++ b/opal/mca/btl/base/btl_base_am_rdma.c @@ -4,6 +4,8 @@ * reserved. * Copyright (c) 2020-2021 Google, LLC. All rights reserved. * Copyright (c) 2021-2022 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -11,18 +13,27 @@ * $HEADER$ */ +#include "opal_config.h" + +#include + #include "opal/mca/btl/base/btl_base_am_rdma.h" #include "opal/mca/btl/base/base.h" #include "opal/mca/btl/base/btl_base_error.h" #include "opal/mca/threads/mutex.h" +#include "opal/util/minmax.h" + /** - * @brief data for active-message atomics + * @brief global data for active message wrapper * - * There is currently only one module but it is defined to allow - * moving the data pointer into the associated BTL module. + * While individual entries in queued_responses and + * queued_initiator_descriptors are module-specific (ie, per BTL + * module), they are progressed in a common progress function. It is + * much more efficient to have one list of work to do, rather than + * having to poll through all active btls to find the work to do. 
*/ -struct mca_btl_base_am_rdma_module_t { +struct am_rdma_component_t { opal_object_t super; /** provides protection for multi-threaded access to module members */ opal_mutex_t mutex; @@ -31,29 +42,36 @@ struct mca_btl_base_am_rdma_module_t { /** queued initiator descriptors */ opal_list_t queued_initiator_descriptors; }; -typedef struct mca_btl_base_am_rdma_module_t mca_btl_base_am_rdma_module_t; +typedef struct am_rdma_component_t am_rdma_component_t; -static void mca_btl_base_am_rdma_module_init(mca_btl_base_am_rdma_module_t *module) +static am_rdma_component_t default_component; + +static void am_rdma_component_init(am_rdma_component_t *component) { - OBJ_CONSTRUCT(&module->mutex, opal_mutex_t); - OBJ_CONSTRUCT(&module->queued_responses, opal_list_t); - OBJ_CONSTRUCT(&module->queued_initiator_descriptors, opal_list_t); + OBJ_CONSTRUCT(&component->mutex, opal_mutex_t); + OBJ_CONSTRUCT(&component->queued_responses, opal_list_t); + OBJ_CONSTRUCT(&component->queued_initiator_descriptors, opal_list_t); } -static void mca_btl_base_am_rdma_module_fini(mca_btl_base_am_rdma_module_t *module) +static void am_rdma_component_fini(am_rdma_component_t *component) { - OBJ_DESTRUCT(&module->mutex); - OBJ_DESTRUCT(&module->queued_responses); - OBJ_DESTRUCT(&module->queued_initiator_descriptors); + OBJ_DESTRUCT(&component->mutex); + OBJ_DESTRUCT(&component->queued_responses); + OBJ_DESTRUCT(&component->queued_initiator_descriptors); } -static OBJ_CLASS_INSTANCE(mca_btl_base_am_rdma_module_t, opal_object_t, - mca_btl_base_am_rdma_module_init, mca_btl_base_am_rdma_module_fini); +static OBJ_CLASS_INSTANCE(am_rdma_component_t, opal_object_t, + am_rdma_component_init, am_rdma_component_fini); + + +OBJ_CLASS_INSTANCE(mca_btl_base_am_rdma_module_t, opal_object_t, + NULL, NULL); + /** * @brief response header for an active-message RDMA/atomic operation */ -struct mca_btl_base_rdma_response_hdr_t { +struct am_rdma_response_hdr_t { /** context for the response */ uint64_t context; /** 
initiator address */ @@ -63,12 +81,13 @@ struct mca_btl_base_rdma_response_hdr_t { /** response data may follow. the size is implied by the size of the incoming * descriptor */ }; -typedef struct mca_btl_base_rdma_response_hdr_t mca_btl_base_rdma_response_hdr_t; +typedef struct am_rdma_response_hdr_t am_rdma_response_hdr_t; + /** * @brief type of active-message RDMA/atomic operation */ -enum mca_btl_base_rdma_type_t { +enum am_rdma_type_t { /** active-message put. May be implemented with send/recv or RDMA get * depending on the functions that the BTL implements. */ MCA_BTL_BASE_AM_PUT, @@ -80,12 +99,13 @@ enum mca_btl_base_rdma_type_t { /** compare-and-swap */ MCA_BTL_BASE_AM_CAS, }; -typedef enum mca_btl_base_rdma_type_t mca_btl_base_rdma_type_t; +typedef enum am_rdma_type_t am_rdma_type_t; + /** * @brief origin-side operation context for an active-message RDMA/atomic operation */ -struct mca_btl_base_rdma_context_t { +struct am_rdma_context_t { opal_object_t super; /** operation type */ uint8_t type; @@ -110,35 +130,37 @@ struct mca_btl_base_rdma_context_t { /** local handle for this request */ struct mca_btl_base_registration_handle_t *local_handle; }; -typedef struct mca_btl_base_rdma_context_t mca_btl_base_rdma_context_t; +typedef struct am_rdma_context_t am_rdma_context_t; -static void mca_btl_base_rdma_context_init(mca_btl_base_rdma_context_t *context) +static void am_rdma_context_init(am_rdma_context_t *context) { context->sent = 0; context->acknowledged = 0; context->descriptor = NULL; } -static OBJ_CLASS_INSTANCE(mca_btl_base_rdma_context_t, opal_object_t, - mca_btl_base_rdma_context_init, NULL); +static OBJ_CLASS_INSTANCE(am_rdma_context_t, opal_object_t, + am_rdma_context_init, NULL); + /** * @brief queued initiator descriptor */ -struct mca_btl_base_am_rdma_queued_descriptor_t { +struct am_rdma_queued_descriptor_t { opal_list_item_t super; - mca_btl_base_module_t *btl; + mca_btl_base_am_rdma_module_t *am_module; struct mca_btl_base_endpoint_t 
*endpoint; mca_btl_base_descriptor_t *descriptor; }; -typedef struct mca_btl_base_am_rdma_queued_descriptor_t mca_btl_base_am_rdma_queued_descriptor_t; +typedef struct am_rdma_queued_descriptor_t am_rdma_queued_descriptor_t; + +static OBJ_CLASS_INSTANCE(am_rdma_queued_descriptor_t, opal_list_item_t, NULL, NULL); -static OBJ_CLASS_INSTANCE(mca_btl_base_am_rdma_queued_descriptor_t, opal_list_item_t, NULL, NULL); /** * @brief header for an active-message atomic/RDMA operation */ -struct mca_btl_base_rdma_hdr_t { +struct am_rdma_hdr_t { /** type of operation requested. */ uint8_t type; uint8_t padding[3]; @@ -177,12 +199,12 @@ struct mca_btl_base_rdma_hdr_t { * to the initiator */ uint64_t context; }; -typedef struct mca_btl_base_rdma_hdr_t mca_btl_base_rdma_hdr_t; +typedef struct am_rdma_hdr_t am_rdma_hdr_t; /** * @brief target-side RDMA/atomic operation */ -struct mca_btl_base_rdma_operation_t { +struct am_rdma_operation_t { /** these may be stored in lists */ opal_list_item_t super; /** btl module associated with this operation */ @@ -194,7 +216,7 @@ struct mca_btl_base_rdma_operation_t { * needs to be retried. */ mca_btl_base_descriptor_t *descriptor; /** incoming operation header */ - mca_btl_base_rdma_hdr_t hdr; + am_rdma_hdr_t hdr; /** local memory handle (if using RDMA) */ uint8_t local_handle_data[MCA_BTL_REG_HANDLE_MAX_SIZE]; /** remote memory handle (if using RMDA) */ @@ -206,45 +228,32 @@ struct mca_btl_base_rdma_operation_t { /** rdma operation was completed (waiting response) */ bool is_completed; }; -typedef struct mca_btl_base_rdma_operation_t mca_btl_base_rdma_operation_t; - -static OBJ_CLASS_INSTANCE(mca_btl_base_rdma_operation_t, opal_list_item_t, NULL, NULL); +typedef struct am_rdma_operation_t am_rdma_operation_t; -static inline size_t size_t_min(size_t a, size_t b) -{ - return (a < b) ? 
a : b; -} +static OBJ_CLASS_INSTANCE(am_rdma_operation_t, opal_list_item_t, NULL, NULL); -static mca_btl_base_am_rdma_module_t default_module; -static inline bool mca_btl_base_rdma_use_rdma_get(mca_btl_base_module_t *btl) +static inline bool am_rdma_is_atomic(am_rdma_type_t type) { - return !!(btl->btl_flags & MCA_BTL_FLAGS_GET); + return (MCA_BTL_BASE_AM_PUT != type && MCA_BTL_BASE_AM_GET != type); } -static inline bool mca_btl_base_rdma_use_rdma_put(mca_btl_base_module_t *btl) -{ - return !!(btl->btl_flags & MCA_BTL_FLAGS_PUT); -} -static inline bool mca_btl_base_rdma_is_atomic(mca_btl_base_rdma_type_t type) +static inline size_t am_rdma_operation_size(mca_btl_base_am_rdma_module_t *am_module, + am_rdma_type_t type, + size_t remaining) { - return (MCA_BTL_BASE_AM_PUT != type && MCA_BTL_BASE_AM_GET != type); -} + mca_btl_base_module_t *btl = am_module->btl; -static inline size_t mca_btl_base_rdma_operation_size(mca_btl_base_module_t *btl, - mca_btl_base_rdma_type_t type, - size_t remaining) -{ switch (type) { case MCA_BTL_BASE_AM_PUT: - if (mca_btl_base_rdma_use_rdma_get(btl)) { - return size_t_min(remaining, btl->btl_get_limit); + if (am_module->use_rdma_get) { + return opal_min(remaining, btl->btl_get_limit); } break; case MCA_BTL_BASE_AM_GET: - if (mca_btl_base_rdma_use_rdma_put(btl)) { - return size_t_min(remaining, btl->btl_put_limit); + if (am_module->use_rdma_put) { + return opal_min(remaining, btl->btl_put_limit); } break; case MCA_BTL_BASE_AM_ATOMIC: @@ -253,10 +262,11 @@ static inline size_t mca_btl_base_rdma_operation_size(mca_btl_base_module_t *btl return remaining; } - return size_t_min(remaining, btl->btl_max_send_size - sizeof(mca_btl_base_rdma_hdr_t)); + return opal_min(remaining, btl->btl_max_send_size - sizeof(am_rdma_hdr_t)); } -static inline int mca_btl_base_rdma_tag(mca_btl_base_rdma_type_t type) + +static inline int am_rdma_tag(am_rdma_type_t type) { (void) type; switch (type) { @@ -270,11 +280,13 @@ static inline int 
mca_btl_base_rdma_tag(mca_btl_base_rdma_type_t type) return MCA_BTL_BASE_TAG_RDMA_RESP; } -static inline int mca_btl_base_rdma_resp_tag(void) + +static inline int am_rdma_resp_tag(void) { return MCA_BTL_BASE_TAG_RDMA_RESP; } + /** * @brief copy data from a segment to a local address * @@ -283,9 +295,9 @@ static inline int mca_btl_base_rdma_resp_tag(void) * @in segments segments to copy data from * @in segment_count number of segments */ -static void mca_btl_base_copy_from_segments(uint64_t addr, size_t skip_bytes, - const mca_btl_base_segment_t *segments, - size_t segment_count) +static void am_rdma_copy_from_segments(uint64_t addr, size_t skip_bytes, + const mca_btl_base_segment_t *segments, + size_t segment_count) { const void *seg0_data = (const void *) ((uintptr_t) segments[0].seg_addr.pval + skip_bytes); size_t seg0_len = segments[0].seg_len - skip_bytes; @@ -306,6 +318,7 @@ static void mca_btl_base_copy_from_segments(uint64_t addr, size_t skip_bytes, } } + /** * @brief copy data from a local address into a segment * @@ -314,11 +327,11 @@ static void mca_btl_base_copy_from_segments(uint64_t addr, size_t skip_bytes, * @in segments segments to copy data to * @in segment_count number of segments */ -static void mca_btl_base_copy_to_segments(uint64_t addr, size_t max_len, size_t skip_bytes, - mca_btl_base_segment_t *segments, size_t segment_count) +static void am_rdma_copy_to_segments(uint64_t addr, size_t max_len, size_t skip_bytes, + mca_btl_base_segment_t *segments, size_t segment_count) { void *seg0_data = (void *) ((uintptr_t) segments[0].seg_addr.pval + skip_bytes); - size_t seg0_len = size_t_min(max_len, segments[0].seg_len - skip_bytes); + size_t seg0_len = opal_min(max_len, segments[0].seg_len - skip_bytes); if (seg0_len > 0) { BTL_VERBOSE( @@ -330,7 +343,7 @@ static void mca_btl_base_copy_to_segments(uint64_t addr, size_t max_len, size_t } for (size_t i = 1; i < segment_count && max_len; ++i) { - size_t seg_len = size_t_min(segments[i].seg_len, max_len); 
+ size_t seg_len = opal_min(segments[i].seg_len, max_len); BTL_VERBOSE(("packing %" PRIsize_t " bytes from 0x%" PRIx64 " to segment %" PRIsize_t, seg_len, addr, i)); @@ -343,27 +356,29 @@ static void mca_btl_base_copy_to_segments(uint64_t addr, size_t max_len, size_t } } -static void mca_btl_base_am_queue_initiator_descriptor(mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - mca_btl_base_descriptor_t *descriptor) + +static void am_rdma_queue_initiator_descriptor(mca_btl_base_am_rdma_module_t *am_module, + struct mca_btl_base_endpoint_t *endpoint, + mca_btl_base_descriptor_t *descriptor) { - mca_btl_base_am_rdma_queued_descriptor_t *queued_descriptor = OBJ_NEW( - mca_btl_base_am_rdma_queued_descriptor_t); + am_rdma_queued_descriptor_t *queued_descriptor = OBJ_NEW(am_rdma_queued_descriptor_t); - queued_descriptor->btl = btl; + queued_descriptor->am_module = am_module; queued_descriptor->endpoint = endpoint; queued_descriptor->descriptor = descriptor; - OPAL_THREAD_SCOPED_LOCK(&default_module.mutex, - opal_list_append(&default_module.queued_initiator_descriptors, + OPAL_THREAD_SCOPED_LOCK(&default_component.mutex, + opal_list_append(&default_component.queued_initiator_descriptors, &queued_descriptor->super)); } -static inline int mca_btl_base_am_rdma_advance(mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - mca_btl_base_rdma_context_t *context, - bool send_descriptor) + +static inline int am_rdma_advance(mca_btl_base_am_rdma_module_t *am_module, + struct mca_btl_base_endpoint_t *endpoint, + am_rdma_context_t *context, + bool send_descriptor) { + mca_btl_base_module_t *btl = am_module->btl; int ret; const size_t remaining = context->total_size - context->sent; @@ -380,14 +395,16 @@ static inline int mca_btl_base_am_rdma_advance(mca_btl_base_module_t *btl, } mca_btl_base_descriptor_t *descriptor = context->descriptor; - mca_btl_base_rdma_hdr_t *hdr = (mca_btl_base_rdma_hdr_t *) descriptor->des_segments[0] + am_rdma_hdr_t 
*hdr = (am_rdma_hdr_t *) descriptor->des_segments[0] .seg_addr.pval; - const size_t packet_size = mca_btl_base_rdma_operation_size(btl, hdr->type, remaining); + const size_t packet_size = am_rdma_operation_size(am_module, hdr->type, remaining); - if (!mca_btl_base_rdma_is_atomic(hdr->type)) { + if (!am_rdma_is_atomic(hdr->type)) { hdr->data.rdma.size = packet_size; hdr->data.rdma.initiator_address = (uint64_t) context->local_address + context->sent; } else { + /* atomics today are single datatype entries */ + assert(packet_size < UINT8_MAX); hdr->data.atomic.size = packet_size; } @@ -397,13 +414,13 @@ static inline int mca_btl_base_am_rdma_advance(mca_btl_base_module_t *btl, if (MCA_BTL_BASE_AM_PUT == hdr->type && !hdr->data.rdma.use_rdma) { /* copy the next block into the fragment buffer */ - mca_btl_base_copy_to_segments(hdr->data.rdma.initiator_address, packet_size, sizeof(*hdr), - descriptor->des_segments, descriptor->des_segment_count); + am_rdma_copy_to_segments(hdr->data.rdma.initiator_address, packet_size, sizeof(*hdr), + descriptor->des_segments, descriptor->des_segment_count); } if (send_descriptor) { assert(0 != (descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)); - ret = btl->btl_send(btl, endpoint, descriptor, mca_btl_base_rdma_tag(hdr->type)); + ret = btl->btl_send(btl, endpoint, descriptor, am_rdma_tag(hdr->type)); if (ret == 1) { ret = OPAL_SUCCESS; } @@ -411,32 +428,36 @@ static inline int mca_btl_base_am_rdma_advance(mca_btl_base_module_t *btl, } /* queue for later to avoid btl_send in callback */ - mca_btl_base_am_queue_initiator_descriptor(btl, endpoint, descriptor); + am_rdma_queue_initiator_descriptor(am_module, endpoint, descriptor); return OPAL_SUCCESS; } -static void mca_btl_base_am_descriptor_complete(mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - mca_btl_base_descriptor_t *descriptor, int status) + +static void am_rdma_descriptor_complete(mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t 
*endpoint, + mca_btl_base_descriptor_t *descriptor, int status) { - (void) mca_btl_base_am_rdma_advance(btl, endpoint, - (mca_btl_base_rdma_context_t *) descriptor->des_context, - /*send_descriptor=*/false); + mca_btl_base_am_rdma_module_t *am_module = (mca_btl_base_am_rdma_module_t *)descriptor->des_cbdata; + + (void) am_rdma_advance(am_module, endpoint, + (am_rdma_context_t *) descriptor->des_context, + /*send_descriptor=*/false); } -static inline int -mca_btl_base_rdma_start(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - int type, uint64_t operand1, uint64_t operand2, mca_btl_base_atomic_op_t op, - int order, int flags, size_t size, void *local_address, - mca_btl_base_registration_handle_t *local_handle, int64_t remote_address, - mca_btl_base_registration_handle_t *remote_handle, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) + +static inline int am_rdma_start(mca_btl_base_am_rdma_module_t *am_module, struct mca_btl_base_endpoint_t *endpoint, + int type, uint64_t operand1, uint64_t operand2, mca_btl_base_atomic_op_t op, + int order, int flags, size_t size, void *local_address, + mca_btl_base_registration_handle_t *local_handle, int64_t remote_address, + mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_rdma_hdr_t *hdr; + mca_btl_base_module_t *btl = am_module->btl; + am_rdma_hdr_t *hdr; size_t packet_size = sizeof(*hdr); mca_btl_base_descriptor_t *descriptor; - mca_btl_base_rdma_context_t *context = OBJ_NEW(mca_btl_base_rdma_context_t); + am_rdma_context_t *context = OBJ_NEW(am_rdma_context_t); if (OPAL_UNLIKELY(NULL == context)) { return OPAL_ERR_OUT_OF_RESOURCE; @@ -457,14 +478,14 @@ mca_btl_base_rdma_start(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint if (sizeof(*hdr) + size <= btl->btl_eager_limit) { /* just go ahead and send the data */ packet_size += size; - } else if (!mca_btl_base_rdma_use_rdma_get 
(btl)) { - packet_size += size_t_min (size, btl->btl_max_send_size - sizeof (*hdr)); + } else if (!am_module->use_rdma_get) { + packet_size += opal_min (size, btl->btl_max_send_size - sizeof (*hdr)); } else { use_rdma = true; } } else if (MCA_BTL_BASE_AM_GET == type) { - if (!mca_btl_base_rdma_use_rdma_put(btl)) { - packet_size += size_t_min(size, btl->btl_max_send_size - sizeof(*hdr)); + if (!am_module->use_rdma_put) { + packet_size += opal_min(size, btl->btl_max_send_size - sizeof(*hdr)); } else { use_rdma = true; } @@ -494,14 +515,14 @@ mca_btl_base_rdma_start(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint * be released on response before the descriptor callback has completed. */ OBJ_RETAIN(context); - descriptor->des_cbfunc = mca_btl_base_am_descriptor_complete; - descriptor->des_cbdata = local_handle; + descriptor->des_cbfunc = am_rdma_descriptor_complete; + descriptor->des_cbdata = am_module; descriptor->des_context = context; - hdr = (mca_btl_base_rdma_hdr_t *) descriptor->des_segments[0].seg_addr.pval; + hdr = (am_rdma_hdr_t *) descriptor->des_segments[0].seg_addr.pval; hdr->type = type; - if (!mca_btl_base_rdma_is_atomic(type)) { + if (!am_rdma_is_atomic(type)) { hdr->data.rdma.use_rdma = use_rdma; } else { hdr->data.atomic.op = op; @@ -518,14 +539,16 @@ mca_btl_base_rdma_start(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint memcpy(handle_buffer, remote_handle, btl->btl_registration_handle_size); } - return mca_btl_base_am_rdma_advance(btl, endpoint, context, /*send_descriptor=*/true); + return am_rdma_advance(am_module, endpoint, context, /*send_descriptor=*/true); } -static mca_btl_base_rdma_operation_t *mca_btl_base_rdma_alloc_operation( - mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - mca_btl_base_descriptor_t *descriptor, const mca_btl_base_rdma_hdr_t *hdr) + +static am_rdma_operation_t *am_rdma_alloc_operation(mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + 
mca_btl_base_descriptor_t *descriptor, + const am_rdma_hdr_t *hdr) { - mca_btl_base_rdma_operation_t *operation = OBJ_NEW(mca_btl_base_rdma_operation_t); + am_rdma_operation_t *operation = OBJ_NEW(am_rdma_operation_t); if (NULL == operation) { return NULL; } @@ -537,7 +560,7 @@ static mca_btl_base_rdma_operation_t *mca_btl_base_rdma_alloc_operation( operation->is_queued = false; memcpy(&operation->hdr, hdr, sizeof(*hdr)); - if (!mca_btl_base_rdma_is_atomic(hdr->type) && hdr->data.rdma.use_rdma + if (!am_rdma_is_atomic(hdr->type) && hdr->data.rdma.use_rdma && btl->btl_register_mem) { const uint8_t *handle_data = (const uint8_t *) (hdr + 1); /* the initiator packs these in order of their local and then remote. */ @@ -549,15 +572,16 @@ static mca_btl_base_rdma_operation_t *mca_btl_base_rdma_alloc_operation( return operation; } -static void mca_btl_base_rdma_queue_operation(mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - mca_btl_base_descriptor_t *descriptor, - uint64_t atomic_response, - const mca_btl_base_rdma_hdr_t *hdr, - mca_btl_base_rdma_operation_t *operation) + +static void am_rdma_queue_operation(mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + mca_btl_base_descriptor_t *descriptor, + uint64_t atomic_response, + const am_rdma_hdr_t *hdr, + am_rdma_operation_t *operation) { if (NULL == operation) { - operation = mca_btl_base_rdma_alloc_operation(btl, endpoint, descriptor, hdr); + operation = am_rdma_alloc_operation(btl, endpoint, descriptor, hdr); if (NULL == operation) { /* couldn't even allocate a small amount of memory. not much else can be done. 
*/ BTL_ERROR(("could not allocate memory to queue active-message RDMA operation")); @@ -567,21 +591,22 @@ static void mca_btl_base_rdma_queue_operation(mca_btl_base_module_t *btl, operation->is_queued = true; operation->atomic_response = atomic_response; - OPAL_THREAD_SCOPED_LOCK(&default_module.mutex, - opal_list_append(&default_module.queued_responses, &operation->super)); + OPAL_THREAD_SCOPED_LOCK(&default_component.mutex, + opal_list_append(&default_component.queued_responses, &operation->super)); } -static int mca_btl_base_am_rdma_respond(mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - mca_btl_base_descriptor_t **descriptor, void *addr, - const mca_btl_base_rdma_hdr_t *hdr) + +static int am_rdma_respond(mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + mca_btl_base_descriptor_t **descriptor, void *addr, + const am_rdma_hdr_t *hdr) { mca_btl_base_descriptor_t *send_descriptor = *descriptor; *descriptor = NULL; if (NULL == send_descriptor) { - mca_btl_base_rdma_response_hdr_t *resp_hdr; - size_t data_size = mca_btl_base_rdma_is_atomic(hdr->type) ? hdr->data.atomic.size + am_rdma_response_hdr_t *resp_hdr; + size_t data_size = am_rdma_is_atomic(hdr->type) ? hdr->data.atomic.size : hdr->data.rdma.size; size_t packet_size = sizeof(*resp_hdr) + (addr ? 
data_size : 0); send_descriptor = btl->btl_alloc(btl, endpoint, MCA_BTL_NO_ORDER, packet_size, @@ -590,7 +615,7 @@ static int mca_btl_base_am_rdma_respond(mca_btl_base_module_t *btl, return OPAL_ERR_OUT_OF_RESOURCE; } - resp_hdr = (mca_btl_base_rdma_response_hdr_t *) send_descriptor->des_segments[0] + resp_hdr = (am_rdma_response_hdr_t *) send_descriptor->des_segments[0] .seg_addr.pval; resp_hdr->context = hdr->context; if (MCA_BTL_BASE_AM_GET == hdr->type) { @@ -602,9 +627,9 @@ static int mca_btl_base_am_rdma_respond(mca_btl_base_module_t *btl, resp_hdr->response_size = data_size; if (NULL != addr) { - mca_btl_base_copy_to_segments((uint64_t)(uintptr_t) addr, packet_size, - sizeof(*resp_hdr), send_descriptor->des_segments, - send_descriptor->des_segment_count); + am_rdma_copy_to_segments((uint64_t)(uintptr_t) addr, packet_size, + sizeof(*resp_hdr), send_descriptor->des_segments, + send_descriptor->des_segment_count); } } @@ -615,7 +640,7 @@ static int mca_btl_base_am_rdma_respond(mca_btl_base_module_t *btl, /* There is no callback for the response descriptor, therefore it is * safe to treat 0 and 1 return codes the same */ - int ret = btl->btl_send(btl, endpoint, send_descriptor, mca_btl_base_rdma_resp_tag()); + int ret = btl->btl_send(btl, endpoint, send_descriptor, am_rdma_resp_tag()); if (ret == 1) { ret = OPAL_SUCCESS; } @@ -626,75 +651,81 @@ static int mca_btl_base_am_rdma_respond(mca_btl_base_module_t *btl, return ret; } + static void -mca_btl_base_am_rmda_rdma_complete(mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, - void *context, void *cbdata, int status) +am_rdma_rdma_complete(mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + void *context, void *cbdata, int status) { - mca_btl_base_rdma_operation_t *operation = (mca_btl_base_rdma_operation_t *) 
context; + am_rdma_operation_t *operation = (am_rdma_operation_t *) context; BTL_VERBOSE(("BTL RDMA operation complete. status=%d", status)); assert(OPAL_SUCCESS == status); operation->is_completed = true; - int ret = mca_btl_base_am_rdma_respond(operation->btl, operation->endpoint, - &operation->descriptor, NULL, &operation->hdr); + int ret = am_rdma_respond(operation->btl, operation->endpoint, + &operation->descriptor, NULL, &operation->hdr); if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { BTL_VERBOSE( ("could not send a response. queueing the response for later. endpoint=%p, ret=%d", (void*) endpoint, ret)); - mca_btl_base_rdma_queue_operation(btl, NULL, NULL, 0, NULL, operation); + am_rdma_queue_operation(btl, NULL, NULL, 0, NULL, operation); } OBJ_RELEASE(operation); } -static int mca_btl_base_am_rdma_target_get(mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - mca_btl_base_descriptor_t **descriptor, - void *target_address, const mca_btl_base_rdma_hdr_t *hdr, - mca_btl_base_rdma_operation_t **operation) + +static int am_rdma_target_put(mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + mca_btl_base_descriptor_t **descriptor, + const mca_btl_base_segment_t *segments, + size_t segment_count, void *target_address, + const am_rdma_hdr_t *hdr, + am_rdma_operation_t **operation) { if (hdr->data.rdma.use_rdma) { if (NULL == *operation) { - *operation = mca_btl_base_rdma_alloc_operation(btl, endpoint, *descriptor, hdr); + *operation = am_rdma_alloc_operation(btl, endpoint, *descriptor, hdr); if (NULL == *operation) { return OPAL_ERR_OUT_OF_RESOURCE; } } - /* btl supports put but not get. emulating get with put */ + /* btl supports get but not put. 
emulating put with get */ OBJ_RETAIN(*operation); - int ret = btl->btl_put( + int ret = btl->btl_get( btl, endpoint, target_address, hdr->data.rdma.initiator_address, (struct mca_btl_base_registration_handle_t *) (*operation)->local_handle_data, (struct mca_btl_base_registration_handle_t *) (*operation)->remote_handle_data, - hdr->data.rdma.size, /*flags=*/0, MCA_BTL_NO_ORDER, mca_btl_base_am_rmda_rdma_complete, - *operation, NULL); + hdr->data.rdma.size, /*flags=*/0, MCA_BTL_NO_ORDER, am_rdma_rdma_complete, + *operation, NULL); if (OPAL_SUCCESS != ret) { OBJ_RELEASE(*operation); } + if (OPAL_ERR_NOT_AVAILABLE != ret) { return ret; } + } else if (NULL != segments) { + am_rdma_copy_from_segments(hdr->target_address, sizeof(*hdr), segments, segment_count); } - return mca_btl_base_am_rdma_respond(btl, endpoint, descriptor, target_address, hdr); + return am_rdma_respond(btl, endpoint, descriptor, NULL, hdr); } -static int mca_btl_base_am_rdma_target_put(mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, - mca_btl_base_descriptor_t **descriptor, - const mca_btl_base_segment_t *segments, - size_t segment_count, void *target_address, - const mca_btl_base_rdma_hdr_t *hdr, - mca_btl_base_rdma_operation_t **operation) + +static int am_rdma_target_get(mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + mca_btl_base_descriptor_t **descriptor, + void *target_address, const am_rdma_hdr_t *hdr, + am_rdma_operation_t **operation) { if (hdr->data.rdma.use_rdma) { if (NULL == *operation) { - *operation = mca_btl_base_rdma_alloc_operation(btl, endpoint, *descriptor, hdr); + *operation = am_rdma_alloc_operation(btl, endpoint, *descriptor, hdr); if (NULL == *operation) { return OPAL_ERR_OUT_OF_RESOURCE; } } @@ -702,27 +733,25 @@ static int mca_btl_base_am_rdma_target_put(mca_btl_base_module_t *btl, /* btl supports put but not get. 
emulating get with put */ OBJ_RETAIN(*operation); - int ret = btl->btl_get( + int ret = btl->btl_put( btl, endpoint, target_address, hdr->data.rdma.initiator_address, (struct mca_btl_base_registration_handle_t *) (*operation)->local_handle_data, (struct mca_btl_base_registration_handle_t *) (*operation)->remote_handle_data, - hdr->data.rdma.size, /*flags=*/0, MCA_BTL_NO_ORDER, mca_btl_base_am_rmda_rdma_complete, - operation, NULL); + hdr->data.rdma.size, /*flags=*/0, MCA_BTL_NO_ORDER, am_rdma_rdma_complete, + *operation, NULL); if (OPAL_SUCCESS != ret) { OBJ_RELEASE(*operation); } - if (OPAL_ERR_NOT_AVAILABLE != ret) { return ret; } - } else if (NULL != segments) { - mca_btl_base_copy_from_segments(hdr->target_address, sizeof(*hdr), segments, segment_count); } - return mca_btl_base_am_rdma_respond(btl, endpoint, descriptor, NULL, hdr); + return am_rdma_respond(btl, endpoint, descriptor, target_address, hdr); } -static void mca_btl_base_rdma_retry_operation(mca_btl_base_rdma_operation_t *operation) + +static void am_rdma_retry_operation(am_rdma_operation_t *operation) { void *target_address = (void *) (intptr_t) operation->hdr.target_address; int ret = OPAL_SUCCESS; @@ -730,43 +759,44 @@ static void mca_btl_base_rdma_retry_operation(mca_btl_base_rdma_operation_t *ope if (!operation->descriptor && !operation->is_completed) { switch (operation->hdr.type) { case MCA_BTL_BASE_AM_GET: - ret = mca_btl_base_am_rdma_target_get(operation->btl, operation->endpoint, - &operation->descriptor, target_address, - &operation->hdr, &operation); + ret = am_rdma_target_get(operation->btl, operation->endpoint, + &operation->descriptor, target_address, + &operation->hdr, &operation); break; case MCA_BTL_BASE_AM_PUT: - ret = mca_btl_base_am_rdma_target_put(operation->btl, operation->endpoint, - &operation->descriptor, - /*segments=*/NULL, - /*segment_count=*/0, target_address, - &operation->hdr, &operation); + ret = am_rdma_target_put(operation->btl, operation->endpoint, + 
&operation->descriptor, + /*segments=*/NULL, + /*segment_count=*/0, target_address, + &operation->hdr, &operation); break; case MCA_BTL_BASE_AM_ATOMIC: /* atomic operation was completed */ - ret = mca_btl_base_am_rdma_respond(operation->btl, operation->endpoint, - &operation->descriptor, &operation->atomic_response, - &operation->hdr); + ret = am_rdma_respond(operation->btl, operation->endpoint, + &operation->descriptor, &operation->atomic_response, + &operation->hdr); break; } } else { - ret = mca_btl_base_am_rdma_respond(operation->btl, operation->endpoint, - &operation->descriptor, - /*addr=*/NULL, /*hdr=*/NULL); + ret = am_rdma_respond(operation->btl, operation->endpoint, + &operation->descriptor, + /*addr=*/NULL, /*hdr=*/NULL); } if (OPAL_SUCCESS == ret) { if (operation->is_queued) { - opal_list_remove_item(&default_module.queued_responses, &operation->super); + opal_list_remove_item(&default_component.queued_responses, &operation->super); } OBJ_RELEASE(operation); } } -static int mca_btl_base_am_rdma_progress(void) + +static int am_rdma_progress(void) { - if (0 == opal_list_get_size(&default_module.queued_responses) - && 0 == opal_list_get_size(&default_module.queued_initiator_descriptors)) { + if (0 == opal_list_get_size(&default_component.queued_responses) + && 0 == opal_list_get_size(&default_component.queued_initiator_descriptors)) { return 0; } @@ -775,41 +805,42 @@ static int mca_btl_base_am_rdma_progress(void) // (vs. using continuation characters in the use of // OPAL_THREAD_SCOPED_LOCK). 
#define ACTION1 \ - mca_btl_base_rdma_operation_t *operation, *next; \ - OPAL_LIST_FOREACH_SAFE (operation, next, \ - &default_module.queued_responses, \ - mca_btl_base_rdma_operation_t) { \ - mca_btl_base_rdma_retry_operation(operation); \ + am_rdma_operation_t *operation, *next; \ + OPAL_LIST_FOREACH_SAFE(operation, next, \ + &default_component.queued_responses, \ + am_rdma_operation_t) { \ + am_rdma_retry_operation(operation); \ } - OPAL_THREAD_SCOPED_LOCK(&default_module.mutex, ACTION1); + OPAL_THREAD_SCOPED_LOCK(&default_component.mutex, ACTION1); #define ACTION2 \ - mca_btl_base_am_rdma_queued_descriptor_t *descriptor, *next; \ - OPAL_LIST_FOREACH_SAFE (descriptor, next, \ - &default_module.queued_initiator_descriptors, \ - mca_btl_base_am_rdma_queued_descriptor_t) { \ - mca_btl_base_rdma_context_t *context = \ - (mca_btl_base_rdma_context_t *) \ - descriptor->descriptor->des_context; \ + am_rdma_queued_descriptor_t *descriptor, *next; \ + OPAL_LIST_FOREACH_SAFE(descriptor, next, \ + &default_component.queued_initiator_descriptors, \ + am_rdma_queued_descriptor_t) { \ + am_rdma_context_t *context = \ + (am_rdma_context_t *) descriptor->descriptor->des_context; \ + mca_btl_base_module_t *btl = descriptor->am_module->btl; \ assert(0 != (descriptor->descriptor->des_flags & MCA_BTL_DES_SEND_ALWAYS_CALLBACK)); \ - int ret = descriptor->btl->btl_send(descriptor->btl, \ - descriptor->endpoint, \ - descriptor->descriptor, \ - mca_btl_base_rdma_tag(context->type)); \ - if (OPAL_SUCCESS == ret || 1 == ret) { \ - opal_list_remove_item(&default_module.queued_initiator_descriptors, \ + int ret = btl->btl_send(btl, \ + descriptor->endpoint, \ + descriptor->descriptor, \ + am_rdma_tag(context->type)); \ + if (OPAL_SUCCESS == ret || 1 == ret) { \ + opal_list_remove_item(&default_component.queued_initiator_descriptors, \ &descriptor->super); \ } \ } - OPAL_THREAD_SCOPED_LOCK(&default_module.mutex, ACTION2); + OPAL_THREAD_SCOPED_LOCK(&default_component.mutex, ACTION2); return 
0; } -static int mca_btl_base_am_atomic_64(int64_t *operand, opal_atomic_int64_t *addr, - mca_btl_base_atomic_op_t op) + +static int am_rdma_atomic_64(int64_t *operand, opal_atomic_int64_t *addr, + mca_btl_base_atomic_op_t op) { int64_t result = 0; @@ -843,8 +874,9 @@ static int mca_btl_base_am_atomic_64(int64_t *operand, opal_atomic_int64_t *addr return OPAL_SUCCESS; } -static int mca_btl_base_am_atomic_32(int32_t *operand, opal_atomic_int32_t *addr, - mca_btl_base_atomic_op_t op) + +static int am_rdma_atomic_32(int32_t *operand, opal_atomic_int32_t *addr, + mca_btl_base_atomic_op_t op) { int32_t result = 0; @@ -878,16 +910,17 @@ static int mca_btl_base_am_atomic_32(int32_t *operand, opal_atomic_int32_t *addr return OPAL_SUCCESS; } -static void mca_btl_base_am_rdma_response(mca_btl_base_module_t *btl, - const mca_btl_base_receive_descriptor_t *desc) + +static void am_rdma_response(mca_btl_base_module_t *btl, + const mca_btl_base_receive_descriptor_t *desc) { - mca_btl_base_rdma_response_hdr_t *resp_hdr = (mca_btl_base_rdma_response_hdr_t *) desc + am_rdma_response_hdr_t *resp_hdr = (am_rdma_response_hdr_t *) desc ->des_segments[0] .seg_addr.pval; assert(desc->des_segments[0].seg_len >= sizeof(*resp_hdr)); - mca_btl_base_rdma_context_t *context = (mca_btl_base_rdma_context_t *) (uintptr_t) + am_rdma_context_t *context = (am_rdma_context_t *) (uintptr_t) resp_hdr->context; BTL_VERBOSE(("received response for RDMA operation. context=%p, size=%" PRIu64, (void*) context, @@ -901,8 +934,8 @@ static void mca_btl_base_am_rdma_response(mca_btl_base_module_t *btl, /* if there is a result copy it out of the incoming buffer. if RDMA is being used * (get/put or put/get) then the header should be the only thing in the incoming * message. 
*/ - mca_btl_base_copy_from_segments(local_address, sizeof(*resp_hdr), desc->des_segments, - desc->des_segment_count); + am_rdma_copy_from_segments(local_address, sizeof(*resp_hdr), desc->des_segments, + desc->des_segment_count); } } @@ -914,8 +947,9 @@ static void mca_btl_base_am_rdma_response(mca_btl_base_module_t *btl, } } -static void mca_btl_base_am_process_rdma(mca_btl_base_module_t *btl, - const mca_btl_base_receive_descriptor_t *desc) + +static void am_rdma_process_rdma(mca_btl_base_module_t *btl, + const mca_btl_base_receive_descriptor_t *desc) { /* not all btls work with these active message atomics. at this time * all of the affected btls already have atomic support so there is @@ -925,11 +959,11 @@ static void mca_btl_base_am_process_rdma(mca_btl_base_module_t *btl, abort(); } - const mca_btl_base_rdma_hdr_t *hdr = (mca_btl_base_rdma_hdr_t *) desc->des_segments[0] + const am_rdma_hdr_t *hdr = (am_rdma_hdr_t *) desc->des_segments[0] .seg_addr.pval; void *target_address = (void *) (intptr_t) hdr->target_address; mca_btl_base_descriptor_t *descriptor = NULL; - mca_btl_base_rdma_operation_t *operation = NULL; + am_rdma_operation_t *operation = NULL; int ret; BTL_VERBOSE(("got active-message \"RDMA\" request. 
hdr->context=0x%" PRIx64 @@ -938,11 +972,11 @@ static void mca_btl_base_am_process_rdma(mca_btl_base_module_t *btl, hdr->context, target_address, desc->des_segments[0].seg_len)); if (MCA_BTL_BASE_AM_PUT == hdr->type) { - ret = mca_btl_base_am_rdma_target_put(btl, desc->endpoint, &descriptor, desc->des_segments, + ret = am_rdma_target_put(btl, desc->endpoint, &descriptor, desc->des_segments, desc->des_segment_count, target_address, hdr, &operation); } else if (MCA_BTL_BASE_AM_GET == hdr->type) { - ret = mca_btl_base_am_rdma_target_get(btl, desc->endpoint, &descriptor, target_address, hdr, + ret = am_rdma_target_get(btl, desc->endpoint, &descriptor, target_address, hdr, &operation); } else { BTL_ERROR(("Unexpected tag when processing active-message RDMA request")); @@ -950,12 +984,13 @@ static void mca_btl_base_am_process_rdma(mca_btl_base_module_t *btl, } if (OPAL_SUCCESS != ret) { - mca_btl_base_rdma_queue_operation(btl, desc->endpoint, descriptor, 0, hdr, operation); + am_rdma_queue_operation(btl, desc->endpoint, descriptor, 0, hdr, operation); } } -static void mca_btl_base_am_process_atomic(mca_btl_base_module_t *btl, - const mca_btl_base_receive_descriptor_t *desc) + +static void am_rdma_process_atomic(mca_btl_base_module_t *btl, + const mca_btl_base_receive_descriptor_t *desc) { /* not all btls work with these active message atomics. 
at this time * all of the affected btls already have atomic support so there is @@ -965,7 +1000,7 @@ static void mca_btl_base_am_process_atomic(mca_btl_base_module_t *btl, abort(); } - const mca_btl_base_rdma_hdr_t *hdr = (mca_btl_base_rdma_hdr_t *) desc->des_segments[0] + const am_rdma_hdr_t *hdr = (am_rdma_hdr_t *) desc->des_segments[0] .seg_addr.pval; uint64_t atomic_response = hdr->data.atomic.operand[0]; @@ -983,14 +1018,14 @@ static void mca_btl_base_am_process_atomic(mca_btl_base_module_t *btl, case MCA_BTL_BASE_AM_ATOMIC: if (4 == hdr->data.atomic.size) { int32_t tmp = (int32_t) atomic_response; - mca_btl_base_am_atomic_32(&tmp, (opal_atomic_int32_t *) hdr->target_address, - hdr->data.atomic.op); + am_rdma_atomic_32(&tmp, (opal_atomic_int32_t *) hdr->target_address, + hdr->data.atomic.op); atomic_response = tmp; } else if (8 == hdr->data.atomic.size) { int64_t tmp = (int64_t) atomic_response; - mca_btl_base_am_atomic_64(&tmp, - (opal_atomic_int64_t *) hdr->target_address, - hdr->data.atomic.op); + am_rdma_atomic_64(&tmp, + (opal_atomic_int64_t *) hdr->target_address, + hdr->data.atomic.op); atomic_response = tmp; } break; @@ -1013,24 +1048,120 @@ static void mca_btl_base_am_process_atomic(mca_btl_base_module_t *btl, } mca_btl_base_descriptor_t *descriptor = NULL; - int ret = mca_btl_base_am_rdma_respond(btl, desc->endpoint, &descriptor, &atomic_response, hdr); + int ret = am_rdma_respond(btl, desc->endpoint, &descriptor, &atomic_response, hdr); if (OPAL_SUCCESS != ret) { - mca_btl_base_rdma_queue_operation(btl, desc->endpoint, descriptor, atomic_response, hdr, + am_rdma_queue_operation(btl, desc->endpoint, descriptor, atomic_response, hdr, NULL); } } -static void mca_btl_sm_sc_emu_init(void) + +static int am_rdma_put(mca_btl_base_am_rdma_module_t *am_module, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t 
*remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) { - mca_btl_base_active_message_trigger[MCA_BTL_BASE_TAG_RDMA].cbfunc - = mca_btl_base_am_process_rdma; - mca_btl_base_active_message_trigger[MCA_BTL_BASE_TAG_ATOMIC].cbfunc - = mca_btl_base_am_process_atomic; - mca_btl_base_active_message_trigger[MCA_BTL_BASE_TAG_RDMA_RESP].cbfunc - = mca_btl_base_am_rdma_response; + return am_rdma_start(am_module, endpoint, MCA_BTL_BASE_AM_PUT, 0, 0, 0, order, flags, size, + local_address, local_handle, remote_address, remote_handle, + cbfunc, cbcontext, cbdata); +} + + +static int am_rdma_get(mca_btl_base_am_rdma_module_t *am_module, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) +{ + return am_rdma_start(am_module, endpoint, MCA_BTL_BASE_AM_GET, 0, 0, 0, order, flags, size, + local_address, local_handle, remote_address, remote_handle, + cbfunc, cbcontext, cbdata); +} + + +static int am_rdma_fop(mca_btl_base_am_rdma_module_t *am_module, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, + mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) +{ + size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 
4 : 8; + + return am_rdma_start(am_module, endpoint, MCA_BTL_BASE_AM_ATOMIC, operand, 0, op, order, + flags, size, local_address, local_handle, remote_address, + remote_handle, cbfunc, cbcontext, cbdata); +} + + +static int am_rdma_cswap(mca_btl_base_am_rdma_module_t *am_module, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_registration_handle_t *remote_handle, + uint64_t compare, uint64_t value, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) +{ + size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8; + + return am_rdma_start(am_module, endpoint, MCA_BTL_BASE_AM_CAS, compare, value, 0, order, + flags, size, local_address, local_handle, remote_address, + remote_handle, cbfunc, cbcontext, cbdata); +} + + +static mca_btl_base_am_rdma_module_t *am_rdma_get_module(struct mca_btl_base_module_t *btl) +{ + assert(NULL != btl->btl_am_data); + return (mca_btl_base_am_rdma_module_t *)btl->btl_am_data; +} + + +static int am_rdma_put_wrapper(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) +{ + mca_btl_base_am_rdma_module_t *am_module = am_rdma_get_module(btl); + + return am_rdma_start(am_module, endpoint, MCA_BTL_BASE_AM_PUT, 0, 0, 0, order, flags, size, + local_address, local_handle, remote_address, remote_handle, + cbfunc, cbcontext, cbdata); +} + + +static int am_rdma_get_wrapper(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, void *local_address, + uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct 
mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) +{ + mca_btl_base_am_rdma_module_t *am_module = am_rdma_get_module(btl); + + return am_rdma_start(am_module, endpoint, MCA_BTL_BASE_AM_GET, 0, 0, 0, order, flags, size, + local_address, local_handle, remote_address, remote_handle, + cbfunc, cbcontext, cbdata); } -static int mca_btl_base_am_fop(struct mca_btl_base_module_t *btl, + +static int am_rdma_fop_wrapper(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, @@ -1039,96 +1170,204 @@ static int mca_btl_base_am_fop(struct mca_btl_base_module_t *btl, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { + mca_btl_base_am_rdma_module_t *am_module = am_rdma_get_module(btl); size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8; - return mca_btl_base_rdma_start(btl, endpoint, MCA_BTL_BASE_AM_ATOMIC, operand, 0, op, order, - flags, size, local_address, local_handle, remote_address, - remote_handle, cbfunc, cbcontext, cbdata); + + return am_rdma_start(am_module, endpoint, MCA_BTL_BASE_AM_ATOMIC, operand, 0, op, order, + flags, size, local_address, local_handle, remote_address, + remote_handle, cbfunc, cbcontext, cbdata); } -static int mca_btl_base_am_cswap( - struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags, - int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) + +static int am_rdma_cswap_wrapper(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + mca_btl_base_registration_handle_t 
*local_handle, + mca_btl_base_registration_handle_t *remote_handle, + uint64_t compare, uint64_t value, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, + void *cbdata) { + mca_btl_base_am_rdma_module_t *am_module = am_rdma_get_module(btl); size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8; - return mca_btl_base_rdma_start(btl, endpoint, MCA_BTL_BASE_AM_CAS, compare, value, 0, order, - flags, size, local_address, local_handle, remote_address, - remote_handle, cbfunc, cbcontext, cbdata); + + return am_rdma_start(am_module, endpoint, MCA_BTL_BASE_AM_CAS, compare, value, 0, order, + flags, size, local_address, local_handle, remote_address, + remote_handle, cbfunc, cbcontext, cbdata); } -static int mca_btl_base_am_rdma_get(struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, - size_t size, int flags, int order, - mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, - void *cbdata) + +static void am_rdma_register_callbacks(void) { - return mca_btl_base_rdma_start(btl, endpoint, MCA_BTL_BASE_AM_GET, 0, 0, 0, order, flags, size, - local_address, local_handle, remote_address, remote_handle, - cbfunc, cbcontext, cbdata); + mca_btl_base_active_message_trigger[MCA_BTL_BASE_TAG_RDMA].cbfunc + = am_rdma_process_rdma; + mca_btl_base_active_message_trigger[MCA_BTL_BASE_TAG_ATOMIC].cbfunc + = am_rdma_process_atomic; + mca_btl_base_active_message_trigger[MCA_BTL_BASE_TAG_RDMA_RESP].cbfunc + = am_rdma_response; } -static int mca_btl_base_am_rdma_put(struct mca_btl_base_module_t *btl, - struct mca_btl_base_endpoint_t *endpoint, void *local_address, - uint64_t remote_address, - struct mca_btl_base_registration_handle_t *local_handle, - struct mca_btl_base_registration_handle_t *remote_handle, - size_t size, int flags, int order, - 
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, - void *cbdata) + +static int am_rdma_internal_init(mca_btl_base_module_t *btl, + uint32_t flags_requested, + bool no_memory_registration, + mca_btl_base_am_rdma_module_t **new_module) { - return mca_btl_base_rdma_start(btl, endpoint, MCA_BTL_BASE_AM_PUT, 0, 0, 0, order, flags, size, - local_address, local_handle, remote_address, remote_handle, - cbfunc, cbcontext, cbdata); + static bool initialized = false; + static opal_mutex_t initialized_mutex = OPAL_MUTEX_STATIC_INIT; + mca_btl_base_am_rdma_module_t *module; + size_t max_operation_size; + size_t operation_alignment; + + opal_mutex_lock(&initialized_mutex); + if (!initialized) { + initialized = true; + OBJ_CONSTRUCT(&default_component, am_rdma_component_t); + opal_progress_register(am_rdma_progress); + am_rdma_register_callbacks(); + } + opal_mutex_unlock(&initialized_mutex); + + module = OBJ_NEW(mca_btl_base_am_rdma_module_t); + if (NULL == module) { + return OPAL_ERR_TEMP_OUT_OF_RESOURCE; + } + + module->btl = btl; + module->use_rdma_put = !!(btl->btl_flags & MCA_BTL_FLAGS_PUT); + module->use_rdma_get = !!(btl->btl_flags & MCA_BTL_FLAGS_GET); + + /* if the requester asked for remote completion and the btl does + * not provide remote completion, we can not use put. + */ + if (!(btl->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION)) { + module->use_rdma_put = false; + } + + /* if the requester does not want to do memory registration and + * the BTL requires memory registration, disable the use of RDMA. + */ + if (no_memory_registration && NULL != btl->btl_register_mem) { + module->use_rdma_put = false; + module->use_rdma_get = false; + } + + if (module->use_rdma_get) { + /* implement operations over get. */ + max_operation_size = btl->btl_get_limit; + operation_alignment = btl->btl_get_alignment; + BTL_VERBOSE(("am_rdma_init: btl %p using get. 
operation size %zu, alignment %zu", + (void *)btl, max_operation_size, operation_alignment)); + } else if (module->use_rdma_put) { + /* implement operations over put. */ + max_operation_size = btl->btl_put_limit; + operation_alignment = btl->btl_put_alignment; + BTL_VERBOSE(("am_rdma_init: btl %p using put. operation size %zu, alignment %zu", + (void *)btl, max_operation_size, operation_alignment)); + } else { + /* implement operations over send. */ + max_operation_size = btl->btl_max_send_size; + operation_alignment = 1; + BTL_VERBOSE(("am_rdma_init: btl %p using send. operation size %zu, alignment %zu", + (void *)btl, max_operation_size, operation_alignment)); + } + + module->am_btl_put_limit = max_operation_size - sizeof(am_rdma_hdr_t); + module->am_btl_put_alignment = operation_alignment; + module->am_btl_get_limit = max_operation_size - sizeof(am_rdma_response_hdr_t); + module->am_btl_get_alignment = operation_alignment; + + module->am_btl_put = am_rdma_put; + module->am_btl_get = am_rdma_get; + module->am_btl_atomic_fop = am_rdma_fop; + module->am_btl_atomic_cswap = am_rdma_cswap; + + *new_module = module; + + return OPAL_SUCCESS; +} + + +static int am_rdma_internal_fini(mca_btl_base_am_rdma_module_t *am_rdma_module) +{ + OBJ_RELEASE(am_rdma_module); + + return OPAL_SUCCESS; } + int mca_btl_base_am_rdma_init(mca_btl_base_module_t *btl) { - static bool progress_registered = false; + mca_btl_base_am_rdma_module_t *am_module; + int ret; + + BTL_VERBOSE(("am_rdma_init: called for btl %s (%p)", + btl->btl_component->btl_version.mca_component_name, (void *)btl)); if ((btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) == (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) { - /* nothing to do */ + BTL_VERBOSE(("am_rdma_init: btl %p already supports rdma", (void *)btl)); return OPAL_SUCCESS; } - size_t max_operation_size = btl->btl_max_send_size; - size_t operation_alignment = 1; - if (mca_btl_base_rdma_use_rdma_get(btl)) { - /* implement put over get. 
*/ - max_operation_size = btl->btl_get_limit; - operation_alignment = btl->btl_get_alignment; - } else if (mca_btl_base_rdma_use_rdma_put(btl)) { - /* implement get over put. */ - max_operation_size = btl->btl_put_limit; - operation_alignment = btl->btl_put_alignment; + /* + * note that it is not safe to access any am rdma functionality + * (even default_component global data) until internal_init returns + * successfully. + */ + ret = am_rdma_internal_init(btl, 0, false, &am_module); + if (OPAL_SUCCESS != ret) { + BTL_VERBOSE(("am_rdma_init: btl %p internal_init failure %d", + (void *)btl, ret)); + return ret; } + /* + * we can't lock any field on the BTL structure (because it's not + * ours to poke at), so take the global am rdma lock. I suppose we + * could do a cswap of the btl_am_data pointer to the same result, + * but that seems too cute for something that should be a relatively + * rare event. + */ + opal_mutex_lock(&default_component.mutex); + if (NULL != btl->btl_am_data) { + BTL_VERBOSE(("am_rdma_init: btl %p already initialized", (void *)btl)); + am_rdma_internal_fini(am_module); + opal_mutex_unlock(&default_component.mutex); + return OPAL_SUCCESS; + } + opal_mutex_unlock(&default_component.mutex); + + btl->btl_am_data = am_module; + + /* TODO: Ideally, we would swap the BTL's flush for our own + * implementation which completed all outstanding transactions on + * that BTL and then called the underlying flush(). Given the + * work and the lack of use case today, we instead just remove + * flush support from the underlying BTL. */ + btl->btl_flush = NULL; + if (!(btl->btl_flags & MCA_BTL_FLAGS_PUT)) { btl->btl_flags |= MCA_BTL_FLAGS_PUT_AM; - btl->btl_put_limit = max_operation_size - sizeof(mca_btl_base_rdma_hdr_t); - btl->btl_put_alignment = operation_alignment; - btl->btl_put = mca_btl_base_am_rdma_put; - BTL_VERBOSE(("Enabling AM-based RDMA put for BTL %p. 
max put = %zu", (void*) btl, btl->btl_put_limit)); + btl->btl_put_limit = am_module->am_btl_put_limit; + btl->btl_put_alignment = am_module->am_btl_put_alignment; + btl->btl_put = am_rdma_put_wrapper; + BTL_VERBOSE(("am_rdma_init: Enabling AM-based RDMA put for BTL %p. max put = %zu", (void*)btl, btl->btl_put_limit)); } if (!(btl->btl_flags & MCA_BTL_FLAGS_GET)) { btl->btl_flags |= MCA_BTL_FLAGS_GET_AM; - btl->btl_get_limit = max_operation_size - sizeof(mca_btl_base_rdma_response_hdr_t); - btl->btl_get_alignment = operation_alignment; - btl->btl_get = mca_btl_base_am_rdma_get; - BTL_VERBOSE(("Enabling AM-based RDMA get for BTL %p. max get = %zu", (void*) btl, btl->btl_get_limit)); + btl->btl_get_limit = am_module->am_btl_get_limit; + btl->btl_get_alignment = am_module->am_btl_get_alignment; + btl->btl_get = am_rdma_get_wrapper; + BTL_VERBOSE(("Enabling AM-based RDMA get for BTL %p. max get = %zu", (void*)btl, btl->btl_get_limit)); } if (!(btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_FOPS)) { - BTL_VERBOSE(("Enabling AM-based FOPs get for BTL %p", (void*) btl)); btl->btl_flags |= MCA_BTL_FLAGS_ATOMIC_AM_FOP; - btl->btl_atomic_fop = mca_btl_base_am_fop; - btl->btl_atomic_cswap = mca_btl_base_am_cswap; + btl->btl_atomic_fop = am_rdma_fop_wrapper; + btl->btl_atomic_cswap = am_rdma_cswap_wrapper; /* emulated RDMA atomics can support the full range of atomics. for * now only a handful are supported. 
*/ @@ -1137,37 +1376,35 @@ int mca_btl_base_am_rdma_init(mca_btl_base_module_t *btl) | MCA_BTL_ATOMIC_SUPPORTS_AND | MCA_BTL_ATOMIC_SUPPORTS_OR | MCA_BTL_ATOMIC_SUPPORTS_XOR | MCA_BTL_ATOMIC_SUPPORTS_SWAP | MCA_BTL_ATOMIC_SUPPORTS_MIN | MCA_BTL_ATOMIC_SUPPORTS_MAX; + BTL_VERBOSE(("Enabling AM-based FOPs get for BTL %p", (void*)btl)); } - if (!progress_registered) { - progress_registered = true; - opal_progress_register(mca_btl_base_am_rdma_progress); - mca_btl_sm_sc_emu_init(); - OBJ_CONSTRUCT(&default_module, mca_btl_base_am_rdma_module_t); - } + return OPAL_SUCCESS; +} - /* This section check whether we can claim support of remote completion. - * - * In terms of remote completion, we are mainly interested in put and atomic ops, - * because get, atomics fops and atomic cswap support remote completion by their nature. - * - * For active message put (AM put), the target side will send a response, and the initiator - * side will wait for the response to complete the put operation. Thus if AM put is based on send, - * it support remote completion. (If AM put is based on get, it does not support remote - * completion because the target side does not wait for get's completion to send response). - * - * active message RDMA/atomics does not implement atomic ops. User was suppose to - * use atomic fops (unless the btl support atomic ops natively). - * - * In all, the conditions for AM rdma to claim support of remote completion are: - * 1. AM put is enabled (which means the btl does not support put) - * 2. AM put does not use get (so it must use send) - * 3. btl does not have native atomics ops support. 
- */ - if ((btl->btl_flags & MCA_BTL_FLAGS_PUT_AM) && !mca_btl_base_rdma_use_rdma_get(btl) && - !(btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - btl->btl_flags |= MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION; + +int opal_btl_base_am_rdma_create(mca_btl_base_module_t *btl, + uint32_t flags_requested, + bool no_memory_registration, + mca_btl_base_am_rdma_module_t **am_module) +{ + int ret; + + BTL_VERBOSE(("am_rdma_create: called for btl %s (%p)", + btl->btl_component->btl_version.mca_component_name, (void *)btl)); + + ret = am_rdma_internal_init(btl, flags_requested, no_memory_registration, am_module); + if (OPAL_SUCCESS != ret) { + BTL_VERBOSE(("am_rdma_create: btl %p internal_init failure %d", + (void *)btl, ret)); + return ret; } return OPAL_SUCCESS; } + + +int opal_btl_base_am_rdma_destroy(mca_btl_base_am_rdma_module_t *am_module) +{ + return am_rdma_internal_fini(am_module); +} diff --git a/opal/mca/btl/base/btl_base_am_rdma.h b/opal/mca/btl/base/btl_base_am_rdma.h index 9842f5a8a49..d90983429fa 100644 --- a/opal/mca/btl/base/btl_base_am_rdma.h +++ b/opal/mca/btl/base/btl_base_am_rdma.h @@ -3,6 +3,8 @@ * Copyright (c) 2011-2018 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2020 Google, LLC. All rights reserved. + * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All Rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -13,10 +15,46 @@ /** * This file provides support for active-message (send/recv) based RDMA. * It can be used with any btl that provides a minimum of send support but - * can also be used with partial-RDMA BTLs (put only, get only, etc). It - * will provide support for any RDMA or atomic operation not currently - * supported by the supplied BTL. For more info see the description of - * mca_btl_base_am_rdma_init. + * can also be used with partial-RDMA BTLs (put only, get only, etc) + * to provide a complete RDMA interface. 
+ * + * There are two modes of using this interface, depending on your + * requirements: + * + * First, this interface can be used to provide a complete + * put/get/atomic interface for BTLs which do not natively provide + * such an interface. In this mode, active message rdma functions are + * only used if the underlying implementation does not already provide + * the required functionality. For example, if a BTL natively + * supports put but not get, the interface would provide an emulated + * get. The registration, completion and atomicity semantics of the + * BTL remain the native interface's capabilities. That is, if the + * native interface does not provide remote completion or atomics that + * are atomic with processor atomics, neither will the interface after + * initializing the am rdma interface for that BTL. This mode will + * likely give better performance than the second mode for transfers + * that fit within the BTL's native semantics. In this mode, the BTL + * interface is updated so that the btl_{put, get, atomic_fop, + * atomic_cswap} function pointers are usable. However, the btl + * capability flags will not be updated to indicate native support of + * the emulated functionality (for example, if btl_get() is emulated, + * MCA_BTL_FLAGS_GET will not be set). Instead, the emulated flags + * will be set (MCA_BTL_FLAGS_PUT_AM, MCA_BTL_FLAGS_GET_AM, + * MCA_BTL_FLAGS_ATOMIC_AM_FOP, etc.). + * + * Second, this interface can be used to provide different + * semantics than a BTL natively provides. This mode is not + * transparent to the caller (unlike the first mode). Instead, the + * caller must manage calling the active message put/get/atomic + * interface directly (rather than through the BTL function pointers). + * For interfaces which require strict remote completion or require + * implicit memory registration, this can greatly simplify the code, + * in return for marginally more management complexity and lower + * performance. 
+ * + * While the calling convention and initialization are different, the + * communication routines used by the active message rdma + * implementation are identical in both modes of operation. */ #include "opal_config.h" @@ -28,14 +66,86 @@ /** * @brief initialize active-message RDMA/atomic support * - * @inout btl btl module to augment + * @param btl[in,out] btl module to augment + * + * @retval OPAL_SUCCESS btl successfully updated, btl already + * updated, or btl has all available + * functionality natively. + * @retval OPAL_ERR_TEMP_OUT_OF_RESOURCE Allocating BTL-level data + * structure failed. * * This function adds functionality to the btl for any missing RDMA/atomic * operation. Atomic operations are entirely emulated using send/recv and * work best with a btl that also has async-progress enabled. Put/get * support will use either send/recv or get (for put)/put (for get) (if * available). + * + * Note that calling this function will change the BTL interface. + * Care must be taken to not call this function outside of early + * initialization routines. 
*/ int mca_btl_base_am_rdma_init(mca_btl_base_module_t *btl); +struct mca_btl_base_am_rdma_module_t; + +typedef int (*mca_btl_base_am_rdma_module_put_fn_t)( + struct mca_btl_base_am_rdma_module_t *am_btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +typedef int (*mca_btl_base_am_rdma_module_get_fn_t)( + struct mca_btl_base_am_rdma_module_t *am_btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +typedef int (*mca_btl_base_am_rdma_module_atomic_fop64_fn_t)( + struct mca_btl_base_am_rdma_module_t *am_btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op, + uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata); + +typedef int (*mca_btl_base_am_rdma_module_atomic_cswap64_fn_t)( + struct mca_btl_base_am_rdma_module_t *am_btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, + int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata); + +struct mca_btl_base_am_rdma_module_t { + opal_object_t super; + mca_btl_base_module_t *btl; + bool use_rdma_put; + bool 
use_rdma_get; + + size_t am_btl_put_limit; + size_t am_btl_put_alignment; + size_t am_btl_get_limit; + size_t am_btl_get_alignment; + + mca_btl_base_am_rdma_module_put_fn_t am_btl_put; + mca_btl_base_am_rdma_module_get_fn_t am_btl_get; + mca_btl_base_am_rdma_module_atomic_fop64_fn_t am_btl_atomic_fop; + mca_btl_base_am_rdma_module_atomic_cswap64_fn_t am_btl_atomic_cswap; +}; +typedef struct mca_btl_base_am_rdma_module_t mca_btl_base_am_rdma_module_t; + +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_btl_base_am_rdma_module_t); + + +/** + * @brief create active-message RDMA/atomics functions + */ +int opal_btl_base_am_rdma_create(mca_btl_base_module_t *btl, + uint32_t flags_requested, + bool no_memory_registration, + mca_btl_base_am_rdma_module_t **am_module); + +int opal_btl_base_am_rdma_destroy(mca_btl_base_am_rdma_module_t *am_module); + #endif /* OPAL_MCA_BTL_BASE_AM_RDMA_H */ diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 80fbc2c2afd..28c71b07530 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -1239,7 +1239,14 @@ struct mca_btl_base_module_t { mca_btl_base_module_flush_fn_t btl_flush; /**< flush all previous operations on an endpoint */ - unsigned char padding[256]; /**< padding to future-proof the btl module */ + + union { + struct { + void *btl_am_data; + }; + unsigned char padding[256]; /**< padding to future-proof the + btl module */ + }; }; typedef struct mca_btl_base_module_t mca_btl_base_module_t;