diff --git a/ompi/mca/osc/rdma/Makefile.am b/ompi/mca/osc/rdma/Makefile.am index e52d0087743..39df7439580 100644 --- a/ompi/mca/osc/rdma/Makefile.am +++ b/ompi/mca/osc/rdma/Makefile.am @@ -41,7 +41,9 @@ rdma_sources = \ osc_rdma_dynamic.c \ osc_rdma_sync.h \ osc_rdma_sync.c \ - osc_rdma_types.h + osc_rdma_types.h \ + osc_rdma_btl_wrapper.c \ + osc_rdma_btl_wrapper.h # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index f194eb2a197..e4c2b177533 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -41,14 +41,13 @@ #include "ompi/request/request.h" #include "ompi/mca/osc/osc.h" #include "ompi/mca/osc/base/base.h" -#include "opal/mca/btl/btl.h" #include "ompi/memchecker.h" #include "ompi/op/op.h" #include "opal/align.h" +#include "osc_rdma_btl_wrapper.h" #include "osc_rdma_types.h" #include "osc_rdma_sync.h" - #include "osc_rdma_peer.h" #include "opal_stdint.h" @@ -266,7 +265,7 @@ struct ompi_osc_rdma_module_t { * non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this * could be used to support multiple RDMA-capable BTLs but the memory registration * paths will need to be updated to pack/unpack multiple registration handles. */ - struct mca_btl_base_module_t **selected_btls; + struct ompi_osc_rdma_btl_wrapper_t **selected_btls; uint8_t selected_btls_size; uint8_t btls_in_use; @@ -387,7 +386,7 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. 
range: %p - %p (%lu bytes)", ptr, (void*)((char *) ptr + size), size); - *handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags); + *handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0]->btl_module, endpoint, ptr, size, flags); if (OPAL_UNLIKELY(NULL == *handle)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, " "size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line); @@ -405,7 +404,7 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file) { if (handle) { - module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle); + module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0]->btl_module, handle); } } @@ -602,7 +601,7 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync) opal_progress (); } while (ompi_osc_rdma_sync_get_count (sync)); #else - mca_btl_base_module_t *btl_module = sync->module->selected_btls[0]; + mca_btl_base_module_t *btl_module = sync->module->selected_btls[0]->btl_module; do { if (!ompi_osc_rdma_use_btl_flush (sync->module)) { @@ -637,18 +636,19 @@ static inline bool ompi_osc_rdma_oor (int rc) } __opal_attribute_always_inline__ -static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) { +static inline ompi_osc_rdma_btl_wrapper_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) { return module->selected_btls[btl_index]; } __opal_attribute_always_inline__ -static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) { +static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, 
struct mca_btl_base_module_t *btl, uint8_t btl_index, + enum ompi_osc_rdma_btl_type_t btl_type) { if(btl_index == module->selected_btls_size) { module->selected_btls_size *= 2; module->selected_btls = realloc(module->selected_btls, module->selected_btls_size * sizeof(struct mca_btl_base_module_t *)); assert(NULL != module->selected_btls); } - module->selected_btls[btl_index] = btl; + module->selected_btls[btl_index] = ompi_osc_rdma_btl_wrapper_alloc(btl, btl_type); } #endif /* OMPI_OSC_RDMA_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 15f0a80714e..67dea83929a 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -156,7 +156,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; @@ -234,12 +234,12 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + if (!selected_btl->btl_atomic_ops) { /* btl put atomics not supported or disabled. 
fall back on fetch-and-op */ return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, op, req); @@ -661,7 +661,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); int32_t atomic_flags = btl->btl_atomic_flags; const size_t size = datatype->super.size; int64_t compare, source; @@ -715,7 +715,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); unsigned long len = datatype->super.size; mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -757,7 +757,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put..."); do { - ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address, + ret = btl->btl_put (btl->btl_module, peer->data_endpoint, ptr, target_address, local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL); if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_wrapper.c b/ompi/mca/osc/rdma/osc_rdma_btl_wrapper.c new file mode 100644 index 00000000000..5f003c3ebe5 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_btl_wrapper.c @@ -0,0 +1,117 @@ +/* -*- Mode: C; c-basic-offset:4 
; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021-2021 Amazon.com, INC. or its affiliates. All rights reserved + */ + +#include "ompi_config.h" +#include "osc_rdma_btl_wrapper.h" +#include "opal/mca/btl/base/btl_base_am_rdma.h" + +/** + * @brief create a btl wrapper for a btl_module + * + * For a primary btl, this function simply copies the data from btl_module + * to btl_wrapper. + * + * For an alternate btl, this function creates a btl wrapper that will + * always use active message RDMA/atomics on the selected btl module, + * even when the btl module supports RDMA/atomics natively. + * + * The reason osc/rdma does not use an alternate btl's native atomics is that, + * when multiple alternate btls are being used, atomicity across the btls' own + * atomics is not guaranteed. Therefore, osc/rdma must use active message atomics. + * + * The reason osc/rdma does not use an alternate btl's native RDMA put and get is that + * using active message RDMA significantly simplifies osc/rdma's completion. The simplification comes in two + * areas: + * + * First, active message RDMA supports remote completion. Remote completion + * is required by several key components of osc/rdma: + * the usage of cpu atomics to update peer's state, + * the usage of local leader to update peer's state, + * osc/rdma's fence implementation. + * + * If osc/rdma does not use active message RDMA, it will + * have to keep track of each selected btl's support of remote completion. + * If any selected btl does not support remote completion, it will have to + * disable the usage of cpu atomics, disable the usage of local leader, + * and implement a different fence mechanism. + * + * Second, active message RDMA does not use memory registration explicitly, + * therefore using it eliminates the need to store and exchange multiple + * memory registrations. 
+ */ +ompi_osc_rdma_btl_wrapper_t *ompi_osc_rdma_btl_wrapper_alloc(mca_btl_base_module_t *btl_module, enum ompi_osc_rdma_btl_type_t btl_type) +{ + mca_btl_base_module_t btl_module_copy; + ompi_osc_rdma_btl_wrapper_t *btl_wrapper; + + if (NULL == btl_module) { + return NULL; + } + + memcpy(&btl_module_copy, btl_module, sizeof(btl_module_copy)); + if (OMPI_OSC_RDMA_BTL_ALTERNATE == btl_type) { + /* For an alternate btl, osc/rdma must use active message RDMA/atomics on it, + * and not use the btl's native support of RDMA/atomics. + * + * mca_btl_base_am_rdma_init() setup a btl to use AM atomics/RDMA. However, + * if a btl has native RDMA/atomics support, this function will not replace + * them with active message RDMA/atomic. Therefore, to ensure active message + * RDMA/atomic will replace native RDMA/atomics, the input btl_module to + * mca_btl_base_am_rdma_init() must has native RDMA/atomics disabled. + * + * We cannot disable the btl_module's RDMA/atomic though, because btl_module's + * native RDMA/atomic may be used by other modules like pml. + * + * Hence, we created btl_module_copy here, and disabled its native RDMA/atomic + * and call mca_btl_base_am_rdma_init() on it. The resulted btl_module_copy + * has all the correct property test for AM RDMA/atomics. These properties + * are then copied to btl_wrapper. + * + * osc/rdma uses the btl_wrapper to ensure active message RDMA/atomics are + * used for the btl_module. 
+ */ + btl_module_copy.btl_flags &= ~(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS); + mca_btl_base_am_rdma_init(&btl_module_copy); + /* AM rdma/atomics does not need explicit memory registration + */ + btl_module_copy.btl_register_mem = NULL; + btl_module_copy.btl_deregister_mem = NULL; + btl_module_copy.btl_registration_handle_size = 0; + } + + btl_wrapper = (ompi_osc_rdma_btl_wrapper_t *)calloc(1, sizeof(ompi_osc_rdma_btl_wrapper_t)); + if (NULL == btl_wrapper) { + return NULL; + } + + btl_wrapper->btl_module = btl_module; + + btl_wrapper->btl_atomic_ops = (btl_module_copy.btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS); + btl_wrapper->btl_atomic_support_glob = (btl_module_copy.btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + btl_wrapper->btl_rdma_remote_completion = (btl_module_copy.btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION); + + btl_wrapper->btl_put = btl_module_copy.btl_put; + btl_wrapper->btl_put_limit = btl_module_copy.btl_put_limit; + btl_wrapper->btl_put_alignment = btl_module_copy.btl_put_alignment; + btl_wrapper->btl_put_local_registration_threshold = btl_module_copy.btl_put_local_registration_threshold; + + btl_wrapper->btl_get = btl_module_copy.btl_get; + btl_wrapper->btl_get_limit = btl_module_copy.btl_get_limit; + btl_wrapper->btl_get_alignment = btl_module_copy.btl_get_alignment; + btl_wrapper->btl_get_local_registration_threshold = btl_module_copy.btl_get_local_registration_threshold; + + btl_wrapper->btl_atomic_op = btl_module_copy.btl_atomic_op; + btl_wrapper->btl_atomic_fop = btl_module_copy.btl_atomic_fop; + btl_wrapper->btl_atomic_cswap = btl_module_copy.btl_atomic_cswap; + btl_wrapper->btl_atomic_flags = btl_module_copy.btl_atomic_flags; + + btl_wrapper->btl_register_mem = btl_module_copy.btl_register_mem; + btl_wrapper->btl_deregister_mem = btl_module_copy.btl_deregister_mem; + btl_wrapper->btl_registration_handle_size = btl_module_copy.btl_registration_handle_size; + + 
btl_wrapper->btl_flush = btl_module_copy.btl_flush; + return btl_wrapper; +} + diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_wrapper.h b/ompi/mca/osc/rdma/osc_rdma_btl_wrapper.h new file mode 100644 index 00000000000..3f38ef50161 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_btl_wrapper.h @@ -0,0 +1,53 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2021-2021 Amazon.com, INC. or its affiliates. All rights reserved + */ + +#ifndef OMPI_OSC_RDMA_BTL_WRAPPER_T +#define OMPI_OSC_RDMA_BTL_WRAPPER_T + +#include "opal/mca/btl/btl.h" + +enum ompi_osc_rdma_btl_type_t { + OMPI_OSC_RDMA_BTL_PRIMARY, + OMPI_OSC_RDMA_BTL_ALTERNATE, +}; + +/** + * @brief ompi_osc_rdma_btl_warpper_t is a subset of mca_btl_base_module_t + * that are used by osc/rdma component + */ +struct ompi_osc_rdma_btl_wrapper_t { + mca_btl_base_module_t *btl_module; + + bool btl_atomic_ops; + bool btl_rdma_remote_completion; + bool btl_atomic_support_glob; + + mca_btl_base_module_put_fn_t btl_put; + size_t btl_put_limit; + size_t btl_put_alignment; + size_t btl_put_local_registration_threshold; + + mca_btl_base_module_get_fn_t btl_get; + size_t btl_get_limit; + size_t btl_get_alignment; + size_t btl_get_local_registration_threshold; + + mca_btl_base_module_atomic_op64_fn_t btl_atomic_op; + mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop; + mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap; + uint32_t btl_atomic_flags; + + mca_btl_base_module_register_mem_fn_t btl_register_mem; + mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem; + size_t btl_registration_handle_size; + + mca_btl_base_module_flush_fn_t btl_flush; +}; + +typedef struct ompi_osc_rdma_btl_wrapper_t ompi_osc_rdma_btl_wrapper_t; + +ompi_osc_rdma_btl_wrapper_t *ompi_osc_rdma_btl_wrapper_alloc(mca_btl_base_module_t *btl_module, enum ompi_osc_rdma_btl_type_t btl_type); + +#endif diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 449bbea0641..49aad36bf35 
100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -58,7 +58,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index); + ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, btl_index); const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -96,7 +96,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde assert (!(source_address & ALIGNMENT_MASK(btl->btl_get_alignment))); do { - ret = btl->btl_get (btl, endpoint, ptr, aligned_addr, + ret = btl->btl_get (btl->btl_module, endpoint, ptr, aligned_addr, local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_get_data_complete, (void *) &read_complete, NULL); if (!ompi_osc_rdma_oor (ret)) { @@ -444,7 +444,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee mca_btl_base_registration_handle_t *local_handle, size_t size, mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync " @@ -454,7 +454,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee ompi_osc_rdma_sync_rdma_inc (sync); do { - ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address, + ret = btl->btl_put (btl->btl_module, peer->data_endpoint, ptr, target_address, 
local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER, cb, context, cbdata); if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { @@ -481,7 +481,7 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *local_handle = NULL; mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -600,7 +600,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -703,7 +703,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } do { - ret = btl->btl_get (btl, peer->data_endpoint, ptr, + ret = btl->btl_get (btl->btl_module, peer->data_endpoint, ptr, aligned_source_base, local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete, request, frag); @@ -736,7 +736,7 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *target_handle; uint64_t 
target_address; int ret; @@ -779,7 +779,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *source_handle; uint64_t source_address; ptrdiff_t source_span, source_lb; diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 12937ffc644..44bea07cd69 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -85,7 +85,6 @@ static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, co static char *ompi_osc_rdma_btl_names; static char *ompi_osc_rdma_mtl_names; -static char *ompi_osc_rdma_btl_alternate_names; static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = { {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"}, @@ -266,14 +265,6 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names); free(description_str); - ompi_osc_rdma_btl_alternate_names = "sm,tcp"; - opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying " - "connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names); - free(description_str); - ompi_osc_rdma_mtl_names = "psm2"; opal_asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma " "osc component (default: %s)", ompi_osc_rdma_mtl_names); @@ -610,7 +601,7 @@ 
static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (!module->single_node) { for (int i = 0 ; i < module->btls_in_use ; ++i) { - module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_atomic_support_glob); } } @@ -919,56 +910,38 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) * @return OMPI_SUCCESS if BTLs can be found * @return OMPI_ERR_UNREACH if no BTLs can be found that match * - * In this case an "alternate" BTL is a BTL that does not provide true RDMA but - * can use active messages using the BTL base AM RDMA/atomics. Since more than - * one BTL may be needed for this support the OSC component will disable the - * use of registration-based RDMA (these BTLs will not be used) and will use - * any remaining BTL. By default the BTLs used will be tcp and sm but any single - * (or pair) of BTLs may be used. + * This function is used when ompi_osc_rdma_query_btls() failed to find + * a single btl that can communicate with all peers and supports remote completion. + * In this case, osc/rdma will use multiple btls for communications. One process + * can use different btls to communicate with different peers. Such btls are called + * "alternate btls". + * This function calls ompi_osc_rdma_selected_btl_insert() to insert the alternate + * btls into selected_btls. 
*/ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { mca_btl_base_selected_module_t *item; - char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); int btls_found = 0; - btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); - if (NULL == btls_to_use) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); - return OMPI_ERR_UNREACH; - } - if (module) { module->btls_in_use = 0; } /* rdma and atomics are only supported with BTLs at the moment */ - for (int i = 0 ; btls_to_use[i] ; ++i) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); - OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { - if (NULL != item->btl_module->btl_register_mem) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping RDMA btl when searching for alternate BTL"); - continue; - } + OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", + item->btl_module->btl_component->btl_version.mca_component_name); - if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping btl %s", - item->btl_module->btl_component->btl_version.mca_component_name); - continue; - } - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", btls_to_use[i]); - - ++btls_found; - if (module) { - mca_btl_base_am_rdma_init(item->btl_module); - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); - } - + ++btls_found; + if (module) { + ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++, OMPI_OSC_RDMA_BTL_ALTERNATE); } } - opal_argv_free (btls_to_use); + /* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not 
require explicit memory registration */ + if (NULL != module) { + module->use_memory_registration = false; + } return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; } @@ -989,7 +962,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo btls_to_use = opal_argv_split (ompi_osc_rdma_btl_names, ','); if (module) { - ompi_osc_rdma_selected_btl_insert(module, NULL, 0); + ompi_osc_rdma_selected_btl_insert(module, NULL, 0, OMPI_OSC_RDMA_BTL_PRIMARY); module->btls_in_use = 0; module->use_memory_registration = false; } @@ -1003,7 +976,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo } if ((item->btl_module->btl_flags & (MCA_BTL_FLAGS_RDMA)) == MCA_BTL_FLAGS_RDMA && - (item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS))) { + (item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION))) { if (!selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) { selected_btl = item->btl_module; } @@ -1016,7 +989,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo if (NULL != selected_btl) { if (module) { - ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); + ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0, OMPI_OSC_RDMA_BTL_PRIMARY); module->btls_in_use = 1; module->use_memory_registration = selected_btl->btl_register_mem != NULL; } @@ -1072,10 +1045,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo btl_counts = tmp; for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) { - /* for this implementation we need only compare-and-swap and fetch-and-add */ + /* for this implementation we need only compare-and-swap and fetch-and-add + * + * If a btl does not support remote completion, it cannot be used as the primary btl. 
+ * It can still be selected as an alternate btl */ if ((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) == (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS) && (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags & - MCA_BTL_ATOMIC_SUPPORTS_ADD)) { + MCA_BTL_ATOMIC_SUPPORTS_ADD) && + (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION)) { for (int j = 0 ; j < max_btls ; ++j) { if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) { ++btl_counts[j]; @@ -1133,7 +1110,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo } if (module) { - ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); + ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0, OMPI_OSC_RDMA_BTL_PRIMARY); module->btls_in_use = 1; module->use_memory_registration = selected_btl->btl_register_mem != NULL; } diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 36a30a1cc0b..a2d514a53bc 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -44,7 +44,7 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + ompi_osc_rdma_btl_wrapper_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret = OPAL_ERROR; pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); @@ -72,7 +72,7 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t } if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer, + ret = selected_btl->btl_atomic_fop (selected_btl->btl_module, endpoint, pending_op->op_buffer, (intptr_t) address, pending_op->op_frag->handle, address_handle, op, operand, 
flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL); @@ -88,7 +88,7 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t if (OPAL_LIKELY(1 == ret)) { *result = ((int64_t *) pending_op->op_buffer)[0]; ret = OMPI_SUCCESS; - ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer, + ompi_osc_rdma_atomic_complete (selected_btl->btl_module, endpoint, pending_op->op_buffer, pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS); } else { /* need to release here because ompi_osc_rdma_atomic_complete was not called */ @@ -122,10 +122,10 @@ static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + ompi_osc_rdma_btl_wrapper_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + if (!selected_btl->btl_atomic_ops) { return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, NULL, wait_for_completion, cbfunc, cbdata, cbcontext); } @@ -147,7 +147,7 @@ static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t /* spin until the btl has accepted the operation */ do { - ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle, + ret = selected_btl->btl_atomic_op (selected_btl->btl_module, endpoint, (intptr_t) address, address_handle, op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL); @@ -192,7 +192,7 @@ static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8 int64_t compare, int64_t value, int flags, int64_t *result) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = 
ompi_osc_rdma_selected_btl (module, btl_index); + ompi_osc_rdma_btl_wrapper_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); @@ -209,7 +209,7 @@ static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8 ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); } if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_cswap (selected_btl, endpoint, pending_op->op_buffer, + ret = selected_btl->btl_atomic_cswap (selected_btl->btl_module, endpoint, pending_op->op_buffer, address, pending_op->op_frag->handle, address_handle, compare, value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL); diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index c6689d78812..a1a8e8ec834 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -50,7 +50,7 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { - if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { + if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]->btl_module) { *btl_index_out = module_btl_index; *endpoint = bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; @@ -63,7 +63,7 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { - if (bml_endpoint->btl_eager.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { + if (bml_endpoint->btl_eager.bml_btls[btl_index].btl == 
module->selected_btls[module_btl_index]->btl_module) { *btl_index_out = module_btl_index; *endpoint = bml_endpoint->btl_eager.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; diff --git a/opal/mca/btl/base/btl_base_am_rdma.c b/opal/mca/btl/base/btl_base_am_rdma.c index 73fb4c000e8..9b47ae083e4 100644 --- a/opal/mca/btl/base/btl_base_am_rdma.c +++ b/opal/mca/btl/base/btl_base_am_rdma.c @@ -465,10 +465,8 @@ mca_btl_base_rdma_start(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint if (sizeof(*hdr) + size <= btl->btl_eager_limit) { /* just go ahead and send the data */ packet_size += size; - } else if (!mca_btl_base_rdma_use_rdma_get (btl)) { - packet_size += size_t_min (size, btl->btl_max_send_size - sizeof (*hdr)); } else { - use_rdma = true; + packet_size += size_t_min (size, btl->btl_max_send_size - sizeof (*hdr)); } } else if (MCA_BTL_BASE_AM_GET == type) { if (!mca_btl_base_rdma_use_rdma_put(btl)) {