Skip to content

osc/rdma: adjustment on btl selection logic #9696

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion ompi/mca/osc/rdma/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ rdma_sources = \
osc_rdma_dynamic.c \
osc_rdma_sync.h \
osc_rdma_sync.c \
osc_rdma_types.h
osc_rdma_types.h \
osc_rdma_btl_wrapper.c \
osc_rdma_btl_wrapper.h

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
Expand Down
18 changes: 9 additions & 9 deletions ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,13 @@
#include "ompi/request/request.h"
#include "ompi/mca/osc/osc.h"
#include "ompi/mca/osc/base/base.h"
#include "opal/mca/btl/btl.h"
#include "ompi/memchecker.h"
#include "ompi/op/op.h"
#include "opal/align.h"

#include "osc_rdma_btl_wrapper.h"
#include "osc_rdma_types.h"
#include "osc_rdma_sync.h"

#include "osc_rdma_peer.h"

#include "opal_stdint.h"
Expand Down Expand Up @@ -266,7 +265,7 @@ struct ompi_osc_rdma_module_t {
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
* could be used to support multiple RDMA-capable BTLs but the memory registration
* paths will need to be updated to pack/unpack multiple registration handles. */
struct mca_btl_base_module_t **selected_btls;
struct ompi_osc_rdma_btl_wrapper_t **selected_btls;
uint8_t selected_btls_size;
uint8_t btls_in_use;

Expand Down Expand Up @@ -387,7 +386,7 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)",
ptr, (void*)((char *) ptr + size), size);

*handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags);
*handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0]->btl_module, endpoint, ptr, size, flags);
if (OPAL_UNLIKELY(NULL == *handle)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, "
"size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line);
Expand All @@ -405,7 +404,7 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
{
if (handle) {
module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle);
module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0]->btl_module, handle);
}
}

Expand Down Expand Up @@ -602,7 +601,7 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
opal_progress ();
} while (ompi_osc_rdma_sync_get_count (sync));
#else
mca_btl_base_module_t *btl_module = sync->module->selected_btls[0];
mca_btl_base_module_t *btl_module = sync->module->selected_btls[0]->btl_module;

do {
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
Expand Down Expand Up @@ -637,18 +636,19 @@ static inline bool ompi_osc_rdma_oor (int rc)
}

__opal_attribute_always_inline__
static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
static inline ompi_osc_rdma_btl_wrapper_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
return module->selected_btls[btl_index];
}

__opal_attribute_always_inline__
static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) {
static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index,
enum ompi_osc_rdma_btl_type_t btl_type) {
if(btl_index == module->selected_btls_size) {
module->selected_btls_size *= 2;
module->selected_btls = realloc(module->selected_btls, module->selected_btls_size * sizeof(struct mca_btl_base_module_t *));
assert(NULL != module->selected_btls);
}
module->selected_btls[btl_index] = btl;
module->selected_btls[btl_index] = ompi_osc_rdma_btl_wrapper_alloc(btl, btl_type);
}

#endif /* OMPI_OSC_RDMA_H */
12 changes: 6 additions & 6 deletions ompi/mca/osc/rdma/osc_rdma_accumulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;
Expand Down Expand Up @@ -234,12 +234,12 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;

if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
if (!selected_btl->btl_atomic_ops) {
/* btl put atomics not supported or disabled. fall back on fetch-and-op */
return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle,
op, req);
Expand Down Expand Up @@ -661,7 +661,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = btl->btl_atomic_flags;
const size_t size = datatype->super.size;
int64_t compare, source;
Expand Down Expand Up @@ -715,7 +715,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
unsigned long len = datatype->super.size;
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand Down Expand Up @@ -757,7 +757,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put...");

do {
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
ret = btl->btl_put (btl->btl_module, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
Expand Down
117 changes: 117 additions & 0 deletions ompi/mca/osc/rdma/osc_rdma_btl_wrapper.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2021-2021 Amazon.com, INC. or its affiliates. All rights reserved
*/

#include "ompi_config.h"

#include <stdlib.h>
#include <string.h>

#include "osc_rdma_btl_wrapper.h"
#include "opal/constants.h"
#include "opal/mca/btl/base/btl_base_am_rdma.h"

/**
* @brief create a btl wrapper for a btl_module
*
* For a primary btl, this function simply copies the relevant fields from
* btl_module to btl_wrapper.
*
* For an alternate btl, this function creates a btl wrapper that will
* always use active message RDMA/atomics on the selected btl module,
* even when the btl module supports RDMA/atomics natively.
*
* The reason osc/rdma does not use an alternate btl's native atomics is that,
* when multiple alternate btls are being used, atomicity across the btls' own
* atomics is not guaranteed. Therefore, osc/rdma must use active message atomics.
*
* The reason osc/rdma does not use an alternate btl's native RDMA put and get is that
* doing so significantly simplifies osc/rdma's completion handling. The simplification
* comes in two areas:
*
* First, active message RDMA supports remote completion. Remote completion
* is required by several key components of osc/rdma:
* the usage of cpu atomics to update peer's state,
* the usage of local leader to update peer's state,
* osc/rdma's fence implementation.
*
* If osc/rdma did not use active message RDMA on alternate btls, it would
* have to keep track of each selected btl's support of remote completion.
* If any selected btl did not support remote completion, it would have to
* disable the usage of cpu atomics, disable the usage of local leader,
* and implement a different fence mechanism.
*
* Second, active message RDMA does not use memory registration explicitly,
* therefore using it eliminates the need to store and exchange multiple
* memory registrations.
*/
ompi_osc_rdma_btl_wrapper_t *ompi_osc_rdma_btl_wrapper_alloc(mca_btl_base_module_t *btl_module, enum ompi_osc_rdma_btl_type_t btl_type)
{
    /* Contract:
     *   - returns NULL when btl_module is NULL, when active message RDMA
     *     cannot be set up for an alternate btl, or when the wrapper cannot
     *     be allocated;
     *   - otherwise returns a calloc()'ed wrapper whose btl_module field
     *     points at the caller's (unmodified) btl_module.
     */
    mca_btl_base_module_t btl_module_copy;
    ompi_osc_rdma_btl_wrapper_t *btl_wrapper;

    if (NULL == btl_module) {
        return NULL;
    }

    /* All adjustments below are made on a private copy so that the caller's
     * btl_module is never modified. */
    memcpy(&btl_module_copy, btl_module, sizeof(btl_module_copy));

    if (OMPI_OSC_RDMA_BTL_ALTERNATE == btl_type) {
        /* For an alternate btl, osc/rdma must use active message RDMA/atomics on it,
         * and not use the btl's native support of RDMA/atomics.
         *
         * mca_btl_base_am_rdma_init() sets up a btl to use AM atomics/RDMA. However,
         * if a btl has native RDMA/atomics support, this function will not replace
         * them with active message RDMA/atomics. Therefore, to ensure active message
         * RDMA/atomics replace native RDMA/atomics, the btl module passed to
         * mca_btl_base_am_rdma_init() must have native RDMA/atomics disabled.
         *
         * We cannot disable the btl_module's RDMA/atomics though, because btl_module's
         * native RDMA/atomics may be used by other modules like pml.
         *
         * Hence, we work on btl_module_copy: disable its native RDMA/atomics
         * and call mca_btl_base_am_rdma_init() on it. The resulting btl_module_copy
         * has all the correct properties for AM RDMA/atomics. These properties
         * are then copied to btl_wrapper.
         *
         * osc/rdma uses the btl_wrapper to ensure active message RDMA/atomics are
         * used for the btl_module.
         */
        btl_module_copy.btl_flags &= ~(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT
                                       | MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS);

        /* Without a successful AM setup the copy has its native RDMA/atomics
         * flags cleared and no usable replacement, so do not hand out a wrapper. */
        if (OPAL_SUCCESS != mca_btl_base_am_rdma_init(&btl_module_copy)) {
            return NULL;
        }

        /* AM RDMA/atomics do not need explicit memory registration. */
        btl_module_copy.btl_register_mem = NULL;
        btl_module_copy.btl_deregister_mem = NULL;
        btl_module_copy.btl_registration_handle_size = 0;
    }

    btl_wrapper = calloc(1, sizeof(*btl_wrapper));
    if (NULL == btl_wrapper) {
        return NULL;
    }

    /* Calls through the wrapper's function pointers still take the real
     * btl module as their first argument. */
    btl_wrapper->btl_module = btl_module;

    /* Capability bits, taken from the (possibly adjusted) copy. */
    btl_wrapper->btl_atomic_ops = (0 != (btl_module_copy.btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS));
    /* NOTE(review): MCA_BTL_ATOMIC_SUPPORTS_GLOB is named like the bits kept in
     * btl_atomic_flags rather than the MCA_BTL_FLAGS_* bits tested on the
     * neighbouring lines -- confirm that btl_flags is the intended field. */
    btl_wrapper->btl_atomic_support_glob = (0 != (btl_module_copy.btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB));
    btl_wrapper->btl_rdma_remote_completion = (0 != (btl_module_copy.btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION));

    /* Put path. */
    btl_wrapper->btl_put = btl_module_copy.btl_put;
    btl_wrapper->btl_put_limit = btl_module_copy.btl_put_limit;
    btl_wrapper->btl_put_alignment = btl_module_copy.btl_put_alignment;
    btl_wrapper->btl_put_local_registration_threshold = btl_module_copy.btl_put_local_registration_threshold;

    /* Get path. */
    btl_wrapper->btl_get = btl_module_copy.btl_get;
    btl_wrapper->btl_get_limit = btl_module_copy.btl_get_limit;
    btl_wrapper->btl_get_alignment = btl_module_copy.btl_get_alignment;
    btl_wrapper->btl_get_local_registration_threshold = btl_module_copy.btl_get_local_registration_threshold;

    /* Atomics. */
    btl_wrapper->btl_atomic_op = btl_module_copy.btl_atomic_op;
    btl_wrapper->btl_atomic_fop = btl_module_copy.btl_atomic_fop;
    btl_wrapper->btl_atomic_cswap = btl_module_copy.btl_atomic_cswap;
    btl_wrapper->btl_atomic_flags = btl_module_copy.btl_atomic_flags;

    /* Memory registration (NULL / 0 for an alternate btl, see above). */
    btl_wrapper->btl_register_mem = btl_module_copy.btl_register_mem;
    btl_wrapper->btl_deregister_mem = btl_module_copy.btl_deregister_mem;
    btl_wrapper->btl_registration_handle_size = btl_module_copy.btl_registration_handle_size;

    btl_wrapper->btl_flush = btl_module_copy.btl_flush;

    return btl_wrapper;
}

53 changes: 53 additions & 0 deletions ompi/mca/osc/rdma/osc_rdma_btl_wrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2021-2021 Amazon.com, INC. or its affiliates. All rights reserved
*/

#ifndef OMPI_OSC_RDMA_BTL_WRAPPER_T
#define OMPI_OSC_RDMA_BTL_WRAPPER_T

#include "opal/mca/btl/btl.h"

/**
 * @brief role of a btl selected by osc/rdma; passed to
 *        ompi_osc_rdma_btl_wrapper_alloc() to choose how the wrapper is built
 */
enum ompi_osc_rdma_btl_type_t {
/** wrapper fields are copied from the btl module unchanged */
OMPI_OSC_RDMA_BTL_PRIMARY,
/** wrapper is set up to use active message RDMA/atomics instead of the
 *  btl module's native RDMA/atomics; memory registration is disabled */
OMPI_OSC_RDMA_BTL_ALTERNATE,
};

/**
 * @brief ompi_osc_rdma_btl_wrapper_t holds the subset of mca_btl_base_module_t
 *        fields that are used by the osc/rdma component
 *
 * Instances are filled in by ompi_osc_rdma_btl_wrapper_alloc(). The function
 * pointers stored here are invoked with btl_module (not the wrapper itself)
 * as their first argument.
 */
struct ompi_osc_rdma_btl_wrapper_t {
/** the underlying btl module this wrapper was created for */
mca_btl_base_module_t *btl_module;

/** set from the MCA_BTL_FLAGS_ATOMIC_OPS bit of the btl flags */
bool btl_atomic_ops;
/** set from the MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION bit of the btl flags */
bool btl_rdma_remote_completion;
/** set from the MCA_BTL_ATOMIC_SUPPORTS_GLOB bit (tested against the btl flags) */
bool btl_atomic_support_glob;

/** put function and its limit/alignment/registration-threshold properties */
mca_btl_base_module_put_fn_t btl_put;
size_t btl_put_limit;
size_t btl_put_alignment;
size_t btl_put_local_registration_threshold;

/** get function and its limit/alignment/registration-threshold properties */
mca_btl_base_module_get_fn_t btl_get;
size_t btl_get_limit;
size_t btl_get_alignment;
size_t btl_get_local_registration_threshold;

/** atomic operation functions and the btl's atomic flags */
mca_btl_base_module_atomic_op64_fn_t btl_atomic_op;
mca_btl_base_module_atomic_fop64_fn_t btl_atomic_fop;
mca_btl_base_module_atomic_cswap64_fn_t btl_atomic_cswap;
uint32_t btl_atomic_flags;

/** memory registration functions and handle size; for an alternate btl
 *  the functions are NULL and the handle size is 0 */
mca_btl_base_module_register_mem_fn_t btl_register_mem;
mca_btl_base_module_deregister_mem_fn_t btl_deregister_mem;
size_t btl_registration_handle_size;

/** flush function copied from the btl module */
mca_btl_base_module_flush_fn_t btl_flush;
};

typedef struct ompi_osc_rdma_btl_wrapper_t ompi_osc_rdma_btl_wrapper_t;

/**
 * @brief allocate a btl wrapper for a btl module
 *
 * @param[in] btl_module  btl module to wrap; it is not modified
 * @param[in] btl_type    OMPI_OSC_RDMA_BTL_PRIMARY to copy the module's fields
 *                        as they are, OMPI_OSC_RDMA_BTL_ALTERNATE to set the
 *                        wrapper up for active message RDMA/atomics
 *
 * @return a wrapper allocated with calloc(), or NULL on failure (for example
 *         when btl_module is NULL or the allocation fails)
 */
ompi_osc_rdma_btl_wrapper_t *ompi_osc_rdma_btl_wrapper_alloc(mca_btl_base_module_t *btl_module, enum ompi_osc_rdma_btl_type_t btl_type);

#endif
18 changes: 9 additions & 9 deletions ompi/mca/osc/rdma/osc_rdma_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde
struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address,
mca_btl_base_registration_handle_t *source_handle, void *data, size_t len)
{
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index);
ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, btl_index);
const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment);
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand Down Expand Up @@ -96,7 +96,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde
assert (!(source_address & ALIGNMENT_MASK(btl->btl_get_alignment)));

do {
ret = btl->btl_get (btl, endpoint, ptr, aligned_addr,
ret = btl->btl_get (btl->btl_module, endpoint, ptr, aligned_addr,
local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER,
ompi_osc_get_data_complete, (void *) &read_complete, NULL);
if (!ompi_osc_rdma_oor (ret)) {
Expand Down Expand Up @@ -444,7 +444,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee
mca_btl_base_registration_handle_t *local_handle, size_t size,
mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) {
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int ret;

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync "
Expand All @@ -454,7 +454,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee
ompi_osc_rdma_sync_rdma_inc (sync);

do {
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
ret = btl->btl_put (btl->btl_module, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER,
cb, context, cbdata);
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
Expand All @@ -481,7 +481,7 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *
ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_registration_handle_t *local_handle = NULL;
mca_btl_base_rdma_completion_fn_t cbfunc = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand Down Expand Up @@ -600,7 +600,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment);
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand Down Expand Up @@ -703,7 +703,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
}

do {
ret = btl->btl_get (btl, peer->data_endpoint, ptr,
ret = btl->btl_get (btl->btl_module, peer->data_endpoint, ptr,
aligned_source_base, local_handle, source_handle,
aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete,
request, frag);
Expand Down Expand Up @@ -736,7 +736,7 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi
ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_registration_handle_t *target_handle;
uint64_t target_address;
int ret;
Expand Down Expand Up @@ -779,7 +779,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori
ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
ompi_osc_rdma_btl_wrapper_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_registration_handle_t *source_handle;
uint64_t source_address;
ptrdiff_t source_span, source_lb;
Expand Down
Loading