Skip to content

osc/rdma, btl/tcp: fix various issues with osc/rdma #9400

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
23 changes: 15 additions & 8 deletions ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,6 @@ struct ompi_osc_rdma_module_t {
/** value of same_size info key for this window */
bool same_size;

/** CPU atomics can be used */
bool use_cpu_atomics;

/** passive-target synchronization will not be used in this window */
bool no_locks;

Expand Down Expand Up @@ -255,6 +252,21 @@ struct ompi_osc_rdma_module_t {
/** lock for peer hash table/array */
opal_mutex_t peer_lock;

/** flag to indicate wether to use the local leader optimization,
* in which on each node a process was designated as local leader.
* local leader setup a shared memory region, and all other processes
* on the same node map their state to that region. When a process
* want to update a peer's state, the process uses atomics on the peer's
* local leader to update peer's state through shared memory region.
* BTLs that uses active message RDMA cannot support such optimization,
* because active message RDMA uses send/receive to emulate put and
* atomics, so the atomcis and RMA operation must be through the same
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure this is the right solution to the problem. The problem with the local leader optimization is that it assume ordering that AM doesn't give, but it's unclear to me that the entire implementation of OSC RDMA can deal with the AM ordering guarantees. I think we need to step back and answer the ordering guarantees question before adding this complexity.

* ordered channel.
*/
bool use_local_leader;

/** array of peer state. Used when local leader is NOT used */
uintptr_t *peer_state_array;

/** BTL(s) in use. Currently this is only used to support RDMA emulation over
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
Expand Down Expand Up @@ -630,11 +642,6 @@ static inline bool ompi_osc_rdma_oor (int rc)
return (OPAL_SUCCESS != rc && (OPAL_ERR_OUT_OF_RESOURCE == rc || OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc));
}

__opal_attribute_always_inline__
static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
return module->selected_btls[btl_index];
}

__opal_attribute_always_inline__
static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) {
if(btl_index == module->selected_btls_size) {
Expand Down
22 changes: 11 additions & 11 deletions ompi/mca/osc/rdma/osc_rdma_accumulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *selected_btl = peer->data_btl;
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;
Expand All @@ -176,7 +176,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const

origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0];

return ompi_osc_rdma_btl_fop (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
return ompi_osc_rdma_btl_fop (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
result_addr, true, NULL, NULL, NULL);
}

Expand All @@ -198,7 +198,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap");

ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, address, target_handle, &old_value, 8);
ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint, address, target_handle, &old_value, 8);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
Expand All @@ -213,7 +213,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
ompi_op_reduce (op, (void *) ((intptr_t) origin_addr + dt->super.true_lb), (void*)((intptr_t) &new_value + offset), 1, dt);
}

ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, address, target_handle,
ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl, peer->data_endpoint, address, target_handle,
old_value, new_value, 0, (int64_t*)&new_value);
if (OPAL_SUCCESS != ret || new_value == old_value) {
break;
Expand All @@ -234,7 +234,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *selected_btl = peer->data_btl;
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;
Expand Down Expand Up @@ -262,7 +262,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
*((int64_t *) origin_addr));

/* if we locked the peer its best to wait for completion before returning */
return ompi_osc_rdma_btl_op (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin,
return ompi_osc_rdma_btl_op (module, peer->data_btl, peer->data_endpoint, target_address, target_handle, btl_op, origin,
flags, true, NULL, NULL, NULL);
}

Expand Down Expand Up @@ -375,7 +375,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
/* set up the request */
request->to_free = ptr;

ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint,
ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint,
target_address, target_handle, ptr, len);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
Expand Down Expand Up @@ -661,7 +661,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *btl = peer->data_btl;
int32_t atomic_flags = btl->btl_atomic_flags;
const size_t size = datatype->super.size;
int64_t compare, source;
Expand All @@ -679,7 +679,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. compare: 0x%"
PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr));

ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle,
ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl, peer->data_endpoint, target_address, target_handle,
compare, source, flags, result_addr);
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired);
Expand Down Expand Up @@ -715,7 +715,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *btl = peer->data_btl;
unsigned long len = datatype->super.size;
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand All @@ -728,7 +728,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
", sync %p", len, target_address, (void *) sync);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl get...");
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, target_address,
ret = ompi_osc_get_data_blocking (module, peer->data_btl, peer->data_endpoint, target_address,
target_handle, result_addr, len);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
Expand Down
14 changes: 7 additions & 7 deletions ompi/mca/osc/rdma/osc_rdma_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ static void ompi_osc_get_data_complete (struct mca_btl_base_module_t *btl, struc
((bool *) context)[0] = true;
}

int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_index,
int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module,
struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address,
mca_btl_base_registration_handle_t *source_handle, void *data, size_t len)
{
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index);
const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment);
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand Down Expand Up @@ -444,7 +444,7 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee
mca_btl_base_registration_handle_t *local_handle, size_t size,
mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) {
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *btl = peer->data_btl;
int ret;

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync "
Expand Down Expand Up @@ -481,7 +481,7 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t *
ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *btl = peer->data_btl;
mca_btl_base_registration_handle_t *local_handle = NULL;
mca_btl_base_rdma_completion_fn_t cbfunc = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand Down Expand Up @@ -600,7 +600,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p
ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *btl = peer->data_btl;
const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment);
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand Down Expand Up @@ -736,7 +736,7 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi
ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *btl = peer->data_btl;
mca_btl_base_registration_handle_t *target_handle;
uint64_t target_address;
int ret;
Expand Down Expand Up @@ -779,7 +779,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori
ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
mca_btl_base_module_t *btl = peer->data_btl;
mca_btl_base_registration_handle_t *source_handle;
uint64_t source_address;
ptrdiff_t source_span, source_lb;
Expand Down
4 changes: 3 additions & 1 deletion ompi/mca/osc/rdma/osc_rdma_comm.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *or
* @brief read data from a remote memory region (blocking)
*
* @param[in] module osc rdma module
* @param[in] btl btl module
* @param[in] endpoint btl endpoint
* @param[in] source_address remote address to read from
* @param[in] source_handle btl registration handle for remote region (must be valid for the entire region)
Expand All @@ -113,7 +114,8 @@ int ompi_osc_rdma_rget (void *origin_addr, int origin_count, ompi_datatype_t *or
* data that is stored on the remote peer. The peer object does not have to be fully initialized to
* work. Only the btl endpoint is needed.
*/
int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_index,
int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module,
struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address,
mca_btl_base_registration_handle_t *source_handle,
void *data, size_t len);
Expand Down
Loading