Skip to content

Commit 36b9ed0

Browse files
authored
Merge pull request #10034 from bwbarrett/backports/all-the-onesided-fixes
v5.0.x: OSC backports
2 parents 563c565 + 3347767 commit 36b9ed0

23 files changed

+1203
-598
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,7 @@ test/class/ompi_rb_tree
633633
test/class/ompi_bitmap
634634
test/class/opal_bitmap
635635
test/class/opal_fifo
636+
test/class/opal_cstring
636637
test/class/opal_hash_table
637638
test/class/opal_lifo
638639
test/class/opal_list

ompi/mca/osc/base/osc_base_init.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,10 +57,6 @@ ompi_osc_base_select(ompi_win_t *win,
5757
priority = component->osc_query(win, base, size, disp_unit, comm,
5858
win->super.s_info, flavor);
5959
if (priority < 0) {
60-
if (MPI_WIN_FLAVOR_SHARED == flavor && OMPI_ERR_RMA_SHARED == priority) {
61-
/* NTH: quick fix to return OMPI_ERR_RMA_SHARED */
62-
return OMPI_ERR_RMA_SHARED;
63-
}
6460
continue;
6561
}
6662

ompi/mca/osc/monitoring/osc_monitoring_component.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,10 +90,6 @@ static int mca_osc_monitoring_component_select(struct ompi_win_t *win, void **ba
9090

9191
priority = component->osc_query(win, base, size, disp_unit, comm, info, flavor);
9292
if (priority < 0) {
93-
if (MPI_WIN_FLAVOR_SHARED == flavor && OMPI_ERR_RMA_SHARED == priority) {
94-
/* NTH: quick fix to return OMPI_ERR_RMA_SHARED */
95-
return OMPI_ERR_RMA_SHARED;
96-
}
9793
continue;
9894
}
9995

ompi/mca/osc/rdma/Makefile.am

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
# Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
1212
# reserved.
1313
# Copyright (c) 2017 IBM Corporation. All rights reserved.
14+
# Copyright (c) 2022 Amazon.com, Inc. or its affiliates.
15+
# All Rights reserved.
1416
# $COPYRIGHT$
1517
#
1618
# Additional copyrights may follow
@@ -21,6 +23,8 @@
2123
rdma_sources = \
2224
osc_rdma.h \
2325
osc_rdma_module.c \
26+
osc_rdma_btl_comm.h \
27+
osc_rdma_btl_comm.c \
2428
osc_rdma_comm.h \
2529
osc_rdma_comm.c \
2630
osc_rdma_accumulate.c \

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 55 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include "ompi/mca/osc/osc.h"
4545
#include "ompi/mca/osc/base/base.h"
4646
#include "opal/mca/btl/btl.h"
47+
#include "opal/mca/btl/base/btl_base_am_rdma.h"
4748
#include "ompi/memchecker.h"
4849
#include "ompi/op/op.h"
4950
#include "opal/align.h"
@@ -57,8 +58,6 @@
5758

5859
#define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count)
5960

60-
#define MCA_OSC_RDMA_BTLS_SIZE_INIT 4
61-
6261
enum {
6362
OMPI_OSC_RDMA_LOCKING_TWO_LEVEL,
6463
OMPI_OSC_RDMA_LOCKING_ON_DEMAND,
@@ -150,9 +149,6 @@ struct ompi_osc_rdma_module_t {
150149
/** value of same_size info key for this window */
151150
bool same_size;
152151

153-
/** CPU atomics can be used */
154-
bool use_cpu_atomics;
155-
156152
/** passive-target synchronization will not be used in this window */
157153
bool no_locks;
158154

@@ -260,18 +256,38 @@ struct ompi_osc_rdma_module_t {
260256
/** lock for peer hash table/array */
261257
opal_mutex_t peer_lock;
262258

259+
/* ******************* communication *********************** */
260+
261+
/* we currently support two modes of operation, a single
262+
* accelerated btl (which can use memory registration and can use
263+
* btl_flush()) and one or more alternate btls, which cannot use
264+
* flush() or rely on memory registration. Since it is an
265+
* either/or situation, we use a union to simplify the code.
266+
*/
267+
bool use_accelerated_btl;
268+
269+
union {
270+
struct {
271+
mca_btl_base_module_t *accelerated_btl;
272+
};
273+
struct {
274+
mca_btl_base_am_rdma_module_t **alternate_am_rdmas;
275+
uint8_t alternate_btl_count;
276+
};
277+
};
278+
279+
/** Does the selected BTL require memory registration? This field
280+
will be false when alternate BTLs are used, and the value
281+
when an accelerated BTL is used depends on the registration
282+
requirements of the underlying BTL. */
283+
bool use_memory_registration;
263284

264-
/** BTL(s) in use. Currently this is only used to support RDMA emulation over
265-
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
266-
* could be used to support multiple RDMA-capable BTLs but the memory registration
267-
* paths will need to be updated to pack/unpack multiple registration handles. */
268-
struct mca_btl_base_module_t **selected_btls;
269-
uint8_t selected_btls_size;
270-
uint8_t btls_in_use;
285+
size_t put_alignment;
286+
size_t get_alignment;
287+
size_t put_limit;
288+
size_t get_limit;
271289

272-
/** Only true if one BTL is in use. Memory registration is only supported when
273-
* using a single BTL. */
274-
bool use_memory_registration;
290+
uint32_t atomic_flags;
275291

276292
/** registered fragment used for locally buffered RDMA transfers */
277293
struct ompi_osc_rdma_frag_t *rdma_frag;
@@ -383,10 +399,11 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
383399
size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file)
384400
{
385401
if (module->use_memory_registration) {
402+
assert(module->use_accelerated_btl);
386403
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)",
387404
ptr, (void*)((char *) ptr + size), size);
388405

389-
*handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags);
406+
*handle = module->accelerated_btl->btl_register_mem(module->accelerated_btl, endpoint, ptr, size, flags);
390407
if (OPAL_UNLIKELY(NULL == *handle)) {
391408
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, "
392409
"size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line);
@@ -404,7 +421,9 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
404421
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
405422
{
406423
if (handle) {
407-
module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle);
424+
assert(module->use_memory_registration);
425+
assert(module->use_accelerated_btl);
426+
module->accelerated_btl->btl_deregister_mem(module->accelerated_btl, handle);
408427
}
409428
}
410429

@@ -536,10 +555,11 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
536555
static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
537556
{
538557
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
539-
return !!(module->selected_btls[0]->btl_flush);
540-
#else
541-
return false;
558+
if (module->use_accelerated_btl) {
559+
return (NULL != module->accelerated_btl->btl_flush);
560+
}
542561
#endif
562+
return false;
543563
}
544564

545565
/**
@@ -601,13 +621,13 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
601621
opal_progress ();
602622
} while (ompi_osc_rdma_sync_get_count (sync));
603623
#else
604-
mca_btl_base_module_t *btl_module = sync->module->selected_btls[0];
605-
606624
do {
607625
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
608626
opal_progress ();
609627
} else {
610-
btl_module->btl_flush (btl_module, NULL);
628+
assert(sync->module->use_accelerated_btl);
629+
mca_btl_base_module_t *btl_module = sync->module->accelerated_btl;
630+
btl_module->btl_flush(btl_module, NULL);
611631
}
612632
} while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1)));
613633
#endif
@@ -637,17 +657,20 @@ static inline bool ompi_osc_rdma_oor (int rc)
637657

638658
__opal_attribute_always_inline__
639659
static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
640-
return module->selected_btls[btl_index];
660+
if (module->use_accelerated_btl) {
661+
assert(0 == btl_index);
662+
return module->accelerated_btl;
663+
} else {
664+
assert(btl_index < module->alternate_btl_count);
665+
return module->alternate_am_rdmas[btl_index]->btl;
666+
}
641667
}
642668

643-
__opal_attribute_always_inline__
644-
static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) {
645-
if(btl_index == module->selected_btls_size) {
646-
module->selected_btls_size *= 2;
647-
module->selected_btls = realloc(module->selected_btls, module->selected_btls_size * sizeof(struct mca_btl_base_module_t *));
648-
assert(NULL != module->selected_btls);
649-
}
650-
module->selected_btls[btl_index] = btl;
669+
670+
static inline mca_btl_base_am_rdma_module_t *ompi_osc_rdma_selected_am_rdma(ompi_osc_rdma_module_t *module, uint8_t btl_index) {
671+
assert(!module->use_accelerated_btl);
672+
assert(btl_index < module->alternate_btl_count);
673+
return module->alternate_am_rdmas[btl_index];
651674
}
652675

653676
#endif /* OMPI_OSC_RDMA_H */

ompi/mca/osc/rdma/osc_rdma_accumulate.c

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,24 @@
1010
* Copyright (c) 2019-2021 Google, LLC. All rights reserved.
1111
* Copyright (c) 2021 IBM Corporation. All rights reserved.
1212
* Copyright (c) 2022 Cisco Systems, Inc. All rights reserved
13+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates.
14+
* All Rights reserved.
1315
* $COPYRIGHT$
1416
*
1517
* Additional copyrights may follow
1618
*
1719
* $HEADER$
1820
*/
1921

22+
#include "ompi_config.h"
23+
2024
#include "osc_rdma_accumulate.h"
2125
#include "osc_rdma_request.h"
2226
#include "osc_rdma_comm.h"
27+
#include "osc_rdma_lock.h"
28+
#include "osc_rdma_btl_comm.h"
2329

30+
#include "opal/util/minmax.h"
2431
#include "ompi/mca/osc/base/base.h"
2532
#include "ompi/mca/osc/base/osc_base_obj_convert.h"
2633

@@ -157,13 +164,11 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
157164
mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
158165
{
159166
ompi_osc_rdma_module_t *module = sync->module;
160-
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
161-
int32_t atomic_flags = selected_btl->btl_atomic_flags;
162167
int btl_op, flags;
163168
int64_t origin;
164169

165-
if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
166-
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
170+
if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) ||
171+
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) ||
167172
!ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) {
168173
return OMPI_ERR_NOT_SUPPORTED;
169174
}
@@ -235,19 +240,11 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
235240
ompi_op_t *op, ompi_osc_rdma_request_t *req)
236241
{
237242
ompi_osc_rdma_module_t *module = sync->module;
238-
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
239-
int32_t atomic_flags = selected_btl->btl_atomic_flags;
240243
int btl_op, flags;
241244
int64_t origin;
242245

243-
if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
244-
/* btl put atomics not supported or disabled. fall back on fetch-and-op */
245-
return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle,
246-
op, req);
247-
}
248-
249-
if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
250-
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
246+
if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) ||
247+
(!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) ||
251248
!ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) {
252249
return OMPI_ERR_NOT_SUPPORTED;
253250
}
@@ -585,9 +582,9 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
585582

586583
/* determine how much to put in this operation */
587584
if (source_count) {
588-
acc_len = min(min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit);
585+
acc_len = opal_min(opal_min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit);
589586
} else {
590-
acc_len = min(target_iovec[target_iov_index].iov_len, acc_limit);
587+
acc_len = opal_min(target_iovec[target_iov_index].iov_len, acc_limit);
591588
}
592589

593590
if (0 != acc_len) {
@@ -662,13 +659,11 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
662659
bool lock_acquired)
663660
{
664661
ompi_osc_rdma_module_t *module = sync->module;
665-
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
666-
int32_t atomic_flags = btl->btl_atomic_flags;
667662
const size_t size = datatype->super.size;
668663
int64_t compare, source;
669664
int flags, ret;
670665

671-
if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags))) {
666+
if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags))) {
672667
return OMPI_ERR_NOT_SUPPORTED;
673668
}
674669

@@ -716,7 +711,6 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
716711
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
717712
{
718713
ompi_osc_rdma_module_t *module = sync->module;
719-
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
720714
unsigned long len = datatype->super.size;
721715
mca_btl_base_registration_handle_t *local_handle = NULL;
722716
ompi_osc_rdma_frag_t *frag = NULL;
@@ -741,26 +735,30 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
741735
return OMPI_SUCCESS;
742736
}
743737

744-
if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) {
745-
do {
746-
ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr);
747-
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
748-
break;
749-
}
738+
if (module->use_memory_registration) {
739+
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
740+
if (len > btl->btl_put_local_registration_threshold) {
741+
do {
742+
ret = ompi_osc_rdma_frag_alloc(module, len, &frag, &ptr);
743+
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
744+
break;
745+
}
750746

751-
ompi_osc_rdma_progress (module);
752-
} while (1);
747+
ompi_osc_rdma_progress (module);
748+
} while (1);
753749

754-
memcpy (ptr, source_addr, len);
755-
local_handle = frag->handle;
750+
memcpy(ptr, source_addr, len);
751+
local_handle = frag->handle;
752+
}
756753
}
757754

758755
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put...");
759756

760757
do {
761-
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
762-
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
763-
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
758+
ret = ompi_osc_rdma_btl_put(module, peer->data_btl_index, peer->data_endpoint,
759+
ptr, target_address, local_handle, target_handle,
760+
len, 0, MCA_BTL_NO_ORDER,
761+
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
764762
if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
765763
break;
766764
}

ompi/mca/osc/rdma/osc_rdma_active_target.c

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -77,33 +77,6 @@ OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t,
7777
ompi_osc_rdma_pending_op_construct,
7878
ompi_osc_rdma_pending_op_destruct);
7979

80-
/**
81-
* Dummy completion function for atomic operations
82-
*/
83-
void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
84-
void *local_address, mca_btl_base_registration_handle_t *local_handle,
85-
void *context, void *data, int status)
86-
{
87-
ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context;
88-
89-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status);
90-
91-
if (pending_op->op_result) {
92-
memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size);
93-
}
94-
95-
if (NULL != pending_op->cbfunc) {
96-
pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status);
97-
}
98-
99-
if (NULL != pending_op->op_frag) {
100-
ompi_osc_rdma_frag_complete (pending_op->op_frag);
101-
pending_op->op_frag = NULL;
102-
}
103-
104-
pending_op->op_complete = true;
105-
OBJ_RELEASE(pending_op);
106-
}
10780

10881
/**
10982
* compare_ranks:

0 commit comments

Comments
 (0)