Skip to content

Commit e67abc8

Browse files
committed
osc/rdma: performance improvements and bug fixes
This commit is a large update to the osc/rdma component. Included in this commit: - Add support for using hardware atomics for fetch-and-op and single count accumulate when using the accumulate lock. This will improve the performance of these operations even when not setting the single intrinsic info key. - Rework how large accumulates are done. They now block on the get operation to fix some bugs discovered by an IBM one-sided test. I may roll back some of the changes if the underlying bug in the original design is discovered. There appears to be no real difference (on the hardware this was tested with) in performance so it's probably a non-issue. References #2530. - Add support for an additional lock-all algorithm: on-demand. The on-demand algorithm will attempt to acquire the peer lock when starting an RMA operation. The lock algorithm default has not changed. The algorithm can be selected by setting the osc_rdma_locking_mode MCA variable. The valid values are two_level and on_demand. - Make use of the btl_flush function if available. This can improve performance with some btls. - When using btl_flush do not keep track of the number of put operations. This reduces the number of atomic operations in the critical path. - Make the window buffers more friendly to multi-threaded applications. This was done by dropping support for multiple buffers per MPI window. I intend to re-add that support once the underlying performance bug under the old buffering scheme is fixed. - Fix a bug in request completion in the accumulate, get, and put paths. This also helps with #2530. - General code cleanup and fixes. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent 5f58e7b commit e67abc8

16 files changed

+975
-874
lines changed

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 106 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* University of Stuttgart. All rights reserved.
99
* Copyright (c) 2004-2005 The Regents of the University of California.
1010
* All rights reserved.
11-
* Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights
11+
* Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights
1212
* reserved.
1313
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
@@ -50,6 +50,11 @@
5050

5151
#include "opal_stdint.h"
5252

53+
/**
 * Locking modes stored in the locking_mode field of the osc rdma component
 * (default for all windows) and of each osc rdma module.
 */
enum {
54+
/* two_level locking mode */
OMPI_OSC_RDMA_LOCKING_TWO_LEVEL,
55+
/* on_demand locking mode: in a lock access epoch a peer that is not yet
 * demand locked is locked via ompi_osc_rdma_demand_lock_peer() when the
 * synchronization object for that target is looked up */
OMPI_OSC_RDMA_LOCKING_ON_DEMAND,
56+
};
57+
5358
/**
5459
* @brief osc rdma component structure
5560
*/
@@ -87,6 +92,9 @@ struct ompi_osc_rdma_component_t {
8792
/** Default value of the no_locks info key for new windows */
8893
bool no_locks;
8994

95+
/** Locking mode to use as the default for all windows */
96+
int locking_mode;
97+
9098
/** Accumulate operations will only operate on a single intrinsic datatype */
9199
bool acc_single_intrinsic;
92100

@@ -119,6 +127,8 @@ struct ompi_osc_rdma_module_t {
119127
/** Mutex lock protecting module data */
120128
opal_mutex_t lock;
121129

130+
/** locking mode to use */
131+
int locking_mode;
122132

123133
/* window configuration */
124134

@@ -147,10 +157,12 @@ struct ompi_osc_rdma_module_t {
147157
/** Local displacement unit. */
148158
int disp_unit;
149159

150-
151160
/** global leader */
152161
ompi_osc_rdma_peer_t *leader;
153162

163+
/** my peer structure */
164+
ompi_osc_rdma_peer_t *my_peer;
165+
154166
/** pointer to free on cleanup (may be NULL) */
155167
void *free_after;
156168

@@ -276,6 +288,16 @@ int ompi_osc_rdma_free (struct ompi_win_t *win);
276288
*/
277289
int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer);
278290

291+
/**
292+
* @brief demand lock a peer
293+
*
294+
* @param[in] module osc rdma module
295+
* @param[in] peer peer to lock
296+
*
297+
* @returns OMPI_SUCCESS on success
298+
*/
299+
int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer);
300+
279301
/**
280302
* @brief check if a peer object is cached for a remote rank
281303
*
@@ -449,10 +471,18 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
449471
}
450472

451473
return NULL;
452-
case OMPI_OSC_RDMA_SYNC_TYPE_FENCE:
453474
case OMPI_OSC_RDMA_SYNC_TYPE_LOCK:
454-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence/lock_all access epoch for target %d", target);
475+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found lock_all access epoch for target %d", target);
476+
477+
*peer = ompi_osc_rdma_module_peer (module, target);
478+
if (OPAL_UNLIKELY(OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode &&
479+
!ompi_osc_rdma_peer_is_demand_locked (*peer))) {
480+
ompi_osc_rdma_demand_lock_peer (module, *peer);
481+
}
455482

483+
return &module->all_sync;
484+
case OMPI_OSC_RDMA_SYNC_TYPE_FENCE:
485+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence access epoch for target %d", target);
456486
/* fence epoch is now active */
457487
module->all_sync.epoch_active = true;
458488
*peer = ompi_osc_rdma_module_peer (module, target);
@@ -470,25 +500,94 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
470500
return NULL;
471501
}
472502

503+
/**
 * @brief check whether the btl_flush function of the selected btl is used
 *
 * @param[in] module         osc rdma module
 *
 * @returns true if BTL_VERSION is defined and at least 310 and the module's
 *          selected btl has a non-NULL btl_flush function pointer
 * @returns false otherwise
 *
 * Declared inline like the other helpers in this header so that translation
 * units that include the header without calling it do not get an unused
 * static function.
 */
static inline bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    return NULL != module->selected_btl->btl_flush;
#else
    /* btl_flush is only referenced when BTL_VERSION >= 310 */
    (void) module;
    return false;
#endif
}
511+
512+
/**
513+
* @brief increment the outstanding rdma operation counter (atomic)
514+
*
515+
* @param[in] rdma_sync osc rdma synchronization object
516+
*/
517+
static inline void ompi_osc_rdma_sync_rdma_inc_always (ompi_osc_rdma_sync_t *rdma_sync)
518+
{
519+
ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma.counter, 1);
520+
521+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "inc: there are %ld outstanding rdma operations",
522+
(unsigned long) rdma_sync->outstanding_rdma.counter);
523+
}
524+
525+
/**
 * @brief increment the outstanding rdma operation counter unless btl_flush is used
 *
 * @param[in] rdma_sync         osc rdma synchronization object
 *
 * When BTL_VERSION >= 310 and ompi_osc_rdma_use_btl_flush() reports true for the
 * owning module nothing is counted. In every other case the call is forwarded to
 * ompi_osc_rdma_sync_rdma_inc_always().
 */
static inline void ompi_osc_rdma_sync_rdma_inc (ompi_osc_rdma_sync_t *rdma_sync)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    const bool count_operation = !ompi_osc_rdma_use_btl_flush (rdma_sync->module);
#else
    const bool count_operation = true;
#endif

    if (count_operation) {
        ompi_osc_rdma_sync_rdma_inc_always (rdma_sync);
    }
}
534+
535+
/**
536+
* @brief decrement the outstanding rdma operation counter (atomic)
537+
*
538+
* @param[in] rdma_sync osc rdma synchronization object
539+
*/
540+
static inline void ompi_osc_rdma_sync_rdma_dec_always (ompi_osc_rdma_sync_t *rdma_sync)
541+
{
542+
opal_atomic_wmb ();
543+
ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma.counter, -1);
544+
545+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "dec: there are %ld outstanding rdma operations",
546+
(unsigned long) rdma_sync->outstanding_rdma.counter);
547+
}
548+
549+
/**
 * @brief decrement the outstanding rdma operation counter unless btl_flush is used
 *
 * @param[in] rdma_sync         osc rdma synchronization object
 *
 * When BTL_VERSION >= 310 and ompi_osc_rdma_use_btl_flush() reports true for the
 * owning module nothing is counted. In every other case the call is forwarded to
 * ompi_osc_rdma_sync_rdma_dec_always().
 */
static inline void ompi_osc_rdma_sync_rdma_dec (ompi_osc_rdma_sync_t *rdma_sync)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    const bool count_operation = !ompi_osc_rdma_use_btl_flush (rdma_sync->module);
#else
    const bool count_operation = true;
#endif

    if (count_operation) {
        ompi_osc_rdma_sync_rdma_dec_always (rdma_sync);
    }
}
558+
473559
/**
474560
* @brief complete all outstanding rdma operations to all peers
475561
*
476562
* @param[in] module osc rdma module
477563
*/
478564
static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
479565
{
480-
ompi_osc_rdma_aggregation_t *aggregation, *next;
481-
482566
if (opal_list_get_size (&sync->aggregations)) {
567+
ompi_osc_rdma_aggregation_t *aggregation, *next;
568+
483569
OPAL_THREAD_SCOPED_LOCK(&sync->lock,
484570
OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) {
571+
fprintf (stderr, "Flushing aggregation %p, peeer %p\n", aggregation, aggregation->peer);
485572
ompi_osc_rdma_peer_aggregate_flush (aggregation->peer);
486573
});
487574
}
488575

576+
#if !defined(BTL_VERSION) || (BTL_VERSION < 310)
489577
do {
490578
opal_progress ();
491-
} while (sync->outstanding_rdma);
579+
} while (ompi_osc_rdma_sync_get_count (sync));
580+
#else
581+
mca_btl_base_module_t *btl_module = sync->module->selected_btl;
582+
583+
do {
584+
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
585+
opal_progress ();
586+
} else {
587+
btl_module->btl_flush (btl_module, NULL);
588+
}
589+
} while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1)));
590+
#endif
492591
}
493592

494593
/**

0 commit comments

Comments
 (0)