Skip to content

Commit d0c4538

Browse files
committed
rcache/base: reduce probability of deadlock when hooking madvise
The current VMA cache implementation backing rcache/grdma can run into a deadlock situation in multi-threaded code when madvise is hooked and the c library uses locks. In this case we may run into the following situation: Thread 1: ... free () <- Holding libc lock madvice_hook () vma_iteration () <- Blocked waiting for vma lock Thread 2: ... vma_insert () <- Holding vma lock vma_item_new () malloc () <- Blocked waiting for libc lock To fix this problem we chose to remove the madvise () hook but that fix is causing issue #3685. This commit aims to greatly reduce the chance that the deadlock will be hit by putting vma items into a free list. This moves the allocation outside the vma lock. In general there are a relatively small number of vma items so the default is to allocate 2048 vma items. This default is configurable but it is likely the number is too large not too small. Signed-off-by: Nathan Hjelm <[email protected]>
1 parent e79eb85 commit d0c4538

File tree

6 files changed

+120
-31
lines changed

6 files changed

+120
-31
lines changed

opal/mca/rcache/base/base.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ OPAL_DECLSPEC mca_rcache_base_component_t *mca_rcache_base_component_lookup(cons
5858
OPAL_DECLSPEC mca_rcache_base_module_t *mca_rcache_base_module_lookup (const char *name);
5959
OPAL_DECLSPEC int mca_rcache_base_module_destroy(mca_rcache_base_module_t *module);
6060

61+
extern opal_free_list_t mca_rcache_base_vma_tree_items;
62+
extern bool mca_rcache_base_vma_tree_items_inited;
63+
extern unsigned int mca_rcache_base_vma_tree_items_min;
64+
extern int mca_rcache_base_vma_tree_items_max;
65+
extern unsigned int mca_rcache_base_vma_tree_items_inc;
66+
6167
/* only used within base -- no need to DECLSPEC */
6268
extern int mca_rcache_base_used_mem_hooks;
6369

opal/mca/rcache/base/rcache_base_create.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "opal/mca/rcache/rcache.h"
3030
#include "opal/mca/rcache/base/base.h"
3131
#include "opal/mca/rcache/base/rcache_base_mem_cb.h"
32+
#include "rcache_base_vma_tree.h"
3233
#include "opal/util/show_help.h"
3334
#include "opal/util/proc.h"
3435

opal/mca/rcache/base/rcache_base_frame.c

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,17 @@
2727

2828
#include "opal/mca/mca.h"
2929
#include "opal/mca/base/base.h"
30+
#include "opal/mca/base/mca_base_pvar.h"
3031
#include "opal/mca/rcache/rcache.h"
3132
#include "opal/mca/rcache/base/base.h"
3233
#include "opal/memoryhooks/memory.h"
3334
#include "opal/constants.h"
3435
#include "rcache_base_mem_cb.h"
3536

37+
/* two-level macro for stringifying a number */
38+
#define STRINGIFYX(x) #x
39+
#define STRINGIFY(x) STRINGIFYX(x)
40+
3641
/*
3742
* The following file was created by configure. It contains extern
3843
* statements and the definition of an array of pointers to each
@@ -59,12 +64,19 @@ static void mca_rcache_base_registration_constructor( mca_rcache_base_registrati
5964
OBJ_CLASS_INSTANCE(mca_rcache_base_registration_t, opal_free_list_item_t,
6065
mca_rcache_base_registration_constructor, NULL);
6166

67+
#define TREE_ITEMS_MIN 2048
68+
#define TREE_ITEMS_MAX 16384
69+
#define TREE_ITEMS_INC 2048
6270

6371
/*
6472
* Global variables
6573
*/
6674
opal_list_t mca_rcache_base_modules = {{0}};
67-
75+
opal_free_list_t mca_rcache_base_vma_tree_items = {{0}};
76+
bool mca_rcache_base_vma_tree_items_inited = false;
77+
unsigned int mca_rcache_base_vma_tree_items_min = TREE_ITEMS_MIN;
78+
int mca_rcache_base_vma_tree_items_max = TREE_ITEMS_MAX;
79+
unsigned int mca_rcache_base_vma_tree_items_inc = TREE_ITEMS_INC;
6880

6981
OBJ_CLASS_INSTANCE(mca_rcache_base_selected_module_t, opal_list_item_t, NULL, NULL);
7082

@@ -100,8 +112,11 @@ static int mca_rcache_base_close(void)
100112
time between now and end of application (even post main()!) */
101113
(void) mca_base_framework_close (&opal_memory_base_framework);
102114
}
103-
/* All done */
104115

116+
OBJ_DESTRUCT(&mca_rcache_base_vma_tree_items);
117+
mca_rcache_base_vma_tree_items_inited = false;
118+
119+
/* All done */
105120
/* Close all remaining available components */
106121
return mca_base_framework_components_close(&opal_rcache_base_framework, NULL);
107122
}
@@ -117,11 +132,42 @@ static int mca_rcache_base_open(mca_base_open_flag_t flags)
117132

118133
OBJ_CONSTRUCT(&mca_rcache_base_modules, opal_list_t);
119134

135+
/* the free list is only initialized when a VMA tree is created */
136+
OBJ_CONSTRUCT(&mca_rcache_base_vma_tree_items, opal_free_list_t);
137+
120138
/* Open up all available components */
121139
return mca_base_framework_components_open(&opal_rcache_base_framework, flags);
122140
}
123141

124-
MCA_BASE_FRAMEWORK_DECLARE(opal, rcache, "OPAL Registration Cache", NULL,
142+
static int mca_rcache_base_register_mca_variables (mca_base_register_flag_t flags)
143+
{
144+
145+
mca_rcache_base_vma_tree_items_min = TREE_ITEMS_MIN;
146+
(void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_min",
147+
"Minimum number of VMA tree items to allocate (default: "
148+
STRINGIFY(TREE_ITEMS_MIN) ")", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
149+
NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6,
150+
MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_min);
151+
152+
mca_rcache_base_vma_tree_items_max = TREE_ITEMS_MAX;
153+
(void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_max",
154+
"Maximum number of VMA tree items to allocate (default: "
155+
STRINGIFY(TREE_ITEMS_MAX) ", -1: unlimited)", MCA_BASE_VAR_TYPE_INT,
156+
NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6,
157+
MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_max);
158+
159+
mca_rcache_base_vma_tree_items_inc = TREE_ITEMS_INC;
160+
(void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_inc",
161+
"Number of VMA tree items to allocate at a time (default: "
162+
STRINGIFY(TREE_ITEMS_INC) ")", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
163+
NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6,
164+
MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_inc);
165+
166+
return OPAL_SUCCESS;
167+
}
168+
169+
MCA_BASE_FRAMEWORK_DECLARE(opal, rcache, "OPAL Registration Cache",
170+
mca_rcache_base_register_mca_variables,
125171
mca_rcache_base_open, mca_rcache_base_close,
126172
mca_rcache_base_static_components, 0);
127173

opal/mca/rcache/base/rcache_base_vma.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2009 IBM Corporation. All rights reserved.
1616
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
17-
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
17+
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
1818
* reserved.
1919
*
2020
* $COPYRIGHT$
@@ -31,6 +31,7 @@
3131
#include "opal/mca/rcache/rcache.h"
3232
#include "rcache_base_vma.h"
3333
#include "rcache_base_vma_tree.h"
34+
#include "opal/mca/rcache/base/base.h"
3435

3536
/**
3637
* Initialize the rcache
@@ -52,6 +53,14 @@ OBJ_CLASS_INSTANCE(mca_rcache_base_vma_module_t, opal_object_t,
5253

5354
mca_rcache_base_vma_module_t *mca_rcache_base_vma_module_alloc (void)
5455
{
56+
if (!mca_rcache_base_vma_tree_items_inited) {
57+
opal_free_list_init (&mca_rcache_base_vma_tree_items, sizeof (mca_rcache_base_vma_item_t),
58+
8, OBJ_CLASS(mca_rcache_base_vma_item_t), 0, 8,
59+
mca_rcache_base_vma_tree_items_min, mca_rcache_base_vma_tree_items_max,
60+
mca_rcache_base_vma_tree_items_inc, NULL, 0, NULL, NULL, NULL);
61+
mca_rcache_base_vma_tree_items_inited = true;
62+
}
63+
5564
return OBJ_NEW(mca_rcache_base_vma_module_t);
5665
}
5766

opal/mca/rcache/base/rcache_base_vma_tree.c

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,22 @@
2929

3030
#include "opal/util/output.h"
3131
#include "rcache_base_vma_tree.h"
32+
#include "opal/mca/rcache/base/base.h"
3233

3334
OBJ_CLASS_INSTANCE(mca_rcache_base_vma_reg_list_item_t, opal_list_item_t, NULL, NULL);
3435

3536
static void mca_rcache_base_vma_item_construct (mca_rcache_base_vma_item_t *vma_item)
3637
{
3738
OBJ_CONSTRUCT(&vma_item->reg_list, opal_list_t);
39+
vma_item->in_use = false;
3840
}
3941

4042
static void mca_rcache_base_vma_item_destruct (mca_rcache_base_vma_item_t *vma_item)
4143
{
4244
OPAL_LIST_DESTRUCT(&vma_item->reg_list);
4345
}
4446

45-
OBJ_CLASS_INSTANCE(mca_rcache_base_vma_item_t, opal_list_item_t,
47+
OBJ_CLASS_INSTANCE(mca_rcache_base_vma_item_t, opal_free_list_item_t,
4648
mca_rcache_base_vma_item_construct,
4749
mca_rcache_base_vma_item_destruct);
4850

@@ -116,8 +118,7 @@ static inline
116118
mca_rcache_base_vma_item_t *mca_rcache_base_vma_new (mca_rcache_base_vma_module_t *vma_module,
117119
uintptr_t start, uintptr_t end)
118120
{
119-
mca_rcache_base_vma_item_t *vma_item = OBJ_NEW(mca_rcache_base_vma_item_t);
120-
121+
mca_rcache_base_vma_item_t *vma_item = (mca_rcache_base_vma_item_t *) opal_free_list_get (&mca_rcache_base_vma_tree_items);
121122
if (NULL == vma_item) {
122123
return NULL;
123124
}
@@ -131,6 +132,17 @@ mca_rcache_base_vma_item_t *mca_rcache_base_vma_new (mca_rcache_base_vma_module_
131132
return vma_item;
132133
}
133134

135+
/* NTH: this function MUST not allocate or deallocate memory */
136+
static void mca_rcache_base_vma_return (mca_rcache_base_vma_module_t *vma_module, mca_rcache_base_vma_item_t *vma_item)
137+
{
138+
opal_list_item_t *item;
139+
while (NULL != (item = opal_list_remove_first (&vma_item->reg_list))) {
140+
OBJ_RELEASE(item);
141+
}
142+
143+
opal_free_list_return (&mca_rcache_base_vma_tree_items, &vma_item->super);
144+
}
145+
134146
static inline int mca_rcache_base_vma_compare_regs (mca_rcache_base_registration_t *reg1,
135147
mca_rcache_base_registration_t *reg2)
136148
{
@@ -186,7 +198,7 @@ static inline void mca_rcache_base_vma_remove_reg (mca_rcache_base_vma_item_t *v
186198
mca_rcache_base_vma_reg_list_item_t *item;
187199

188200
OPAL_LIST_FOREACH(item, &vma_item->reg_list, mca_rcache_base_vma_reg_list_item_t) {
189-
if(item->reg == reg) {
201+
if (item->reg == reg) {
190202
opal_list_remove_item(&vma_item->reg_list, &item->super);
191203
OBJ_RELEASE(item);
192204
break;
@@ -388,7 +400,7 @@ int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module,
388400

389401
/* all the registrations in the vma may be deleted by the callback so keep a
390402
* reference until we are done with it. */
391-
OBJ_RETAIN(vma);
403+
vma->in_use = true;
392404

393405
OPAL_LIST_FOREACH_SAFE(vma_item, next, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
394406
rc = callback_fn (vma_item->reg, ctx);
@@ -397,7 +409,7 @@ int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module,
397409
}
398410
}
399411

400-
OBJ_RELEASE(vma);
412+
vma->in_use = false;
401413

402414
if (OPAL_SUCCESS != rc) {
403415
break;
@@ -419,12 +431,26 @@ static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *
419431
* into deadlock problems with some libc versions. The caller MUST hold the vma_lock
420432
* when calling this function.
421433
*/
422-
static void mca_rcache_base_vma_cleanup (mca_rcache_base_vma_module_t *vma_module)
434+
static void mca_rcache_base_vma_cleanup (mca_rcache_base_vma_module_t *vma_module, int depth)
423435
{
424-
opal_list_item_t *item;
436+
mca_rcache_base_vma_item_t *item;
437+
438+
while (NULL != (item = (mca_rcache_base_vma_item_t *) opal_lifo_pop_atomic (&vma_module->vma_gc_lifo))) {
439+
if (OPAL_UNLIKELY(item->in_use)) {
440+
/* another thread is currently iterating on this vma and its registrations */
441+
if (depth < 8) {
442+
/* try to clean up additional vmas before returning */
443+
mca_rcache_base_vma_cleanup (vma_module, depth + 1);
444+
}
425445

426-
while (NULL != (item = opal_lifo_pop_atomic (&vma_module->vma_gc_lifo))) {
427-
OBJ_RELEASE(item);
446+
if (item->in_use) {
447+
/* will clean it up later */
448+
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &item->super.super);
449+
return;
450+
}
451+
}
452+
453+
mca_rcache_base_vma_return (vma_module, (mca_rcache_base_vma_item_t *) item);
428454
}
429455
}
430456

@@ -434,7 +460,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
434460
mca_rcache_base_vma_item_t *i;
435461
uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound;
436462

437-
mca_rcache_base_vma_cleanup (vma_module);
463+
mca_rcache_base_vma_cleanup (vma_module, 0);
438464

439465
opal_mutex_lock (&vma_module->vma_lock);
440466

@@ -448,7 +474,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
448474
while (begin <= end) {
449475
mca_rcache_base_vma_item_t *vma = NULL;
450476

451-
if (opal_list_get_end (&vma_module->vma_list) == &i->super) {
477+
if (opal_list_get_end (&vma_module->vma_list) == &i->super.super) {
452478
if (mca_rcache_base_vma_can_insert (vma_module, end - begin + 1, limit)) {
453479
vma = mca_rcache_base_vma_new(vma_module, begin, end);
454480
}
@@ -459,7 +485,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
459485

460486
mca_rcache_base_vma_update_byte_count (vma_module, end - begin + 1);
461487

462-
opal_list_append(&vma_module->vma_list, &vma->super);
488+
opal_list_append(&vma_module->vma_list, &vma->super.super);
463489
begin = vma->end + 1;
464490
mca_rcache_base_vma_add_reg (vma, reg);
465491
opal_mutex_unlock (&vma_module->vma_lock);
@@ -479,7 +505,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
479505
mca_rcache_base_vma_update_byte_count (vma_module, tend - begin + 1);
480506

481507
/* insert before */
482-
opal_list_insert_pos(&vma_module->vma_list, &i->super, &vma->super);
508+
opal_list_insert_pos(&vma_module->vma_list, &i->super.super, &vma->super.super);
483509
i = vma;
484510
begin = vma->end + 1;
485511
mca_rcache_base_vma_add_reg (vma, reg);
@@ -497,7 +523,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
497523
/* add after */
498524
opal_list_insert_pos (&vma_module->vma_list,
499525
opal_list_get_next (&i->super),
500-
&vma->super);
526+
&vma->super.super);
501527
mca_rcache_base_vma_add_reg (i, reg);
502528
begin = end + 1;
503529
} else {
@@ -517,8 +543,8 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
517543

518544
/* add after */
519545
opal_list_insert_pos (&vma_module->vma_list,
520-
opal_list_get_next (&i->super),
521-
&vma->super);
546+
opal_list_get_next (&i->super.super),
547+
&vma->super.super);
522548
}
523549

524550
i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super);
@@ -569,15 +595,15 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
569595
opal_rb_tree_delete (&vma_module->rb_tree, vma);
570596
mca_rcache_base_vma_update_byte_count (vma_module,
571597
vma->start - vma->end - 1);
572-
opal_list_remove_item (&vma_module->vma_list, &vma->super);
573-
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super);
598+
opal_list_remove_item (&vma_module->vma_list, &vma->super.super);
599+
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super.super);
574600
vma = next;
575601
} else {
576602
int merged;
577603

578604
do {
579605
mca_rcache_base_vma_item_t *prev = NULL, *next = NULL;
580-
if (opal_list_get_first (&vma_module->vma_list) != &vma->super) {
606+
if (opal_list_get_first (&vma_module->vma_list) != &vma->super.super) {
581607
prev = (mca_rcache_base_vma_item_t *) opal_list_get_prev(vma);
582608
}
583609

@@ -586,23 +612,23 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
586612
if (prev && vma->start == prev->end + 1 &&
587613
mca_rcache_base_vma_compare_reg_lists(vma, prev)) {
588614
prev->end = vma->end;
589-
opal_list_remove_item(&vma_module->vma_list, &vma->super);
615+
opal_list_remove_item(&vma_module->vma_list, &vma->super.super);
590616
opal_rb_tree_delete(&vma_module->rb_tree, vma);
591-
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super);
617+
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super.super);
592618
vma = prev;
593619
merged = 1;
594620
}
595621

596-
if (opal_list_get_last (&vma_module->vma_list) != &vma->super) {
622+
if (opal_list_get_last (&vma_module->vma_list) != &vma->super.super) {
597623
next = (mca_rcache_base_vma_item_t *) opal_list_get_next (vma);
598624
}
599625

600626
if (next && vma->end + 1 == next->start &&
601627
mca_rcache_base_vma_compare_reg_lists (vma, next)) {
602628
vma->end = next->end;
603-
opal_list_remove_item(&vma_module->vma_list, &next->super);
629+
opal_list_remove_item(&vma_module->vma_list, &next->super.super);
604630
opal_rb_tree_delete(&vma_module->rb_tree, next);
605-
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &next->super);
631+
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &next->super.super);
606632
merged = 1;
607633
}
608634
} while (merged);

opal/mca/rcache/base/rcache_base_vma_tree.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
* Copyright (c) 2009 IBM Corporation. All rights reserved.
1616
*
1717
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
18-
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
18+
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
1919
* reserved.
2020
* $COPYRIGHT$
2121
*
@@ -52,10 +52,11 @@ OBJ_CLASS_DECLARATION(mca_rcache_base_vma_reg_list_item_t);
5252
*/
5353
struct mca_rcache_base_vma_item_t
5454
{
55-
opal_list_item_t super; /**< the parent class */
55+
opal_free_list_item_t super; /**< the parent class */
5656
uintptr_t start; /**< the base of the memory range */
5757
uintptr_t end; /**< the bound of the memory range */
5858
opal_list_t reg_list; /**< list of regs on this vma */
59+
bool in_use; /**< vma is in use in iterate */
5960
mca_rcache_base_vma_module_t *vma_module; /**< pointer to rcache vma belongs to */
6061
};
6162
typedef struct mca_rcache_base_vma_item_t mca_rcache_base_vma_item_t;

0 commit comments

Comments
 (0)