diff --git a/opal/mca/memory/patcher/memory_patcher_component.c b/opal/mca/memory/patcher/memory_patcher_component.c index 991bc3c8529..5504c9c841c 100644 --- a/opal/mca/memory/patcher/memory_patcher_component.c +++ b/opal/mca/memory/patcher/memory_patcher_component.c @@ -59,11 +59,6 @@ static int patcher_query (int *); static int mca_memory_patcher_priority; -/* NTH: we can't currently allow madvise to be intercepted due to a - * deadlock when running with glibc. In the future, we may re-enable - * this hook if the deadlock can be resolved. */ -#define WANT_INTERCEPT_MADVISE 0 - opal_memory_patcher_component_t mca_memory_patcher_component = { .super = { .memoryc_version = { @@ -249,7 +244,6 @@ static void *intercept_mremap (void *start, size_t oldlen, void *new_address, si #endif -#if WANT_INTERCEPT_MADVISE #if defined (SYS_madvise) static int (*original_madvise) (void *, size_t, int); @@ -284,7 +278,6 @@ static int intercept_madvise (void *start, size_t length, int advice) } #endif -#endif // WANT_INTERCEPT_MADVISE #if defined SYS_brk @@ -503,14 +496,12 @@ static int patcher_open (void) } #endif -#if WANT_INTERCEPT_MADVISE #if defined (SYS_madvise) rc = opal_patcher->patch_symbol ("madvise", (uintptr_t)intercept_madvise, (uintptr_t *) &original_madvise); if (OPAL_SUCCESS != rc) { return rc; } #endif -#endif // WANT_INTERCEPT_MADVISE #if defined(SYS_shmdt) && defined(__linux__) rc = opal_patcher->patch_symbol ("shmdt", (uintptr_t) intercept_shmdt, (uintptr_t *) &original_shmdt); diff --git a/opal/mca/rcache/base/base.h b/opal/mca/rcache/base/base.h index a36b7bd0799..37526814a46 100644 --- a/opal/mca/rcache/base/base.h +++ b/opal/mca/rcache/base/base.h @@ -58,6 +58,12 @@ OPAL_DECLSPEC mca_rcache_base_component_t *mca_rcache_base_component_lookup(cons OPAL_DECLSPEC mca_rcache_base_module_t *mca_rcache_base_module_lookup (const char *name); OPAL_DECLSPEC int mca_rcache_base_module_destroy(mca_rcache_base_module_t *module); +extern opal_free_list_t mca_rcache_base_vma_tree_items; +extern bool mca_rcache_base_vma_tree_items_inited; +extern unsigned int mca_rcache_base_vma_tree_items_min; +extern int mca_rcache_base_vma_tree_items_max; +extern unsigned int mca_rcache_base_vma_tree_items_inc; + /* only used within base -- no need to DECLSPEC */ extern int mca_rcache_base_used_mem_hooks; diff --git a/opal/mca/rcache/base/rcache_base_create.c b/opal/mca/rcache/base/rcache_base_create.c index efbc09fc161..b41e315ced7 100644 --- a/opal/mca/rcache/base/rcache_base_create.c +++ b/opal/mca/rcache/base/rcache_base_create.c @@ -29,6 +29,7 @@ #include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/base/base.h" #include "opal/mca/rcache/base/rcache_base_mem_cb.h" +#include "rcache_base_vma_tree.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" diff --git a/opal/mca/rcache/base/rcache_base_frame.c b/opal/mca/rcache/base/rcache_base_frame.c index 339e0e77aa1..2b4845f087b 100644 --- a/opal/mca/rcache/base/rcache_base_frame.c +++ b/opal/mca/rcache/base/rcache_base_frame.c @@ -27,12 +27,17 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_pvar.h" #include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/base/base.h" #include "opal/memoryhooks/memory.h" #include "opal/constants.h" #include "rcache_base_mem_cb.h" +/* two-level macro for stringifying a number */ +#define STRINGIFYX(x) #x +#define STRINGIFY(x) STRINGIFYX(x) + /* * The following file was created by configure. It contains extern * statements and the definition of an array of pointers to each @@ -59,12 +64,19 @@ static void mca_rcache_base_registration_constructor( mca_rcache_base_registrati OBJ_CLASS_INSTANCE(mca_rcache_base_registration_t, opal_free_list_item_t, mca_rcache_base_registration_constructor, NULL); +#define TREE_ITEMS_MIN 2048 +#define TREE_ITEMS_MAX 16384 +#define TREE_ITEMS_INC 2048 /* * Global variables */ opal_list_t mca_rcache_base_modules = {{0}}; - +opal_free_list_t mca_rcache_base_vma_tree_items = {{0}}; +bool mca_rcache_base_vma_tree_items_inited = false; +unsigned int mca_rcache_base_vma_tree_items_min = TREE_ITEMS_MIN; +int mca_rcache_base_vma_tree_items_max = TREE_ITEMS_MAX; +unsigned int mca_rcache_base_vma_tree_items_inc = TREE_ITEMS_INC; OBJ_CLASS_INSTANCE(mca_rcache_base_selected_module_t, opal_list_item_t, NULL, NULL); @@ -100,8 +112,11 @@ static int mca_rcache_base_close(void) time between now and end of application (even post main()!) */ (void) mca_base_framework_close (&opal_memory_base_framework); } - /* All done */ + OBJ_DESTRUCT(&mca_rcache_base_vma_tree_items); + mca_rcache_base_vma_tree_items_inited = false; + + /* All done */ /* Close all remaining available components */ return mca_base_framework_components_close(&opal_rcache_base_framework, NULL); } @@ -117,11 +132,42 @@ static int mca_rcache_base_open(mca_base_open_flag_t flags) OBJ_CONSTRUCT(&mca_rcache_base_modules, opal_list_t); + /* the free list is only initialized when a VMA tree is created */ + OBJ_CONSTRUCT(&mca_rcache_base_vma_tree_items, opal_free_list_t); + /* Open up all available components */ return mca_base_framework_components_open(&opal_rcache_base_framework, flags); } -MCA_BASE_FRAMEWORK_DECLARE(opal, rcache, "OPAL Registration Cache", NULL, +static int mca_rcache_base_register_mca_variables (mca_base_register_flag_t flags) +{ + + mca_rcache_base_vma_tree_items_min = TREE_ITEMS_MIN; + (void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_min", + "Minimum number of VMA tree items to allocate (default: " + STRINGIFY(TREE_ITEMS_MIN) ")", MCA_BASE_VAR_TYPE_UNSIGNED_INT, + NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_min); + + mca_rcache_base_vma_tree_items_max = TREE_ITEMS_MAX; + (void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_max", + "Maximum number of VMA tree items to allocate (default: " + STRINGIFY(TREE_ITEMS_MAX) ", -1: unlimited)", MCA_BASE_VAR_TYPE_INT, + NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_max); + + mca_rcache_base_vma_tree_items_inc = TREE_ITEMS_INC; + (void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_inc", + "Number of VMA tree items to allocate at a time (default: " + STRINGIFY(TREE_ITEMS_INC) ")", MCA_BASE_VAR_TYPE_UNSIGNED_INT, + NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6, + MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_inc); + + return OPAL_SUCCESS; +} + +MCA_BASE_FRAMEWORK_DECLARE(opal, rcache, "OPAL Registration Cache", + mca_rcache_base_register_mca_variables, mca_rcache_base_open, mca_rcache_base_close, mca_rcache_base_static_components, 0); diff --git a/opal/mca/rcache/base/rcache_base_vma.c b/opal/mca/rcache/base/rcache_base_vma.c index fb776c4ca97..2b558881cbc 100644 --- a/opal/mca/rcache/base/rcache_base_vma.c +++ b/opal/mca/rcache/base/rcache_base_vma.c @@ -14,7 +14,7 @@ * Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2009 IBM Corporation. All rights reserved. * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights * reserved. * * $COPYRIGHT$ @@ -31,6 +31,7 @@ #include "opal/mca/rcache/rcache.h" #include "rcache_base_vma.h" #include "rcache_base_vma_tree.h" +#include "opal/mca/rcache/base/base.h" /** * Initialize the rcache @@ -52,6 +53,14 @@ OBJ_CLASS_INSTANCE(mca_rcache_base_vma_module_t, opal_object_t, mca_rcache_base_vma_module_t *mca_rcache_base_vma_module_alloc (void) { + if (!mca_rcache_base_vma_tree_items_inited) { + opal_free_list_init (&mca_rcache_base_vma_tree_items, sizeof (mca_rcache_base_vma_item_t), + 8, OBJ_CLASS(mca_rcache_base_vma_item_t), 0, 8, + mca_rcache_base_vma_tree_items_min, mca_rcache_base_vma_tree_items_max, + mca_rcache_base_vma_tree_items_inc, NULL, 0, NULL, NULL, NULL); + mca_rcache_base_vma_tree_items_inited = true; + } + return OBJ_NEW(mca_rcache_base_vma_module_t); } diff --git a/opal/mca/rcache/base/rcache_base_vma_tree.c b/opal/mca/rcache/base/rcache_base_vma_tree.c index a9f4375958e..3099571b400 100644 --- a/opal/mca/rcache/base/rcache_base_vma_tree.c +++ b/opal/mca/rcache/base/rcache_base_vma_tree.c @@ -29,12 +29,14 @@ #include "opal/util/output.h" #include "rcache_base_vma_tree.h" +#include "opal/mca/rcache/base/base.h" OBJ_CLASS_INSTANCE(mca_rcache_base_vma_reg_list_item_t, opal_list_item_t, NULL, NULL); static void mca_rcache_base_vma_item_construct (mca_rcache_base_vma_item_t *vma_item) { OBJ_CONSTRUCT(&vma_item->reg_list, opal_list_t); + vma_item->in_use = false; } static void mca_rcache_base_vma_item_destruct (mca_rcache_base_vma_item_t *vma_item) @@ -42,7 +44,7 @@ static void mca_rcache_base_vma_item_destruct (mca_rcache_base_vma_item_t *vma_i OPAL_LIST_DESTRUCT(&vma_item->reg_list); } -OBJ_CLASS_INSTANCE(mca_rcache_base_vma_item_t, opal_list_item_t, +OBJ_CLASS_INSTANCE(mca_rcache_base_vma_item_t, opal_free_list_item_t, mca_rcache_base_vma_item_construct, mca_rcache_base_vma_item_destruct); @@ -116,8 +118,7 @@ static inline mca_rcache_base_vma_item_t *mca_rcache_base_vma_new (mca_rcache_base_vma_module_t *vma_module, uintptr_t start, uintptr_t end) { - mca_rcache_base_vma_item_t *vma_item = OBJ_NEW(mca_rcache_base_vma_item_t); - + mca_rcache_base_vma_item_t *vma_item = (mca_rcache_base_vma_item_t *) opal_free_list_get (&mca_rcache_base_vma_tree_items); if (NULL == vma_item) { return NULL; } @@ -131,6 +132,16 @@ mca_rcache_base_vma_item_t *mca_rcache_base_vma_new (mca_rcache_base_vma_module_ return vma_item; } +static void mca_rcache_base_vma_return (mca_rcache_base_vma_module_t *vma_module, mca_rcache_base_vma_item_t *vma_item) +{ + opal_list_item_t *item; + while (NULL != (item = opal_list_remove_first (&vma_item->reg_list))) { + OBJ_RELEASE(item); + } + + opal_free_list_return (&mca_rcache_base_vma_tree_items, &vma_item->super); +} + static inline int mca_rcache_base_vma_compare_regs (mca_rcache_base_registration_t *reg1, mca_rcache_base_registration_t *reg2) { @@ -186,7 +197,7 @@ static inline void mca_rcache_base_vma_remove_reg (mca_rcache_base_vma_item_t *v mca_rcache_base_vma_reg_list_item_t *item; OPAL_LIST_FOREACH(item, &vma_item->reg_list, mca_rcache_base_vma_reg_list_item_t) { - if(item->reg == reg) { + if (item->reg == reg) { opal_list_remove_item(&vma_item->reg_list, &item->super); OBJ_RELEASE(item); break; @@ -388,7 +399,7 @@ int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module, /* all the registrations in the vma may be deleted by the callback so keep a * reference until we are done with it. */ - OBJ_RETAIN(vma); + vma->in_use = true; OPAL_LIST_FOREACH_SAFE(vma_item, next, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) { rc = callback_fn (vma_item->reg, ctx); @@ -397,7 +408,7 @@ int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module, } } - OBJ_RELEASE(vma); + vma->in_use = false; if (OPAL_SUCCESS != rc) { break; @@ -419,12 +430,26 @@ static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t * * into deadlock problems with some libc versions. The caller MUST hold the vma_lock * when calling this function. */ -static void mca_rcache_base_vma_cleanup (mca_rcache_base_vma_module_t *vma_module) +static void mca_rcache_base_vma_cleanup (mca_rcache_base_vma_module_t *vma_module, int depth) { - opal_list_item_t *item; + mca_rcache_base_vma_item_t *item; + + while (NULL != (item = (mca_rcache_base_vma_item_t *) opal_lifo_pop_atomic (&vma_module->vma_gc_lifo))) { + if (OPAL_UNLIKELY(item->in_use)) { + /* another thread is currently iterating on this vma and its registrations */ + if (depth < 8) { + /* try to clean up additional vmas before returning */ + mca_rcache_base_vma_cleanup (vma_module, depth + 1); + } - while (NULL != (item = opal_lifo_pop_atomic (&vma_module->vma_gc_lifo))) { - OBJ_RELEASE(item); + if (item->in_use) { + /* will clean it up later */ + opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &item->super.super); + return; + } + } + + mca_rcache_base_vma_return (vma_module, (mca_rcache_base_vma_item_t *) item); } } @@ -434,7 +459,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module, mca_rcache_base_vma_item_t *i; uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound; - mca_rcache_base_vma_cleanup (vma_module); + mca_rcache_base_vma_cleanup (vma_module, 0); opal_mutex_lock (&vma_module->vma_lock); @@ -448,7 +473,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module, while (begin <= end) { mca_rcache_base_vma_item_t *vma = NULL; - if (opal_list_get_end (&vma_module->vma_list) == &i->super) { + if (opal_list_get_end (&vma_module->vma_list) == &i->super.super) { if (mca_rcache_base_vma_can_insert (vma_module, end - begin + 1, limit)) { vma = mca_rcache_base_vma_new(vma_module, begin, end); } @@ -459,7 +484,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module, mca_rcache_base_vma_update_byte_count (vma_module, end - begin + 1); - opal_list_append(&vma_module->vma_list, &vma->super); + opal_list_append(&vma_module->vma_list, &vma->super.super); begin = vma->end + 1; mca_rcache_base_vma_add_reg (vma, reg); opal_mutex_unlock (&vma_module->vma_lock); @@ -479,7 +504,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module, mca_rcache_base_vma_update_byte_count (vma_module, tend - begin + 1); /* insert before */ - opal_list_insert_pos(&vma_module->vma_list, &i->super, &vma->super); + opal_list_insert_pos(&vma_module->vma_list, &i->super.super, &vma->super.super); i = vma; begin = vma->end + 1; mca_rcache_base_vma_add_reg (vma, reg); @@ -497,7 +522,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module, /* add after */ opal_list_insert_pos (&vma_module->vma_list, opal_list_get_next (&i->super), - &vma->super); + &vma->super.super); mca_rcache_base_vma_add_reg (i, reg); begin = end + 1; } else { @@ -517,8 +542,8 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module, /* add after */ opal_list_insert_pos (&vma_module->vma_list, - opal_list_get_next (&i->super), - &vma->super); + opal_list_get_next (&i->super.super), + &vma->super.super); } i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super); @@ -569,15 +594,15 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module, opal_rb_tree_delete (&vma_module->rb_tree, vma); mca_rcache_base_vma_update_byte_count (vma_module, vma->start - vma->end - 1); - opal_list_remove_item (&vma_module->vma_list, &vma->super); - opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super); + opal_list_remove_item (&vma_module->vma_list, &vma->super.super); + opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super.super); vma = next; } else { int merged; do { mca_rcache_base_vma_item_t *prev = NULL, *next = NULL; - if (opal_list_get_first (&vma_module->vma_list) != &vma->super) { + if (opal_list_get_first (&vma_module->vma_list) != &vma->super.super) { prev = (mca_rcache_base_vma_item_t *) opal_list_get_prev(vma); } @@ -586,23 +611,23 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module, if (prev && vma->start == prev->end + 1 && mca_rcache_base_vma_compare_reg_lists(vma, prev)) { prev->end = vma->end; - opal_list_remove_item(&vma_module->vma_list, &vma->super); + opal_list_remove_item(&vma_module->vma_list, &vma->super.super); opal_rb_tree_delete(&vma_module->rb_tree, vma); - opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super); + opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super.super); vma = prev; merged = 1; } - if (opal_list_get_last (&vma_module->vma_list) != &vma->super) { + if (opal_list_get_last (&vma_module->vma_list) != &vma->super.super) { next = (mca_rcache_base_vma_item_t *) opal_list_get_next (vma); } if (next && vma->end + 1 == next->start && mca_rcache_base_vma_compare_reg_lists (vma, next)) { vma->end = next->end; - opal_list_remove_item(&vma_module->vma_list, &next->super); + opal_list_remove_item(&vma_module->vma_list, &next->super.super); opal_rb_tree_delete(&vma_module->rb_tree, next); - opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &next->super); + opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &next->super.super); merged = 1; } } while (merged); diff --git a/opal/mca/rcache/base/rcache_base_vma_tree.h b/opal/mca/rcache/base/rcache_base_vma_tree.h index 2c4f0b92ab6..0e4185e0647 100644 --- a/opal/mca/rcache/base/rcache_base_vma_tree.h +++ b/opal/mca/rcache/base/rcache_base_vma_tree.h @@ -15,7 +15,7 @@ * Copyright (c) 2009 IBM Corporation. All rights reserved. * * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -52,10 +52,11 @@ OBJ_CLASS_DECLARATION(mca_rcache_base_vma_reg_list_item_t); */ struct mca_rcache_base_vma_item_t { - opal_list_item_t super; /**< the parent class */ + opal_free_list_item_t super; /**< the parent class */ uintptr_t start; /**< the base of the memory range */ uintptr_t end; /**< the bound of the memory range */ opal_list_t reg_list; /**< list of regs on this vma */ + bool in_use; /**< vma is in use in iterate */ mca_rcache_base_vma_module_t *vma_module; /**< pointer to rcache vma belongs to */ }; typedef struct mca_rcache_base_vma_item_t mca_rcache_base_vma_item_t;