Skip to content

Commit

Permalink
rcache/base: reduce probability of deadlock when hooking madvise
Browse files Browse the repository at this point in the history
The current VMA cache implementation backing rcache/grdma can run into
a deadlock situation in multi-threaded code when madvise is hooked and
the c library uses locks. In this case we may run into the following
situation:

Thread 1:

    ...
    free ()           <- Holding libc lock
    madvice_hook ()
    vma_iteration ()  <- Blocked waiting for vma lock

Thread 2:
    ...
    vma_insert ()     <- Holding vma lock
    vma_item_new ()
    malloc ()         <- Blocked waiting for libc lock

To fix this problem we chose to remove the madvise () hook but that
fix is causing issue #3685. This commit aims to greatly reduce the
chance that the deadlock will be hit by putting vma items into a free
list. This moves the allocation outside the vma lock. In general there
are a relatively small number of vma items so the default is to
allocate 2048 vma items. This default is configurable but it is likely
the number is too large not too small.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
  • Loading branch information
hjelmn committed Aug 7, 2017
1 parent 8e0ca63 commit b6bf3f4
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 31 deletions.
6 changes: 6 additions & 0 deletions opal/mca/rcache/base/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ OPAL_DECLSPEC mca_rcache_base_component_t *mca_rcache_base_component_lookup(cons
OPAL_DECLSPEC mca_rcache_base_module_t *mca_rcache_base_module_lookup (const char *name);
OPAL_DECLSPEC int mca_rcache_base_module_destroy(mca_rcache_base_module_t *module);

extern opal_free_list_t mca_rcache_base_vma_tree_items;
extern bool mca_rcache_base_vma_tree_items_inited;
extern unsigned int mca_rcache_base_vma_tree_items_min;
extern int mca_rcache_base_vma_tree_items_max;
extern unsigned int mca_rcache_base_vma_tree_items_inc;

/* only used within base -- no need to DECLSPEC */
extern int mca_rcache_base_used_mem_hooks;

Expand Down
1 change: 1 addition & 0 deletions opal/mca/rcache/base/rcache_base_create.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "opal/mca/rcache/rcache.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/rcache/base/rcache_base_mem_cb.h"
#include "rcache_base_vma_tree.h"
#include "opal/util/show_help.h"
#include "opal/util/proc.h"

Expand Down
52 changes: 49 additions & 3 deletions opal/mca/rcache/base/rcache_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,17 @@

#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_pvar.h"
#include "opal/mca/rcache/rcache.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/memoryhooks/memory.h"
#include "opal/constants.h"
#include "rcache_base_mem_cb.h"

/* two-level macro for stringifying a number */
#define STRINGIFYX(x) #x
#define STRINGIFY(x) STRINGIFYX(x)

/*
* The following file was created by configure. It contains extern
* statements and the definition of an array of pointers to each
Expand All @@ -59,12 +64,19 @@ static void mca_rcache_base_registration_constructor( mca_rcache_base_registrati
OBJ_CLASS_INSTANCE(mca_rcache_base_registration_t, opal_free_list_item_t,
mca_rcache_base_registration_constructor, NULL);

#define TREE_ITEMS_MIN 2048
#define TREE_ITEMS_MAX 16384
#define TREE_ITEMS_INC 2048

/*
* Global variables
*/
opal_list_t mca_rcache_base_modules = {{0}};

opal_free_list_t mca_rcache_base_vma_tree_items = {{0}};
bool mca_rcache_base_vma_tree_items_inited = false;
unsigned int mca_rcache_base_vma_tree_items_min = TREE_ITEMS_MIN;
int mca_rcache_base_vma_tree_items_max = TREE_ITEMS_MAX;
unsigned int mca_rcache_base_vma_tree_items_inc = TREE_ITEMS_INC;

OBJ_CLASS_INSTANCE(mca_rcache_base_selected_module_t, opal_list_item_t, NULL, NULL);

Expand Down Expand Up @@ -100,8 +112,11 @@ static int mca_rcache_base_close(void)
time between now and end of application (even post main()!) */
(void) mca_base_framework_close (&opal_memory_base_framework);
}
/* All done */

OBJ_DESTRUCT(&mca_rcache_base_vma_tree_items);
mca_rcache_base_vma_tree_items_inited = false;

/* All done */
/* Close all remaining available components */
return mca_base_framework_components_close(&opal_rcache_base_framework, NULL);
}
Expand All @@ -117,11 +132,42 @@ static int mca_rcache_base_open(mca_base_open_flag_t flags)

OBJ_CONSTRUCT(&mca_rcache_base_modules, opal_list_t);

/* the free list is only initialized when a VMA tree is created */
OBJ_CONSTRUCT(&mca_rcache_base_vma_tree_items, opal_free_list_t);

/* Open up all available components */
return mca_base_framework_components_open(&opal_rcache_base_framework, flags);
}

MCA_BASE_FRAMEWORK_DECLARE(opal, rcache, "OPAL Registration Cache", NULL,
static int mca_rcache_base_register_mca_variables (mca_base_register_flag_t flags)
{

mca_rcache_base_vma_tree_items_min = TREE_ITEMS_MIN;
(void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_min",
"Minimum number of VMA tree items to allocate (default: "
STRINGIFY(TREE_ITEMS_MIN) ")", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_min);

mca_rcache_base_vma_tree_items_max = TREE_ITEMS_MAX;
(void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_max",
"Maximum number of VMA tree items to allocate (default: "
STRINGIFY(TREE_ITEMS_MAX) ", -1: unlimited)", MCA_BASE_VAR_TYPE_INT,
NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_max);

mca_rcache_base_vma_tree_items_inc = TREE_ITEMS_INC;
(void) mca_base_framework_var_register (&opal_rcache_base_framework, "vma_tree_items_inc",
"Number of VMA tree items to allocate at a time (default: "
STRINGIFY(TREE_ITEMS_INC) ")", MCA_BASE_VAR_TYPE_UNSIGNED_INT,
NULL, MCA_BASE_VAR_BIND_NO_OBJECT, 0, OPAL_INFO_LVL_6,
MCA_BASE_VAR_SCOPE_READONLY, &mca_rcache_base_vma_tree_items_inc);

return OPAL_SUCCESS;
}

MCA_BASE_FRAMEWORK_DECLARE(opal, rcache, "OPAL Registration Cache",
mca_rcache_base_register_mca_variables,
mca_rcache_base_open, mca_rcache_base_close,
mca_rcache_base_static_components, 0);

11 changes: 10 additions & 1 deletion opal/mca/rcache/base/rcache_base_vma.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
Expand All @@ -31,6 +31,7 @@
#include "opal/mca/rcache/rcache.h"
#include "rcache_base_vma.h"
#include "rcache_base_vma_tree.h"
#include "opal/mca/rcache/base/base.h"

/**
* Initialize the rcache
Expand All @@ -52,6 +53,14 @@ OBJ_CLASS_INSTANCE(mca_rcache_base_vma_module_t, opal_object_t,

mca_rcache_base_vma_module_t *mca_rcache_base_vma_module_alloc (void)
{
if (!mca_rcache_base_vma_tree_items_inited) {
opal_free_list_init (&mca_rcache_base_vma_tree_items, sizeof (mca_rcache_base_vma_item_t),
8, OBJ_CLASS(mca_rcache_base_vma_item_t), 0, 8,
mca_rcache_base_vma_tree_items_min, mca_rcache_base_vma_tree_items_max,
mca_rcache_base_vma_tree_items_inc, NULL, 0, NULL, NULL, NULL);
mca_rcache_base_vma_tree_items_inited = true;
}

return OBJ_NEW(mca_rcache_base_vma_module_t);
}

Expand Down
76 changes: 51 additions & 25 deletions opal/mca/rcache/base/rcache_base_vma_tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,22 @@

#include "opal/util/output.h"
#include "rcache_base_vma_tree.h"
#include "opal/mca/rcache/base/base.h"

OBJ_CLASS_INSTANCE(mca_rcache_base_vma_reg_list_item_t, opal_list_item_t, NULL, NULL);

static void mca_rcache_base_vma_item_construct (mca_rcache_base_vma_item_t *vma_item)
{
OBJ_CONSTRUCT(&vma_item->reg_list, opal_list_t);
vma_item->in_use = false;
}

static void mca_rcache_base_vma_item_destruct (mca_rcache_base_vma_item_t *vma_item)
{
OPAL_LIST_DESTRUCT(&vma_item->reg_list);
}

OBJ_CLASS_INSTANCE(mca_rcache_base_vma_item_t, opal_list_item_t,
OBJ_CLASS_INSTANCE(mca_rcache_base_vma_item_t, opal_free_list_item_t,
mca_rcache_base_vma_item_construct,
mca_rcache_base_vma_item_destruct);

Expand Down Expand Up @@ -116,8 +118,7 @@ static inline
mca_rcache_base_vma_item_t *mca_rcache_base_vma_new (mca_rcache_base_vma_module_t *vma_module,
uintptr_t start, uintptr_t end)
{
mca_rcache_base_vma_item_t *vma_item = OBJ_NEW(mca_rcache_base_vma_item_t);

mca_rcache_base_vma_item_t *vma_item = (mca_rcache_base_vma_item_t *) opal_free_list_get (&mca_rcache_base_vma_tree_items);
if (NULL == vma_item) {
return NULL;
}
Expand All @@ -131,6 +132,17 @@ mca_rcache_base_vma_item_t *mca_rcache_base_vma_new (mca_rcache_base_vma_module_
return vma_item;
}

/* NTH: this function MUST not allocate or deallocate memory */
static void mca_rcache_base_vma_return (mca_rcache_base_vma_module_t *vma_module, mca_rcache_base_vma_item_t *vma_item)
{
opal_list_item_t *item;
while (NULL != (item = opal_list_remove_first (&vma_item->reg_list))) {
OBJ_RELEASE(item);
}

opal_free_list_return (&mca_rcache_base_vma_tree_items, &vma_item->super);
}

static inline int mca_rcache_base_vma_compare_regs (mca_rcache_base_registration_t *reg1,
mca_rcache_base_registration_t *reg2)
{
Expand Down Expand Up @@ -186,7 +198,7 @@ static inline void mca_rcache_base_vma_remove_reg (mca_rcache_base_vma_item_t *v
mca_rcache_base_vma_reg_list_item_t *item;

OPAL_LIST_FOREACH(item, &vma_item->reg_list, mca_rcache_base_vma_reg_list_item_t) {
if(item->reg == reg) {
if (item->reg == reg) {
opal_list_remove_item(&vma_item->reg_list, &item->super);
OBJ_RELEASE(item);
break;
Expand Down Expand Up @@ -388,7 +400,7 @@ int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module,

/* all the registrations in the vma may be deleted by the callback so keep a
* reference until we are done with it. */
OBJ_RETAIN(vma);
vma->in_use = true;

OPAL_LIST_FOREACH_SAFE(vma_item, next, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
rc = callback_fn (vma_item->reg, ctx);
Expand All @@ -397,7 +409,7 @@ int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module,
}
}

OBJ_RELEASE(vma);
vma->in_use = false;

if (OPAL_SUCCESS != rc) {
break;
Expand All @@ -419,12 +431,26 @@ static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *
* into deadlock problems with some libc versions. The caller MUST hold the vma_lock
* when calling this function.
*/
static void mca_rcache_base_vma_cleanup (mca_rcache_base_vma_module_t *vma_module)
static void mca_rcache_base_vma_cleanup (mca_rcache_base_vma_module_t *vma_module, int depth)
{
opal_list_item_t *item;
mca_rcache_base_vma_item_t *item;

while (NULL != (item = (mca_rcache_base_vma_item_t *) opal_lifo_pop_atomic (&vma_module->vma_gc_lifo))) {
if (OPAL_UNLIKELY(item->in_use)) {
/* another thread is currently iterating on this vma and its registrations */
if (depth < 8) {
/* try to clean up additional vmas before returning */
mca_rcache_base_vma_cleanup (vma_module, depth + 1);
}

while (NULL != (item = opal_lifo_pop_atomic (&vma_module->vma_gc_lifo))) {
OBJ_RELEASE(item);
if (item->in_use) {
/* will clean it up later */
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &item->super.super);
return;
}
}

mca_rcache_base_vma_return (vma_module, (mca_rcache_base_vma_item_t *) item);
}
}

Expand All @@ -434,7 +460,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_vma_item_t *i;
uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound;

mca_rcache_base_vma_cleanup (vma_module);
mca_rcache_base_vma_cleanup (vma_module, 0);

opal_mutex_lock (&vma_module->vma_lock);

Expand All @@ -448,7 +474,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
while (begin <= end) {
mca_rcache_base_vma_item_t *vma = NULL;

if (opal_list_get_end (&vma_module->vma_list) == &i->super) {
if (opal_list_get_end (&vma_module->vma_list) == &i->super.super) {
if (mca_rcache_base_vma_can_insert (vma_module, end - begin + 1, limit)) {
vma = mca_rcache_base_vma_new(vma_module, begin, end);
}
Expand All @@ -459,7 +485,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,

mca_rcache_base_vma_update_byte_count (vma_module, end - begin + 1);

opal_list_append(&vma_module->vma_list, &vma->super);
opal_list_append(&vma_module->vma_list, &vma->super.super);
begin = vma->end + 1;
mca_rcache_base_vma_add_reg (vma, reg);
opal_mutex_unlock (&vma_module->vma_lock);
Expand All @@ -479,7 +505,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_vma_update_byte_count (vma_module, tend - begin + 1);

/* insert before */
opal_list_insert_pos(&vma_module->vma_list, &i->super, &vma->super);
opal_list_insert_pos(&vma_module->vma_list, &i->super.super, &vma->super.super);
i = vma;
begin = vma->end + 1;
mca_rcache_base_vma_add_reg (vma, reg);
Expand All @@ -497,7 +523,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
/* add after */
opal_list_insert_pos (&vma_module->vma_list,
opal_list_get_next (&i->super),
&vma->super);
&vma->super.super);
mca_rcache_base_vma_add_reg (i, reg);
begin = end + 1;
} else {
Expand All @@ -517,8 +543,8 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,

/* add after */
opal_list_insert_pos (&vma_module->vma_list,
opal_list_get_next (&i->super),
&vma->super);
opal_list_get_next (&i->super.super),
&vma->super.super);
}

i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super);
Expand Down Expand Up @@ -569,15 +595,15 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
opal_rb_tree_delete (&vma_module->rb_tree, vma);
mca_rcache_base_vma_update_byte_count (vma_module,
vma->start - vma->end - 1);
opal_list_remove_item (&vma_module->vma_list, &vma->super);
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super);
opal_list_remove_item (&vma_module->vma_list, &vma->super.super);
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super.super);
vma = next;
} else {
int merged;

do {
mca_rcache_base_vma_item_t *prev = NULL, *next = NULL;
if (opal_list_get_first (&vma_module->vma_list) != &vma->super) {
if (opal_list_get_first (&vma_module->vma_list) != &vma->super.super) {
prev = (mca_rcache_base_vma_item_t *) opal_list_get_prev(vma);
}

Expand All @@ -586,23 +612,23 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
if (prev && vma->start == prev->end + 1 &&
mca_rcache_base_vma_compare_reg_lists(vma, prev)) {
prev->end = vma->end;
opal_list_remove_item(&vma_module->vma_list, &vma->super);
opal_list_remove_item(&vma_module->vma_list, &vma->super.super);
opal_rb_tree_delete(&vma_module->rb_tree, vma);
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super);
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &vma->super.super);
vma = prev;
merged = 1;
}

if (opal_list_get_last (&vma_module->vma_list) != &vma->super) {
if (opal_list_get_last (&vma_module->vma_list) != &vma->super.super) {
next = (mca_rcache_base_vma_item_t *) opal_list_get_next (vma);
}

if (next && vma->end + 1 == next->start &&
mca_rcache_base_vma_compare_reg_lists (vma, next)) {
vma->end = next->end;
opal_list_remove_item(&vma_module->vma_list, &next->super);
opal_list_remove_item(&vma_module->vma_list, &next->super.super);
opal_rb_tree_delete(&vma_module->rb_tree, next);
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &next->super);
opal_lifo_push_atomic (&vma_module->vma_gc_lifo, &next->super.super);
merged = 1;
}
} while (merged);
Expand Down
5 changes: 3 additions & 2 deletions opal/mca/rcache/base/rcache_base_vma_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Copyright (c) 2009 IBM Corporation. All rights reserved.
*
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2017 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -52,10 +52,11 @@ OBJ_CLASS_DECLARATION(mca_rcache_base_vma_reg_list_item_t);
*/
struct mca_rcache_base_vma_item_t
{
opal_list_item_t super; /**< the parent class */
opal_free_list_item_t super; /**< the parent class */
uintptr_t start; /**< the base of the memory range */
uintptr_t end; /**< the bound of the memory range */
opal_list_t reg_list; /**< list of regs on this vma */
bool in_use; /**< vma is in use in iterate */
mca_rcache_base_vma_module_t *vma_module; /**< pointer to rcache vma belongs to */
};
typedef struct mca_rcache_base_vma_item_t mca_rcache_base_vma_item_t;
Expand Down

0 comments on commit b6bf3f4

Please sign in to comment.