Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v2.1 mpool/rcache rewrite #2101

Merged
merged 34 commits into from
Sep 28, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
dc338b0
Revert "mpool/udreg: manual patch to use mntent"
hjelmn Sep 13, 2016
fff0d33
Revert "mpool/rgpusm: update for rcache threading fixes"
hjelmn Sep 13, 2016
eaf0d04
Revert "rcache/udreg: make reference count thread safe"
hjelmn Sep 13, 2016
dde7747
Revert "rcache/grdma: fix typo in cuda code"
hjelmn Sep 13, 2016
7315cfb
Revert "rcache: fix deadlock in multi-threaded environments"
hjelmn Sep 13, 2016
08984bb
Revert "rcache/vma: add missing #include "opal/util/output.h""
hjelmn Sep 13, 2016
8e7853e
Adding memkind component to use MPI_Alloc_mem through memkind
vvenkates27 Jul 2, 2015
fecabe8
The latest version of memkind uses jemalloc as a submodule.
vvenkates27 Jul 11, 2015
c53c0d8
opal: rework mpool and rcache frameworks
hjelmn Nov 2, 2015
2d6aa79
Update memkind mpool for new mpool interface
hjelmn Nov 10, 2015
c50abd8
rcache: add major/minor/release version macros
hjelmn Dec 18, 2015
1ef2d90
btl/usnic: update for mpool/rcache rewrite
hjelmn Dec 18, 2015
a6bd396
Add missing include, remove unused variable
Mar 15, 2016
e504399
rcache: initialize common symbol mca_rcache_base_used_mem_hooks
ggouaillardet Mar 16, 2016
8f38103
usnic: allow mpool_hints to be empty
jsquyres Mar 16, 2016
9bb1d00
rcache/grdma: fix typo
hjelmn Mar 16, 2016
5884d38
opal: fix various coverity errors
hjelmn Mar 17, 2016
6834922
rcache/grdma: do not OBJ_RELEASE vma tree too early
hjelmn Mar 17, 2016
321b1ad
opal: fix coverity issues
hjelmn Mar 18, 2016
cdd624d
btl/scif: update for mpool/rcache rewrite
hjelmn Mar 23, 2016
afa3513
Nathan missed one reference to mpool.
bosilca Mar 24, 2016
7c58fb1
usnic: specify the cache name
jsquyres Mar 28, 2016
dd5a800
Add missing include
Apr 2, 2016
6de5b41
rcache/udreg: bug fixes
hjelmn Apr 2, 2016
4b7c44f
mpool/hugepage: use statvfs() instead of statfs() when needed.
ggouaillardet Apr 11, 2016
dc47335
rcache: add missing file
hjelmn Apr 14, 2016
4db758c
rcache/base: add missing file to tarball
hjelmn Apr 19, 2016
2d8ed8a
rcache: fix deadlock in multi-threaded environments
hjelmn May 16, 2016
29d3fad
rcache/grdma: silence a warning
ggouaillardet May 20, 2016
2ee589a
rcache/grdma: fix typo in cuda code
hjelmn May 24, 2016
3b70ad0
mpool/hugepage: use mntent API instead of manually parsing /proc/mounts
ggouaillardet Jul 4, 2016
827464c
rcache/udreg: make reference count thread safe
hjelmn Jul 27, 2016
83b02fe
mpool/hugepage mntent intro fallout
hppritcha Aug 2, 2016
333805c
btl: fix rcache rewrite merge error
hjelmn Sep 22, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions ompi/mca/pml/base/pml_base_bsend.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
Expand All @@ -12,6 +13,8 @@
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -55,10 +58,7 @@ extern char *ompi_pml_base_bsend_allocator_name;
/*
* Routine to return pages to sub-allocator as needed
*/
static void* mca_pml_bsend_alloc_segment(
struct mca_mpool_base_module_t* module,
size_t* size_inout,
mca_mpool_base_registration_t** registration)
static void* mca_pml_bsend_alloc_segment(void *ctx, size_t *size_inout)
{
void *addr;
size_t size = *size_inout;
Expand All @@ -70,7 +70,6 @@ static void* mca_pml_bsend_alloc_segment(
addr = mca_pml_bsend_addr;
mca_pml_bsend_addr += size;
*size_inout = size;
if (NULL != registration) *registration = NULL;
return addr;
}

Expand Down Expand Up @@ -232,7 +231,7 @@ int mca_pml_base_bsend_request_start(ompi_request_t* request)

/* allocate a buffer to hold packed message */
sendreq->req_addr = mca_pml_bsend_allocator->alc_alloc(
mca_pml_bsend_allocator, sendreq->req_bytes_packed, 0, NULL);
mca_pml_bsend_allocator, sendreq->req_bytes_packed, 0);
if(NULL == sendreq->req_addr) {
/* release resources when request is freed */
sendreq->req_base.req_pml_complete = true;
Expand Down Expand Up @@ -287,7 +286,7 @@ int mca_pml_base_bsend_request_alloc(ompi_request_t* request)

/* allocate a buffer to hold packed message */
sendreq->req_addr = mca_pml_bsend_allocator->alc_alloc(
mca_pml_bsend_allocator, sendreq->req_bytes_packed, 0, NULL);
mca_pml_bsend_allocator, sendreq->req_bytes_packed, 0);
if(NULL == sendreq->req_addr) {
/* release resources when request is freed */
sendreq->req_base.req_pml_complete = true;
Expand Down Expand Up @@ -321,7 +320,7 @@ void* mca_pml_base_bsend_request_alloc_buf( size_t length )

/* allocate a buffer to hold packed message */
buf = mca_pml_bsend_allocator->alc_alloc(
mca_pml_bsend_allocator, length, 0, NULL);
mca_pml_bsend_allocator, length, 0);
if(NULL == buf) {
/* release resources when request is freed */
OPAL_THREAD_UNLOCK(&mca_pml_bsend_mutex);
Expand Down
16 changes: 6 additions & 10 deletions ompi/mca/pml/ob1/pml_ob1_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,9 @@ mca_pml_base_component_2_0_0_t mca_pml_ob1_component = {
.pmlm_finalize = mca_pml_ob1_component_fini,
};

void *mca_pml_ob1_seg_alloc( struct mca_mpool_base_module_t* mpool,
size_t* size,
mca_mpool_base_registration_t** registration);
void *mca_pml_ob1_seg_alloc (void *ctx, size_t* size);

void mca_pml_ob1_seg_free( struct mca_mpool_base_module_t* mpool,
void* segment );
void mca_pml_ob1_seg_free (void *ctx, void *segment);

static inline int mca_pml_ob1_param_register_int(
const char* param_name,
Expand Down Expand Up @@ -366,13 +363,12 @@ int mca_pml_ob1_component_fini(void)
return OMPI_SUCCESS;
}

void *mca_pml_ob1_seg_alloc( struct mca_mpool_base_module_t* mpool,
size_t* size,
mca_mpool_base_registration_t** registration) {
/*
 * Allocate a segment from the heap.
 *
 * ctx   - not used by this implementation
 * size  - points to the number of bytes requested; the value is read but
 *         never modified
 *
 * Returns whatever malloc() returns for that size (NULL on failure).
 */
void *mca_pml_ob1_seg_alloc (void *ctx, size_t *size)
{
    const size_t nbytes = *size;
    void *segment = malloc(nbytes);

    return segment;
}

void mca_pml_ob1_seg_free( struct mca_mpool_base_module_t* mpool,
void* segment ) {
/*
 * Release a segment by handing it to free().
 *
 * ctx      - not used by this implementation
 * segment  - pointer passed straight to free()
 */
void mca_pml_ob1_seg_free (void *ctx, void *segment)
{
    (void) ctx;
    free(segment);
}
2 changes: 1 addition & 1 deletion ompi/mca/pml/ob1/pml_ob1_recvfrag.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ do { \
buffers[0].addr = (char*) \
mca_pml_ob1.allocator->alc_alloc( mca_pml_ob1.allocator, \
buffers[0].len, \
0, NULL); \
0); \
_ptr = (unsigned char*)(buffers[0].addr); \
macro_segments[0].seg_addr.pval = buffers[0].addr; \
} \
Expand Down
8 changes: 4 additions & 4 deletions ompi/mca/vprotocol/base/vprotocol_base_request.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ int mca_vprotocol_base_request_parasite(void)
pml_fl_save.fl_max_to_alloc,
pml_fl_save.fl_num_per_alloc,
pml_fl_save.fl_mpool,
pml_fl_save.fl_mpool_reg_flags,
0,
pml_fl_save.fl_rcache_reg_flags,
pml_fl_save.fl_rcache,
pml_fl_save.item_init,
pml_fl_save.ctx);
if(OMPI_SUCCESS != ret) return ret;
Expand Down Expand Up @@ -71,8 +71,8 @@ int mca_vprotocol_base_request_parasite(void)
pml_fl_save.fl_max_to_alloc,
pml_fl_save.fl_num_per_alloc,
pml_fl_save.fl_mpool,
pml_fl_save.fl_mpool_reg_flags,
0,
pml_fl_save.fl_rcache_reg_flags,
pml_fl_save.fl_rcache,
pml_fl_save.item_init,
pml_fl_save.ctx);
if(OMPI_SUCCESS != ret) return ret;
Expand Down
16 changes: 15 additions & 1 deletion ompi/mpi/c/alloc_mem.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
Expand All @@ -12,6 +13,8 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -43,6 +46,8 @@ static const char FUNC_NAME[] = "MPI_Alloc_mem";

int MPI_Alloc_mem(MPI_Aint size, MPI_Info info, void *baseptr)
{
char info_value[MPI_MAX_INFO_VAL + 1];
char *mpool_hints = NULL;

if (MPI_PARAM_CHECK) {
OMPI_ERR_INIT_FINALIZE(FUNC_NAME);
Expand All @@ -67,7 +72,16 @@ int MPI_Alloc_mem(MPI_Aint size, MPI_Info info, void *baseptr)
return MPI_SUCCESS;
}

*((void **) baseptr) = mca_mpool_base_alloc((size_t) size, (struct opal_info_t*)info);
if (MPI_INFO_NULL != info) {
int flag;
(void) ompi_info_get (info, "mpool_hints", MPI_MAX_INFO_VAL, info_value, &flag);
if (flag) {
mpool_hints = info_value;
}
}

*((void **) baseptr) = mca_mpool_base_alloc ((size_t) size, (struct opal_info_t*)info,
mpool_hints);
if (NULL == *((void **) baseptr)) {
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_NO_MEM,
FUNC_NAME);
Expand Down
7 changes: 0 additions & 7 deletions ompi/runtime/ompi_mpi_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -619,13 +619,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)

/* Select which MPI components to use */

if (OMPI_SUCCESS !=
(ret = mca_mpool_base_init(OPAL_ENABLE_PROGRESS_THREADS,
ompi_mpi_thread_multiple))) {
error = "mca_mpool_base_init() failed";
goto error;
}

if (OMPI_SUCCESS !=
(ret = mca_pml_base_select(OPAL_ENABLE_PROGRESS_THREADS,
ompi_mpi_thread_multiple))) {
Expand Down
97 changes: 58 additions & 39 deletions opal/class/opal_free_list.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
* Copyright (c) 2010-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
Expand All @@ -28,6 +28,9 @@
#include "opal/align.h"
#include "opal/util/output.h"
#include "opal/mca/mpool/mpool.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/rcache/rcache.h"
#include "opal/util/sys_limits.h"

typedef struct opal_free_list_item_t opal_free_list_memory_t;

Expand All @@ -49,17 +52,22 @@ static void opal_free_list_construct(opal_free_list_t* fl)
fl->fl_payload_buffer_alignment = 0;
fl->fl_frag_class = OBJ_CLASS(opal_free_list_item_t);
fl->fl_mpool = NULL;
fl->fl_rcache = NULL;
/* default flags */
fl->fl_mpool_reg_flags = MCA_MPOOL_FLAGS_CACHE_BYPASS |
MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM;
fl->fl_rcache_reg_flags = MCA_RCACHE_FLAGS_CACHE_BYPASS |
MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM;
fl->ctx = NULL;
OBJ_CONSTRUCT(&(fl->fl_allocations), opal_list_t);
}

static void opal_free_list_allocation_release (opal_free_list_t *fl, opal_free_list_memory_t *fl_mem)
{
if (NULL != fl->fl_rcache) {
fl->fl_rcache->rcache_deregister (fl->fl_rcache, fl_mem->registration);
}

if (NULL != fl->fl_mpool) {
fl->fl_mpool->mpool_free (fl->fl_mpool, fl_mem->ptr, fl_mem->registration);
fl->fl_mpool->mpool_free (fl->fl_mpool, fl_mem->ptr);
} else if (fl_mem->ptr) {
free (fl_mem->ptr);
}
Expand Down Expand Up @@ -108,8 +116,9 @@ int opal_free_list_init (opal_free_list_t *flist, size_t frag_size, size_t frag_
opal_class_t *frag_class, size_t payload_buffer_size,
size_t payload_buffer_alignment, int num_elements_to_alloc,
int max_elements_to_alloc, int num_elements_per_alloc,
mca_mpool_base_module_t* mpool, int mpool_reg_flags,
void *unused0, opal_free_list_item_init_fn_t item_init, void *ctx)
mca_mpool_base_module_t *mpool, int rcache_reg_flags,
mca_rcache_base_module_t *rcache, opal_free_list_item_init_fn_t item_init,
void *ctx)
{
/* alignment must be more than zero and power of two */
if (frag_alignment <= 1 || (frag_alignment & (frag_alignment - 1))) {
Expand Down Expand Up @@ -137,11 +146,12 @@ int opal_free_list_init (opal_free_list_t *flist, size_t frag_size, size_t frag_
flist->fl_max_to_alloc = max_elements_to_alloc;
flist->fl_num_allocated = 0;
flist->fl_num_per_alloc = num_elements_per_alloc;
flist->fl_mpool = mpool;
flist->fl_mpool = mpool ? mpool : mca_mpool_base_default_module;
flist->fl_rcache = rcache;
flist->fl_frag_alignment = frag_alignment;
flist->fl_payload_buffer_alignment = payload_buffer_alignment;
flist->item_init = item_init;
flist->fl_mpool_reg_flags |= mpool_reg_flags;
flist->fl_rcache_reg_flags |= rcache_reg_flags;
flist->ctx = ctx;

if (num_elements_to_alloc) {
Expand All @@ -153,10 +163,10 @@ int opal_free_list_init (opal_free_list_t *flist, size_t frag_size, size_t frag_

int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements)
{
unsigned char *ptr, *mpool_alloc_ptr = NULL, *payload_ptr = NULL;
unsigned char *ptr, *payload_ptr = NULL;
opal_free_list_memory_t *alloc_ptr;
size_t alloc_size, head_size, elem_size = 0;
mca_mpool_base_registration_t *reg = NULL;
size_t alloc_size, head_size, elem_size = 0, buffer_size, align;
mca_rcache_base_registration_t *reg = NULL;
int rc = OPAL_SUCCESS;

if (flist->fl_max_to_alloc && (flist->fl_num_allocated + num_elements) >
Expand All @@ -170,6 +180,29 @@ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements)

head_size = OPAL_ALIGN(flist->fl_frag_size, flist->fl_frag_alignment, size_t);

/* NTH: calculate allocation alignment first as it might change the number of elements */
if (0 != flist->fl_payload_buffer_size) {
elem_size = OPAL_ALIGN(flist->fl_payload_buffer_size,
flist->fl_payload_buffer_alignment, size_t);

/* elem_size should not be 0 here */
assert (elem_size > 0);

buffer_size = num_elements * elem_size;
align = flist->fl_payload_buffer_alignment;

if (MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM & flist->fl_rcache_reg_flags) {
size_t pagesize = opal_getpagesize ();
/* CUDA cannot handle registering overlapping regions, so make
* sure each region is page sized and page aligned. */
align = OPAL_ALIGN(align, pagesize, size_t);
buffer_size = OPAL_ALIGN(buffer_size, pagesize, size_t);

/* avoid wasting space in the buffer */
num_elements = buffer_size / elem_size;
}
}

/* calculate head allocation size */
alloc_size = num_elements * head_size + sizeof(opal_free_list_memory_t) +
flist->fl_frag_alignment;
Expand All @@ -180,37 +213,23 @@ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements)
}

if (0 != flist->fl_payload_buffer_size) {
elem_size = OPAL_ALIGN(flist->fl_payload_buffer_size,
flist->fl_payload_buffer_alignment, size_t);

/* elem_size should not be 0 here */
assert (elem_size > 0);

/* allocate the rest from the mpool (or use memalign/malloc) */
if(flist->fl_mpool != NULL) {
payload_ptr = mpool_alloc_ptr =
(unsigned char *) flist->fl_mpool->mpool_alloc(flist->fl_mpool,
num_elements * elem_size,
flist->fl_payload_buffer_alignment,
flist->fl_mpool_reg_flags, &reg);
} else {
#ifdef HAVE_POSIX_MEMALIGN
posix_memalign ((void **) &mpool_alloc_ptr, flist->fl_payload_buffer_alignment,
num_elements * elem_size);
payload_ptr = mpool_alloc_ptr;
#else
mpool_alloc_ptr = (unsigned char *) malloc (num_elements * elem_size +
flist->fl_payload_buffer_alignment);
payload_ptr = (unsigned char *) OPAL_ALIGN((uintptr_t)mpool_alloc_ptr,
flist->fl_payload_buffer_alignment,
uintptr_t);
#endif
}

if(NULL == mpool_alloc_ptr) {
payload_ptr = (unsigned char *) flist->fl_mpool->mpool_alloc(flist->fl_mpool, buffer_size, align, 0);
if (NULL == payload_ptr) {
free(alloc_ptr);
return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
}

if (flist->fl_rcache) {
rc = flist->fl_rcache->rcache_register (flist->fl_rcache, payload_ptr, num_elements * elem_size,
flist->fl_rcache_reg_flags, MCA_RCACHE_ACCESS_ANY, &reg);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
free (alloc_ptr);
flist->fl_mpool->mpool_free (flist->fl_mpool, payload_ptr);

return rc;
}
}
}

/* make the alloc_ptr a list item, save the chunk in the allocations list,
Expand All @@ -219,7 +238,7 @@ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements)
opal_list_append(&(flist->fl_allocations), (opal_list_item_t*)alloc_ptr);

alloc_ptr->registration = reg;
alloc_ptr->ptr = mpool_alloc_ptr;
alloc_ptr->ptr = payload_ptr;

ptr = (unsigned char*)alloc_ptr + sizeof(opal_free_list_memory_t);
ptr = OPAL_ALIGN_PTR(ptr, flist->fl_frag_alignment, unsigned char*);
Expand Down
Loading