osc/pt2pt: do not repost receive from request callback
This commit fixes an issue that can occur when a target gets overwhelmed with
requests. In that case osc/pt2pt can go into deep recursion, with a stack like
req_complete_cb -> ompi_osc_pt2pt_callback -> start -> req_complete_cb -> ... .
At small scale this is fine because the recursion depth stays small, but at
larger scale the stack can quickly be exhausted while processing frag requests.
To fix the issue, the request callback now simply puts the request on a list
and returns. The osc/pt2pt progress function then handles the processing and
reposting of the request.

As part of this change, osc/pt2pt now posts multiple fragment receive requests
per window. This should help prevent a target from being overwhelmed.

Signed-off-by: Nathan Hjelm <hjelmn@me.com>
hjelmn committed Aug 11, 2016
1 parent 8d0baf1 commit 7589a25
Showing 8 changed files with 184 additions and 66 deletions.
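Before the per-file diffs, here is a minimal standalone C sketch of the deferral pattern described in the commit message above: the completion callback only appends the finished receive to a locked list, and the component progress function later drains that list, processes each fragment, and reposts the receive. This is an illustration only; plain C with pthreads and a hand-rolled FIFO stand in for opal_list_t, OPAL_THREAD_LOCK and the pml request machinery, and recv_item_t, process_fragment and repost_receive are made-up names rather than osc/pt2pt API.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy receive descriptor; the real code carries a module pointer, a pml
 * request and the fragment buffer (struct ompi_osc_pt2pt_receive_t). */
typedef struct recv_item {
    struct recv_item *next;
    int source;              /* stands in for req_status.MPI_SOURCE */
} recv_item_t;

/* Stand-ins for mca_osc_pt2pt_component.pending_receives and its lock. */
static recv_item_t *pending_head = NULL, *pending_tail = NULL;
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

/* Completion callback: O(1) work and no recursion -- just enqueue and return. */
static void receive_complete_cb (recv_item_t *recv)
{
    recv->next = NULL;
    pthread_mutex_lock (&pending_lock);
    if (pending_tail) {
        pending_tail->next = recv;
    } else {
        pending_head = recv;
    }
    pending_tail = recv;
    pthread_mutex_unlock (&pending_lock);
}

/* Stand-ins for fragment dispatch and for resetting the callback and
 * restarting the receive (MCA_PML_CALL(start) in the real code). */
static void process_fragment (recv_item_t *recv)
{
    printf ("processing fragment from source %d\n", recv->source);
}

static void repost_receive (recv_item_t *recv)
{
    (void) recv;
}

/* Progress function: take a snapshot count first (as the diff does with
 * opal_list_get_size) so completions queued while draining wait for the next
 * progress call, then pop, process and repost each receive. */
static int component_progress (void)
{
    int count = 0;

    pthread_mutex_lock (&pending_lock);
    for (recv_item_t *it = pending_head ; NULL != it ; it = it->next) {
        ++count;
    }
    pthread_mutex_unlock (&pending_lock);

    for (int i = 0 ; i < count ; ++i) {
        pthread_mutex_lock (&pending_lock);
        recv_item_t *recv = pending_head;
        if (NULL != recv) {
            pending_head = recv->next;
            if (NULL == pending_head) {
                pending_tail = NULL;
            }
        }
        pthread_mutex_unlock (&pending_lock);

        if (NULL == recv) {
            break;
        }

        process_fragment (recv);
        repost_receive (recv);
        free (recv);
    }

    return count;
}

int main (void)
{
    /* Simulate two receives completing before the next progress call. */
    for (int i = 0 ; i < 2 ; ++i) {
        recv_item_t *recv = malloc (sizeof (*recv));
        recv->source = i;
        receive_complete_cb (recv);
    }

    return (2 == component_progress ()) ? EXIT_SUCCESS : EXIT_FAILURE;
}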
26 changes: 24 additions & 2 deletions ompi/mca/osc/pt2pt/osc_pt2pt.h
@@ -47,6 +47,7 @@
BEGIN_C_DECLS

struct ompi_osc_pt2pt_frag_t;
struct ompi_osc_pt2pt_receive_t;

struct ompi_osc_pt2pt_component_t {
/** Extend the basic osc component interface */
@@ -61,6 +62,9 @@ struct ompi_osc_pt2pt_component_t {
/** module count */
int module_count;

/** number of buffers per window */
int receive_count;

/** free list of ompi_osc_pt2pt_frag_t structures */
opal_free_list_t frags;

@@ -76,6 +80,12 @@ struct ompi_osc_pt2pt_component_t {
/** List of operations that need to be processed */
opal_list_t pending_operations;

/** List of receives to be processed */
opal_list_t pending_receives;

/** Lock for pending_receives */
opal_mutex_t pending_receives_lock;

/** Is the progress function enabled? */
bool progress_enable;
};
@@ -192,8 +202,11 @@ struct ompi_osc_pt2pt_module_t {
/** origin side list of locks currently outstanding */
opal_hash_table_t outstanding_locks;

unsigned char *incoming_buffer;
ompi_request_t *frag_request;
/** receive fragments */
struct ompi_osc_pt2pt_receive_t *recv_frags;

/** number of receive fragments */
int recv_frag_count;

/* enforce accumulate semantics */
opal_atomic_lock_t accumulate_lock;
@@ -243,6 +256,15 @@ struct ompi_osc_pt2pt_pending_t {
typedef struct ompi_osc_pt2pt_pending_t ompi_osc_pt2pt_pending_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_pending_t);

struct ompi_osc_pt2pt_receive_t {
opal_list_item_t super;
ompi_osc_pt2pt_module_t *module;
ompi_request_t *pml_request;
void *buffer;
};
typedef struct ompi_osc_pt2pt_receive_t ompi_osc_pt2pt_receive_t;
OBJ_CLASS_DECLARATION(ompi_osc_pt2pt_receive_t);

#define GET_MODULE(win) ((ompi_osc_pt2pt_module_t*) win->w_osc_module)

extern bool ompi_osc_pt2pt_no_locks;
101 changes: 69 additions & 32 deletions ompi/mca/osc/pt2pt/osc_pt2pt_component.c
@@ -142,44 +142,62 @@ static int component_register (void)
NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&mca_osc_pt2pt_component.buffer_size);

mca_osc_pt2pt_component.receive_count = 4;
(void) mca_base_component_var_register (&mca_osc_pt2pt_component.super.osc_version, "receive_count",
"Number of receives to post for each window for incoming fragments "
"(default: 4)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY, &mca_osc_pt2pt_component.receive_count);

return OMPI_SUCCESS;
}
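A short usage note on the variable registered above: as a component-level MCA parameter it should be settable at launch time under the usual framework_component_name convention, e.g. mpirun --mca osc_pt2pt_receive_count 8 ./app (the full parameter name is inferred from that convention, not copied from ompi_info output). Negative values fall back to a single posted receive per window, as handled in ompi_osc_pt2pt_frag_start_receive later in this commit.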

static int component_progress (void)
{
int count = opal_list_get_size (&mca_osc_pt2pt_component.pending_operations);
int pending_count = opal_list_get_size (&mca_osc_pt2pt_component.pending_operations);
int recv_count = opal_list_get_size (&mca_osc_pt2pt_component.pending_receives);
ompi_osc_pt2pt_pending_t *pending, *next;

if (0 == count) {
return 0;
if (recv_count) {
for (int i = 0 ; i < recv_count ; ++i) {
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.pending_receives_lock);
ompi_osc_pt2pt_receive_t *recv = (ompi_osc_pt2pt_receive_t *) opal_list_remove_first (&mca_osc_pt2pt_component.pending_receives);
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.pending_receives_lock);
if (NULL == recv) {
break;
}

(void) ompi_osc_pt2pt_process_receive (recv);
}
}

/* process one incoming request */
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.pending_operations_lock);
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_pt2pt_component.pending_operations, ompi_osc_pt2pt_pending_t) {
int ret;

switch (pending->header.base.type) {
case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ:
ret = ompi_osc_pt2pt_process_flush (pending->module, pending->source,
&pending->header.flush);
break;
case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ:
ret = ompi_osc_pt2pt_process_unlock (pending->module, pending->source,
&pending->header.unlock);
break;
default:
/* shouldn't happen */
assert (0);
abort ();
}

if (OMPI_SUCCESS == ret) {
opal_list_remove_item (&mca_osc_pt2pt_component.pending_operations, &pending->super);
OBJ_RELEASE(pending);
}
if (pending_count) {
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.pending_operations_lock);
OPAL_LIST_FOREACH_SAFE(pending, next, &mca_osc_pt2pt_component.pending_operations, ompi_osc_pt2pt_pending_t) {
int ret;

switch (pending->header.base.type) {
case OMPI_OSC_PT2PT_HDR_TYPE_FLUSH_REQ:
ret = ompi_osc_pt2pt_process_flush (pending->module, pending->source,
&pending->header.flush);
break;
case OMPI_OSC_PT2PT_HDR_TYPE_UNLOCK_REQ:
ret = ompi_osc_pt2pt_process_unlock (pending->module, pending->source,
&pending->header.unlock);
break;
default:
/* shouldn't happen */
assert (0);
abort ();
}

if (OMPI_SUCCESS == ret) {
opal_list_remove_item (&mca_osc_pt2pt_component.pending_operations, &pending->super);
OBJ_RELEASE(pending);
}
}
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.pending_operations_lock);
}
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.pending_operations_lock);

return 1;
}
@@ -193,6 +211,8 @@ component_init(bool enable_progress_threads,
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_operations_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_receives, opal_list_t);
OBJ_CONSTRUCT(&mca_osc_pt2pt_component.pending_receives_lock, opal_mutex_t);

OBJ_CONSTRUCT(&mca_osc_pt2pt_component.modules,
opal_hash_table_t);
@@ -253,6 +273,8 @@ component_finalize(void)
OBJ_DESTRUCT(&mca_osc_pt2pt_component.requests);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_operations_lock);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_receives);
OBJ_DESTRUCT(&mca_osc_pt2pt_component.pending_receives_lock);

return OMPI_SUCCESS;
}
@@ -385,11 +407,6 @@ component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit
/* sync memory - make sure all initialization completed */
opal_atomic_mb();

module->incoming_buffer = malloc (mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t));
if (OPAL_UNLIKELY(NULL == module->incoming_buffer)) {
goto cleanup;
}

ret = ompi_osc_pt2pt_frag_start_receive (module);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
goto cleanup;
@@ -449,6 +466,26 @@ ompi_osc_pt2pt_get_info(struct ompi_win_t *win, struct ompi_info_t **info_used)

OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_pending_t, opal_list_item_t, NULL, NULL);

void ompi_osc_pt2pt_receive_construct (ompi_osc_pt2pt_receive_t *recv)
{
recv->buffer = NULL;
recv->pml_request = NULL;
}

void ompi_osc_pt2pt_receive_destruct (ompi_osc_pt2pt_receive_t *recv)
{
free (recv->buffer);
if (recv->pml_request && MPI_REQUEST_NULL != recv->pml_request) {
recv->pml_request->req_complete_cb = NULL;
ompi_request_cancel (recv->pml_request);
ompi_request_free (&recv->pml_request);
}
}

OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_receive_t, opal_list_item_t,
ompi_osc_pt2pt_receive_construct,
ompi_osc_pt2pt_receive_destruct);

static void ompi_osc_pt2pt_peer_construct (ompi_osc_pt2pt_peer_t *peer)
{
OBJ_CONSTRUCT(&peer->queued_frags, opal_list_t);
77 changes: 63 additions & 14 deletions ompi/mca/osc/pt2pt/osc_pt2pt_data_move.c
@@ -1616,11 +1616,38 @@ static inline int process_frag (ompi_osc_pt2pt_module_t *module,
/* dispatch for callback on message completion */
static int ompi_osc_pt2pt_callback (ompi_request_t *request)
{
ompi_osc_pt2pt_module_t *module = (ompi_osc_pt2pt_module_t *) request->req_complete_cb_data;
ompi_osc_pt2pt_header_t *base_header =
(ompi_osc_pt2pt_header_t *) module->incoming_buffer;
size_t incoming_length = request->req_status._ucount;
int source = request->req_status.MPI_SOURCE;
ompi_osc_pt2pt_receive_t *recv = (ompi_osc_pt2pt_receive_t *) request->req_complete_cb_data;

OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "received pt2pt fragment"));

/* to avoid deep recursion from complete -> start -> complete -> ... we simply put this
* request on a list and let it be processed by opal_progress(). */
OPAL_THREAD_LOCK(&mca_osc_pt2pt_component.pending_receives_lock);
opal_list_append (&mca_osc_pt2pt_component.pending_receives, &recv->super);
OPAL_THREAD_UNLOCK(&mca_osc_pt2pt_component.pending_receives_lock);

return OMPI_SUCCESS;
}

static int ompi_osc_pt2pt_receive_repost (ompi_osc_pt2pt_receive_t *recv)
{
/* wait until the request has been marked as complete */
ompi_request_wait_completion (recv->pml_request);

/* ompi_request_complete clears the callback */
recv->pml_request->req_complete_cb = ompi_osc_pt2pt_callback;
recv->pml_request->req_complete_cb_data = (void *) recv;

return MCA_PML_CALL(start(1, &recv->pml_request));
}

int ompi_osc_pt2pt_process_receive (ompi_osc_pt2pt_receive_t *recv)
{
ompi_osc_pt2pt_module_t *module = (ompi_osc_pt2pt_module_t *) recv->module;
ompi_osc_pt2pt_header_t *base_header = (ompi_osc_pt2pt_header_t *) recv->buffer;
size_t incoming_length = recv->pml_request->req_status._ucount;
int source = recv->pml_request->req_status.MPI_SOURCE;
int rc;

assert(incoming_length >= sizeof(ompi_osc_pt2pt_header_base_t));
(void)incoming_length; // silence compiler warning
@@ -1661,23 +1688,45 @@ static int ompi_osc_pt2pt_callback (ompi_request_t *request)

osc_pt2pt_gc_clean (module);

ompi_osc_pt2pt_frag_start_receive (module);

/* put this request on the garbage collection list */
osc_pt2pt_gc_add_request (module, request);
rc = ompi_osc_pt2pt_receive_repost (recv);

OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
"finished posting receive request"));
"finished posting receive request. rc: %d", rc));

return OMPI_SUCCESS;
}

int ompi_osc_pt2pt_frag_start_receive (ompi_osc_pt2pt_module_t *module)
{
module->frag_request = MPI_REQUEST_NULL;
return ompi_osc_pt2pt_irecv_w_cb (module->incoming_buffer, mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t),
MPI_BYTE, OMPI_ANY_SOURCE, OSC_PT2PT_FRAG_TAG, module->comm, &module->frag_request,
ompi_osc_pt2pt_callback, module);
int rc;

module->recv_frag_count = mca_osc_pt2pt_component.receive_count;
if (module->recv_frag_count < 0) {
module->recv_frag_count = 1;
}

module->recv_frags = malloc (sizeof (module->recv_frags[0]) * module->recv_frag_count);
if (NULL == module->recv_frags) {
return OMPI_ERR_OUT_OF_RESOURCE;
}

for (int i = 0 ; i < module->recv_frag_count ; ++i) {
OBJ_CONSTRUCT(module->recv_frags + i, ompi_osc_pt2pt_receive_t);
module->recv_frags[i].module = module;
module->recv_frags[i].buffer = malloc (mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t));
if (NULL == module->recv_frags[i].buffer) {
return OMPI_ERR_OUT_OF_RESOURCE;
}

rc = ompi_osc_pt2pt_irecv_w_cb (module->recv_frags[i].buffer, mca_osc_pt2pt_component.buffer_size + sizeof (ompi_osc_pt2pt_frag_header_t),
MPI_BYTE, OMPI_ANY_SOURCE, OSC_PT2PT_FRAG_TAG, module->comm, &module->recv_frags[i].pml_request,
ompi_osc_pt2pt_callback, module->recv_frags + i);
if (OMPI_SUCCESS != rc) {
return rc;
}
}

return OMPI_SUCCESS;
}

int ompi_osc_pt2pt_component_irecv (ompi_osc_pt2pt_module_t *module, void *buf,
12 changes: 12 additions & 0 deletions ompi/mca/osc/pt2pt/osc_pt2pt_data_move.h
@@ -144,4 +144,16 @@ void ompi_osc_pt2pt_process_flush_ack (ompi_osc_pt2pt_module_t *module, int sour
*/
int ompi_osc_pt2pt_frag_start_receive (ompi_osc_pt2pt_module_t *module);

/**
* ompi_osc_pt2pt_process_receive:
*
* @short Process an incoming fragment and repost the receive
*
* @param[in] recv - Receive structure
*
* @long This function processes a completed receive and then reposts the receive request.
* It should not be called from a pml request callback as that can lead to deep recursion
* under heavy load.
*/
int ompi_osc_pt2pt_process_receive (ompi_osc_pt2pt_receive_t *recv);

#endif
14 changes: 8 additions & 6 deletions ompi/mca/osc/pt2pt/osc_pt2pt_module.c
@@ -93,18 +93,20 @@ int ompi_osc_pt2pt_free(ompi_win_t *win)
OBJ_DESTRUCT(&module->peer_hash);
OBJ_DESTRUCT(&module->peer_lock);

if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count);
if (NULL != module->recv_frags) {
for (int i = 0 ; i < module->recv_frag_count ; ++i) {
OBJ_DESTRUCT(module->recv_frags + i);
}

if (NULL != module->frag_request && MPI_REQUEST_NULL != module->frag_request) {
module->frag_request->req_complete_cb = NULL;
ompi_request_cancel (module->frag_request);
ompi_request_free (&module->frag_request);
free (module->recv_frags);
}

if (NULL != module->epoch_outgoing_frag_count) free(module->epoch_outgoing_frag_count);

if (NULL != module->comm) {
ompi_comm_free(&module->comm);
}
if (NULL != module->incoming_buffer) free (module->incoming_buffer);

if (NULL != module->free_after) free(module->free_after);

free (module);
4 changes: 0 additions & 4 deletions ompi/mca/pml/cm/pml_cm_start.c
@@ -42,10 +42,6 @@ mca_pml_cm_start(size_t count, ompi_request_t** requests)
continue;
}

if (OMPI_REQUEST_ACTIVE == pml_request->req_ompi.req_state) {
return OMPI_ERR_REQUEST;
}

/* start the request */
switch (pml_request->req_pml_type) {
case MCA_PML_CM_REQUEST_SEND_HEAVY:
4 changes: 0 additions & 4 deletions ompi/mca/pml/ob1/pml_ob1_start.c
@@ -49,10 +49,6 @@ int mca_pml_ob1_start(size_t count, ompi_request_t** requests)
opal_atomic_rmb();
#endif

if (OMPI_REQUEST_ACTIVE == pml_request->req_ompi.req_state) {
return OMPI_ERR_REQUEST;
}

/* start the request */
switch(pml_request->req_type) {
case MCA_PML_REQUEST_SEND:
(diff for the remaining changed file was not rendered on the page)