Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v2.x: Fix MPI_FINALIZED_HANG #5217

Merged
merged 3 commits into from
Jun 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions ompi/errhandler/errhandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2015-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
Expand Down Expand Up @@ -192,11 +192,22 @@ struct ompi_request_t;
* This macro directly invokes the ompi_mpi_errors_are_fatal_handler()
* when an error occurs because MPI_COMM_WORLD does not exist (because
* we're before MPI_Init() or after MPI_Finalize()).
*
* NOTE: The ompi_mpi_state variable is a volatile that is set
* atomically in ompi_mpi_init() and ompi_mpi_finalize(). The
* appropriate memory barriers are done in those 2 functions such that
* we do not need to do a read memory barrier here (in
* potentially-performance-critical code paths) before reading the
* variable.
*/
#define OMPI_ERR_INIT_FINALIZE(name) \
if( OPAL_UNLIKELY(!ompi_mpi_initialized || ompi_mpi_finalized) ) { \
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
}
#define OMPI_ERR_INIT_FINALIZE(name) \
{ \
int32_t state = ompi_mpi_state; \
if (OPAL_UNLIKELY(state < OMPI_MPI_STATE_INIT_COMPLETED || \
state > OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT)) { \
ompi_mpi_errors_are_fatal_comm_handler(NULL, NULL, name); \
} \
}

/**
* This is the macro to invoke to directly invoke an MPI error
Expand Down
20 changes: 13 additions & 7 deletions ompi/errhandler/errhandler_predefined.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006 University of Houston. All rights reserved.
* Copyright (c) 2008-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC.
Expand Down Expand Up @@ -149,7 +149,7 @@ void ompi_mpi_errors_return_win_handler(struct ompi_win_t **win,

static void out(char *str, char *arg)
{
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
if (ompi_mpi_state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
if (NULL != arg) {
opal_output(0, str, arg);
} else {
Expand Down Expand Up @@ -190,7 +190,9 @@ static void backend_fatal_aggregate(char *type,
const char* usable_prefix = unknown_prefix;
const char* usable_err_msg = unknown_error;

assert(ompi_mpi_initialized && !ompi_mpi_finalized);
int32_t state = ompi_mpi_state;
assert(state < OMPI_MPI_STATE_INIT_COMPLETED ||
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);

arg = va_arg(arglist, char*);
va_end(arglist);
Expand Down Expand Up @@ -282,7 +284,9 @@ static void backend_fatal_no_aggregate(char *type,
{
char *arg;

assert(!ompi_mpi_initialized || ompi_mpi_finalized);
int32_t state = ompi_mpi_state;
assert(state < OMPI_MPI_STATE_INIT_COMPLETED ||
state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);

fflush(stdout);
fflush(stderr);
Expand All @@ -291,7 +295,7 @@ static void backend_fatal_no_aggregate(char *type,

/* Per #2152, print out in plain english if something was invoked
before MPI_INIT* or after MPI_FINALIZE */
if (!ompi_mpi_init_started && !ompi_mpi_initialized) {
if (state < OMPI_MPI_STATE_INIT_STARTED) {
if (NULL != arg) {
out("*** The %s() function was called before MPI_INIT was invoked.\n"
"*** This is disallowed by the MPI standard.\n", arg);
Expand All @@ -302,7 +306,7 @@ static void backend_fatal_no_aggregate(char *type,
"*** function was invoked, sorry. :-(\n", NULL);
}
out("*** Your MPI job will now abort.\n", NULL);
} else if (ompi_mpi_finalized) {
} else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
if (NULL != arg) {
out("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
"*** This is disallowed by the MPI standard.\n", arg);
Expand Down Expand Up @@ -377,7 +381,9 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
{
/* We only want aggregation after MPI_INIT and before
MPI_FINALIZE. */
if (ompi_mpi_initialized && !ompi_mpi_finalized) {
int32_t state = ompi_mpi_state;
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
backend_fatal_aggregate(type, comm, name, error_code, arglist);
} else {
backend_fatal_no_aggregate(type, comm, name, error_code, arglist);
Expand Down
3 changes: 2 additions & 1 deletion ompi/mca/coll/fca/coll_fca_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -159,7 +160,7 @@ int mca_coll_fca_barrier(struct ompi_communicator_t *comm,
int ret;

FCA_VERBOSE(5,"Using FCA Barrier");
if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
FCA_VERBOSE(5, "In finalize, reverting to previous barrier");
goto orig_barrier;
}
Expand Down
20 changes: 12 additions & 8 deletions ompi/mca/coll/hcoll/coll_hcoll_module.c
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
/**
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
Copyright (c) 2016 IBM Corporation. All rights reserved.
$COPYRIGHT$

Additional copyrights may follow

$HEADER$
* Copyright (c) 2011 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved.
* Copyright (c) 2017 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

#include "ompi_config.h"
Expand Down Expand Up @@ -238,7 +242,7 @@ static int mca_coll_hcoll_module_enable(mca_coll_base_module_t *module,

int mca_coll_hcoll_progress(void)
{
if (ompi_mpi_finalized){
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
hcoll_rte_p2p_disabled_notify();
}

Expand Down
3 changes: 2 additions & 1 deletion ompi/mca/coll/hcoll/coll_hcoll_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Copyright (c) 2011 Mellanox Technologies. All rights reserved.
Copyright (c) 2015 Research Organization for Information Science
and Technology (RIST). All rights reserved.
Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
$COPYRIGHT$

Additional copyrights may follow
Expand All @@ -21,7 +22,7 @@ int mca_coll_hcoll_barrier(struct ompi_communicator_t *comm,
mca_coll_hcoll_module_t *hcoll_module = (mca_coll_hcoll_module_t*)module;
HCOL_VERBOSE(20,"RUNNING HCOL BARRIER");

if (OPAL_UNLIKELY(ompi_mpi_finalize_started)) {
if (OPAL_UNLIKELY(ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED)) {
HCOL_VERBOSE(5, "In finalize, reverting to previous barrier");
goto orig_barrier;
}
Expand Down
4 changes: 3 additions & 1 deletion ompi/mca/io/romio314/src/io_romio314_file_open.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -58,7 +60,7 @@ mca_io_romio314_file_close (ompi_file_t *fh)
which we obviously can't do if we've started to MPI_Finalize).
The user didn't close the file, so they should expect
unexpected behavior. */
if (ompi_mpi_finalized) {
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
return OMPI_SUCCESS;
}

Expand Down
5 changes: 3 additions & 2 deletions ompi/mca/pml/yalla/pml_yalla.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -268,8 +269,8 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
{
size_t i;

if (ompi_mpi_finalized) {
PML_YALLA_VERBOSE(3, "using bulk powerdown");
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
mxm_ep_powerdown(ompi_pml_yalla.mxm_ep);
}

Expand Down
24 changes: 10 additions & 14 deletions ompi/mpi/c/finalized.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -40,13 +41,7 @@ int MPI_Finalized(int *flag)
{
MPI_Comm null = NULL;

/* We must obtain the lock to guarantee consistent values of
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
this lock is held for the bulk of the duration of
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
lock, we are guaranteed that some other thread is not part way
through initialization or finalization. */
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
int32_t state = ompi_mpi_state;

if (MPI_PARAM_CHECK) {
if (NULL == flag) {
Expand All @@ -55,20 +50,21 @@ int MPI_Finalized(int *flag)
whether we're currently (after MPI_Init and before
MPI_Finalize) or not */

if (ompi_mpi_initialized && !ompi_mpi_finalized) {
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
FUNC_NAME);
} else {
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
return OMPI_ERRHANDLER_INVOKE(null, MPI_ERR_ARG,
/* We have no MPI object here so call ompi_errhandler_invoke
* directly */
return ompi_errhandler_invoke(NULL, NULL, -1,
ompi_errcode_get_mpi_code(MPI_ERR_ARG),
FUNC_NAME);
}
}
}

*flag = ompi_mpi_finalized;
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
*flag = (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT);

return MPI_SUCCESS;
}
6 changes: 4 additions & 2 deletions ompi/mpi/c/get_library_version.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
Expand Down Expand Up @@ -56,7 +56,9 @@ int MPI_Get_library_version(char *version, int *resultlen)
(i.e., use a NULL communicator, which will end up at the
default errhandler, which is abort). */

if (ompi_mpi_initialized && !ompi_mpi_finalized) {
int32_t state = ompi_mpi_state;
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
FUNC_NAME);
} else {
Expand Down
6 changes: 5 additions & 1 deletion ompi/mpi/c/get_version.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -53,7 +55,9 @@ int MPI_Get_version(int *version, int *subversion)
(i.e., use a NULL communicator, which will end up at the
default errhandler, which is abort). */

if (ompi_mpi_initialized && !ompi_mpi_finalized) {
int32_t state = ompi_mpi_state;
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
FUNC_NAME);
} else {
Expand Down
6 changes: 3 additions & 3 deletions ompi/mpi/c/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
Expand Down Expand Up @@ -63,9 +63,9 @@ int MPI_Init(int *argc, char ***argv)
don't lose anything) */

if (NULL != argc && NULL != argv) {
err = ompi_mpi_init(*argc, *argv, required, &provided);
err = ompi_mpi_init(*argc, *argv, required, &provided, false);
} else {
err = ompi_mpi_init(0, NULL, required, &provided);
err = ompi_mpi_init(0, NULL, required, &provided, false);
}

/* Since we don't have a communicator to invoke an errorhandler on
Expand Down
8 changes: 5 additions & 3 deletions ompi/mpi/c/init_thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
* Copyright (c) 2010 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -66,9 +68,9 @@ int MPI_Init_thread(int *argc, char ***argv, int required,
don't lose anything) */

if (NULL != argc && NULL != argv) {
err = ompi_mpi_init(*argc, *argv, required, provided);
err = ompi_mpi_init(*argc, *argv, required, provided, false);
} else {
err = ompi_mpi_init(0, NULL, required, provided);
err = ompi_mpi_init(0, NULL, required, provided, false);
}

/* Since we don't have a communicator to invoke an errorhandler on
Expand Down
24 changes: 10 additions & 14 deletions ompi/mpi/c/initialized.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
* All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015 Intel, Inc. All rights reserved
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -40,13 +41,7 @@ int MPI_Initialized(int *flag)
{
MPI_Comm null = NULL;

/* We must obtain the lock to guarantee consistent values of
ompi_mpi_initialized and ompi_mpi_finalized. Note, too, that
this lock is held for the bulk of the duration of
ompi_mpi_init() and ompi_mpi_finalize(), so when we get the
lock, we are guaranteed that some other thread is not part way
through initialization or finalization. */
opal_mutex_lock(&ompi_mpi_bootstrap_mutex);
int32_t state = ompi_mpi_state;

if (MPI_PARAM_CHECK) {
if (NULL == flag) {
Expand All @@ -55,20 +50,21 @@ int MPI_Initialized(int *flag)
whether we're currently (after MPI_Init and before
MPI_Finalize) or not */

if (ompi_mpi_initialized && !ompi_mpi_finalized) {
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
if (state >= OMPI_MPI_STATE_INIT_COMPLETED &&
state < OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT) {
return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
FUNC_NAME);
} else {
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
return OMPI_ERRHANDLER_INVOKE(null, MPI_ERR_ARG,
/* We have no MPI object here so call ompi_errhandler_invoke
* directly */
return ompi_errhandler_invoke(NULL, NULL, -1,
ompi_errcode_get_mpi_code(MPI_ERR_ARG),
FUNC_NAME);
}
}
}

*flag = ompi_mpi_initialized;
opal_mutex_unlock(&ompi_mpi_bootstrap_mutex);
*flag = (state >= OMPI_MPI_STATE_INIT_COMPLETED);

return MPI_SUCCESS;
}
Loading