From fd088cfb933c6db31fd13a30e3f91de29a4b6720 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 9 Feb 2024 19:39:14 -0700 Subject: [PATCH] Fix daemon suicide and preserve output files Correctly set parent rank so that the OOB can correctly identify its lifeline and cause the daemon to abort when it dies. Fix the `--debug-daemons-file` flag so it works, and preserve the resulting output file from cleanup. Signed-off-by: Ralph Castain (cherry picked from commit a87d17257225b430afaaa223ccae2bd90fce0d61) --- src/mca/ess/base/ess_base_std_prted.c | 6 ------ src/rml/rml.c | 5 +++-- src/tools/prte/prte.c | 3 +++ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/mca/ess/base/ess_base_std_prted.c b/src/mca/ess/base/ess_base_std_prted.c index f183064b4a..970df70415 100644 --- a/src/mca/ess/base/ess_base_std_prted.c +++ b/src/mca/ess/base/ess_base_std_prted.c @@ -539,12 +539,6 @@ int prte_ess_base_prted_finalize(void) signals_set = false; } - /* cleanup */ - if (NULL != log_path) { - unlink(log_path); - } - - if (NULL != prte_errmgr.finalize) { prte_errmgr.finalize(); } diff --git a/src/rml/rml.c b/src/rml/rml.c index d90d07fe3b..f8be31be90 100644 --- a/src/rml/rml.c +++ b/src/rml/rml.c @@ -8,7 +8,7 @@ * Copyright (c) 2014-2019 Intel, Inc. All rights reserved. * Copyright (c) 2015-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -100,11 +100,12 @@ void prte_rml_open(void) PMIX_CONSTRUCT(&prte_rml_base.posted_recvs, pmix_list_t); PMIX_CONSTRUCT(&prte_rml_base.unmatched_msgs, pmix_list_t); PMIX_CONSTRUCT(&prte_rml_base.children, pmix_list_t); - prte_rml_base.lifeline = PRTE_PROC_MY_PARENT->rank; /* compute the routing tree - only thing we need to know is the * number of daemons in the DVM */ prte_rml_compute_routing_tree(); + + prte_rml_base.lifeline = PRTE_PROC_MY_PARENT->rank; } void prte_rml_send_callback(int status, pmix_proc_t *peer, diff --git a/src/tools/prte/prte.c b/src/tools/prte/prte.c index f48ebd14f5..389c49a950 100644 --- a/src/tools/prte/prte.c +++ b/src/tools/prte/prte.c @@ -534,6 +534,9 @@ int main(int argc, char *argv[]) if (pmix_cmd_line_is_taken(&results, PRTE_CLI_DEBUG_DAEMONS)) { prte_debug_daemons_flag = true; } + if (pmix_cmd_line_is_taken(&results, PRTE_CLI_DEBUG_DAEMONS_FILE)) { + prte_debug_daemons_file_flag = true; + } if (pmix_cmd_line_is_taken(&results, PRTE_CLI_LEAVE_SESSION_ATTACHED)) { prte_leave_session_attached = true; }