diff --git a/src/mca/errmgr/base/errmgr_base_fns.c b/src/mca/errmgr/base/errmgr_base_fns.c index c87d890857..9830276c49 100644 --- a/src/mca/errmgr/base/errmgr_base_fns.c +++ b/src/mca/errmgr/base/errmgr_base_fns.c @@ -17,7 +17,7 @@ * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -92,48 +92,3 @@ void prte_errmgr_base_log(int error_code, char *filename, int line) pmix_output(0, "%s PRTE_ERROR_LOG: %s in file %s at line %d", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), errstring, filename, line); } - -void prte_errmgr_base_abort(int error_code, char *fmt, ...) -{ - va_list arglist; - - /* If there was a message, output it */ - va_start(arglist, fmt); - if (NULL != fmt) { - char *buffer = NULL; - pmix_vasprintf(&buffer, fmt, arglist); - pmix_output(0, "%s", buffer); - free(buffer); - } - va_end(arglist); - - /* if I am a daemon or the HNP... */ - if (PRTE_PROC_IS_MASTER || PRTE_PROC_IS_DAEMON) { - /* whack my local procs */ - if (NULL != prte_odls.kill_local_procs) { - prte_odls.kill_local_procs(NULL); - } - /* whack any session directories */ - prte_session_dir_cleanup(PRTE_JOBID_WILDCARD); - } - - /* if a critical connection failed, or a sensor limit was exceeded, exit without dropping a core - */ - if (PRTE_ERR_CONNECTION_FAILED == error_code || PRTE_ERR_SENSOR_LIMIT_EXCEEDED == error_code) { - prte_ess.abort(error_code, false); - } else { - prte_ess.abort(error_code, true); - } - - /* - * We must exit in prte_ess.abort; all implementations of prte_ess.abort - * contain __prte_attribute_noreturn__ - */ - /* No way to reach here */ -} - -int prte_errmgr_base_abort_peers(pmix_proc_t *procs, int32_t num_procs, int error_code) -{ - PRTE_HIDE_UNUSED_PARAMS(procs, num_procs, error_code); - return PRTE_ERR_NOT_IMPLEMENTED; -} diff --git a/src/mca/errmgr/base/errmgr_base_frame.c b/src/mca/errmgr/base/errmgr_base_frame.c index 045db9cb0e..b95205b5a5 100644 --- a/src/mca/errmgr/base/errmgr_base_frame.c +++ b/src/mca/errmgr/base/errmgr_base_frame.c @@ -16,7 +16,7 @@ * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,27 +48,21 @@ #include "src/mca/errmgr/base/static-components.h" -/* - * Globals - */ -prte_errmgr_base_t prte_errmgr_base = { - .error_cbacks = PMIX_LIST_STATIC_INIT -}; - /* Public module provides a wrapper around previous functions */ -prte_errmgr_base_module_t prte_errmgr_default_fns = {.init = NULL, /* init */ - .finalize = NULL, /* finalize */ - .logfn = prte_errmgr_base_log, - .abort = prte_errmgr_base_abort, - .abort_peers = prte_errmgr_base_abort_peers, - .enable_detector = NULL}; +prte_errmgr_base_module_t prte_errmgr_default_fns = { + .init = NULL, /* init */ + .finalize = NULL, /* finalize */ + .logfn = prte_errmgr_base_log +}; /* NOTE: ABSOLUTELY MUST initialize this * struct to include the log function as it * gets called even if the errmgr hasn't been * opened yet due to error */ -prte_errmgr_base_module_t prte_errmgr = {.logfn = prte_errmgr_base_log}; +prte_errmgr_base_module_t prte_errmgr = { + .logfn = prte_errmgr_base_log +}; static int prte_errmgr_base_close(void) { @@ -80,9 +74,6 @@ static int prte_errmgr_base_close(void) /* always leave a default set of fn pointers */ prte_errmgr = prte_errmgr_default_fns; - /* destruct the callback list */ - PMIX_LIST_DESTRUCT(&prte_errmgr_base.error_cbacks); - return pmix_mca_base_framework_components_close(&prte_errmgr_base_framework, NULL); } @@ -95,9 +86,6 @@ static int prte_errmgr_base_open(pmix_mca_base_open_flag_t flags) /* load the default fns */ prte_errmgr = prte_errmgr_default_fns; - /* initialize the error callback list */ - PMIX_CONSTRUCT(&prte_errmgr_base.error_cbacks, pmix_list_t); - /* Open up all available components */ return pmix_mca_base_framework_components_open(&prte_errmgr_base_framework, flags); } diff --git a/src/mca/errmgr/base/errmgr_private.h b/src/mca/errmgr/base/errmgr_private.h index cb1f0848f5..4e480e330a 100644 --- a/src/mca/errmgr/base/errmgr_private.h +++ b/src/mca/errmgr/base/errmgr_private.h @@ -14,7 +14,7 @@ * All rights reserved. * Copyright (c) 2017-2020 Intel, Inc. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -48,13 +48,6 @@ */ BEGIN_C_DECLS -/* define a struct to hold framework-global values */ -typedef struct { - pmix_list_t error_cbacks; -} prte_errmgr_base_t; - -PRTE_EXPORT extern prte_errmgr_base_t prte_errmgr_base; - /* declare the base default module */ PRTE_EXPORT extern prte_errmgr_base_module_t prte_errmgr_default_fns; @@ -63,9 +56,5 @@ PRTE_EXPORT extern prte_errmgr_base_module_t prte_errmgr_default_fns; */ PRTE_EXPORT void prte_errmgr_base_log(int error_code, char *filename, int line); -PRTE_EXPORT void prte_errmgr_base_abort(int error_code, char *fmt, ...) - __prte_attribute_format__(__printf__, 2, 3); -PRTE_EXPORT int prte_errmgr_base_abort_peers(pmix_proc_t *procs, int32_t num_procs, int error_code); - END_C_DECLS #endif diff --git a/src/mca/errmgr/dvm/errmgr_dvm.c b/src/mca/errmgr/dvm/errmgr_dvm.c index 5f7c9aead0..8c8cea4712 100644 --- a/src/mca/errmgr/dvm/errmgr_dvm.c +++ b/src/mca/errmgr/dvm/errmgr_dvm.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -71,9 +71,7 @@ static int finalize(void); prte_errmgr_base_module_t prte_errmgr_dvm_module = { .init = init, .finalize = finalize, - .logfn = prte_errmgr_base_log, - .abort = prte_errmgr_base_abort, - .abort_peers = prte_errmgr_base_abort_peers + .logfn = prte_errmgr_base_log }; /* diff --git a/src/mca/errmgr/errmgr.h b/src/mca/errmgr/errmgr.h index 1b36dedb23..10edb084c8 100644 --- a/src/mca/errmgr/errmgr.h +++ b/src/mca/errmgr/errmgr.h @@ -16,7 +16,7 @@ * reserved. * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -99,27 +99,6 @@ typedef int (*prte_errmgr_base_module_finalize_fn_t)(void); */ typedef void (*prte_errmgr_base_module_log_fn_t)(int error_code, char *filename, int line); -/** - * Alert - self aborting - * This function is called when a process is aborting due to some internal error. - * It will finalize the process - * itself, and then exit - it takes no other actions. The intent here is to provide - * a last-ditch exit procedure that attempts to clean up a little. - */ -typedef void (*prte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...) - __prte_attribute_format_funcptr__(__printf__, 2, 3); - -/** - * Alert - abort peers - * This function is called when a process wants to abort one or more peer processes. - * For example, MPI_Abort(comm) will use this function to terminate peers in the - * communicator group before aborting itself. - */ -typedef int (*prte_errmgr_base_module_abort_peers_fn_t)(pmix_proc_t *procs, int32_t num_procs, - int error_code); - -typedef void (*prte_errmgr_base_module_enable_detector_fn_t)(bool flag); - /* * Module Structure */ @@ -130,11 +109,6 @@ struct prte_errmgr_base_module_2_3_0_t { prte_errmgr_base_module_finalize_fn_t finalize; prte_errmgr_base_module_log_fn_t logfn; - prte_errmgr_base_module_abort_fn_t abort; - prte_errmgr_base_module_abort_peers_fn_t abort_peers; - - /* start error detector and propagator */ - prte_errmgr_base_module_enable_detector_fn_t enable_detector; }; typedef struct prte_errmgr_base_module_2_3_0_t prte_errmgr_base_module_2_3_0_t; typedef prte_errmgr_base_module_2_3_0_t prte_errmgr_base_module_t; diff --git a/src/mca/errmgr/prted/errmgr_prted.c b/src/mca/errmgr/prted/errmgr_prted.c index ca228a8e4e..749527812e 100644 --- a/src/mca/errmgr/prted/errmgr_prted.c +++ b/src/mca/errmgr/prted/errmgr_prted.c @@ -10,7 +10,7 @@ * All rights reserved. * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -64,12 +64,11 @@ static void prted_abort(int error_code, char *fmt, ...); /****************** * prted module ******************/ -prte_errmgr_base_module_t prte_errmgr_prted_module = {.init = init, - .finalize = finalize, - .logfn = prte_errmgr_base_log, - .abort = prted_abort, - .abort_peers = prte_errmgr_base_abort_peers, - .enable_detector = NULL}; +prte_errmgr_base_module_t prte_errmgr_prted_module = { + .init = init, + .finalize = finalize, + .logfn = prte_errmgr_base_log +}; /* Local functions */ static bool any_live_children(pmix_nspace_t job); @@ -671,8 +670,8 @@ static void proc_errors(int fd, short args, void *cbdata) /* remove all of this job's children from the global list */ for (i = 0; i < prte_local_children->size; i++) { - if (NULL - == (ptr = (prte_proc_t *) pmix_pointer_array_get_item(prte_local_children, i))) { + ptr = (prte_proc_t *) pmix_pointer_array_get_item(prte_local_children, i); + if (NULL == ptr) { continue; } if (PMIX_CHECK_NSPACE(jdata->nspace, ptr->name.nspace)) { @@ -681,9 +680,6 @@ static void proc_errors(int fd, short args, void *cbdata) } } - /* ensure the job's local session directory tree is removed */ - prte_session_dir_cleanup(jdata->nspace); - /* remove this job from our local job data since it is complete */ PMIX_RELEASE(jdata); diff --git a/src/mca/ess/alps/ess_alps_module.c b/src/mca/ess/alps/ess_alps_module.c index 95b2d8c7fa..8fb8cde671 100644 --- a/src/mca/ess/alps/ess_alps_module.c +++ b/src/mca/ess/alps/ess_alps_module.c @@ -15,7 +15,7 @@ * Copyright (c) 2017-2020 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -46,9 +46,10 @@ static int alps_set_name(void); static int rte_init(int argc, char **argv); static int rte_finalize(void); -prte_ess_base_module_t prte_ess_alps_module = {.init = rte_init, - .finalize = rte_finalize, - .abort = NULL}; +prte_ess_base_module_t prte_ess_alps_module = { + .init = rte_init, + .finalize = rte_finalize +}; /* Local variables */ static pmix_rank_t starting_vpid = 0; diff --git a/src/mca/ess/base/ess_base_frame.c b/src/mca/ess/base/ess_base_frame.c index d0a99a0fe5..6ebd47035e 100644 --- a/src/mca/ess/base/ess_base_frame.c +++ b/src/mca/ess/base/ess_base_frame.c @@ -14,7 +14,7 @@ * Copyright (c) 2017-2020 Intel, Inc. All rights reserved. * Copyright (c) 2017-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -47,7 +47,6 @@ prte_ess_base_module_t prte_ess = { .init = NULL, .finalize = NULL, - .abort = NULL, }; int prte_ess_base_num_procs = -1; char *prte_ess_base_nspace = NULL; diff --git a/src/mca/ess/base/ess_base_std_prted.c b/src/mca/ess/base/ess_base_std_prted.c index 970df70415..4e48cce7d6 100644 --- a/src/mca/ess/base/ess_base_std_prted.c +++ b/src/mca/ess/base/ess_base_std_prted.c @@ -18,7 +18,7 @@ * Copyright (c) 2017 IBM Corporation. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -100,6 +100,7 @@ int prte_ess_base_prted_setup(void) char log_file[PATH_MAX]; char *error = NULL; char *uri = NULL; + char *tmp; prte_job_t *jdata; prte_proc_t *proc; prte_app_context_t *app; @@ -212,6 +213,38 @@ int prte_ess_base_prted_setup(void) goto error; } } + + /* Setup the job data object for the daemons */ + /* create and store the job data object */ + jdata = PMIX_NEW(prte_job_t); + PMIX_LOAD_NSPACE(jdata->nspace, PRTE_PROC_MY_NAME->nspace); + prte_set_job_data_object(jdata); + /* set the schizo personality to "prte" by default */ + jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy("prte"); + if (NULL == jdata->schizo) { + pmix_show_help("help-schizo-base.txt", "no-proxy", true, prte_tool_basename, "prte"); + error = "select personality"; + ret = PRTE_ERR_SILENT; + goto error; + } + + /* every job requires at least one app */ + app = PMIX_NEW(prte_app_context_t); + pmix_pointer_array_set_item(jdata->apps, 0, app); + jdata->num_apps++; + + /* create and store a proc object for us */ + proc = PMIX_NEW(prte_proc_t); + PMIX_LOAD_PROCID(&proc->name, PRTE_PROC_MY_NAME->nspace, PRTE_PROC_MY_NAME->rank); + proc->pid = prte_process_info.pid; + proc->state = PRTE_PROC_STATE_RUNNING; + pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); + /* record that the daemon job is running */ + jdata->num_procs = 1; + jdata->state = PRTE_JOB_STATE_RUNNING; + /* obviously, we have "reported" */ + jdata->num_reported = 1; + /* setup my session directory here as the OOB may need it */ PMIX_OUTPUT_VERBOSE( (2, prte_ess_base_framework.framework_output, @@ -220,27 +253,19 @@ int prte_ess_base_prted_setup(void) (NULL == prte_process_info.tmpdir_base) ? "UNDEF" : prte_process_info.tmpdir_base, prte_process_info.nodename)); - /* take a pass thru the session directory code to fillin the - * tmpdir names - don't create anything yet - */ - if (PRTE_SUCCESS != (ret = prte_session_dir(false, PRTE_PROC_MY_NAME))) { - PRTE_ERROR_LOG(ret); - error = "prte_session_dir define"; - goto error; - } - /* clear the session directory just in case there are - * stale directories laying around - */ - prte_session_dir_cleanup(PRTE_JOBID_WILDCARD); - /* now actually create the directory tree */ - if (PRTE_SUCCESS != (ret = prte_session_dir(true, PRTE_PROC_MY_NAME))) { + /* create the directory tree */ + if (PRTE_SUCCESS != (ret = prte_session_dir(PRTE_PROC_MY_NAME))) { PRTE_ERROR_LOG(ret); error = "prte_session_dir"; goto error; } + /* set the pmix_output env file location to be in the * proc-specific session directory. */ - pmix_output_set_output_file_info(prte_process_info.proc_session_dir, "output-", NULL, NULL); + pmix_asprintf(&tmp, "%s/%s", jdata->session_dir, + PMIX_RANK_PRINT(PRTE_PROC_MY_NAME->rank)); + pmix_output_set_output_file_info(tmp, "output-", NULL, NULL); + free(tmp); /* setup stdout/stderr */ if (prte_debug_daemons_file_flag) { /* if we are debugging to a file, then send stdout/stderr to @@ -248,7 +273,8 @@ int prte_ess_base_prted_setup(void) */ /* define a log file name in the session directory */ - snprintf(log_file, PATH_MAX, "output-prted-%s-%s.log", prte_process_info.myproc.nspace, + snprintf(log_file, PATH_MAX, "output-prted-%s-%s.log", + prte_process_info.myproc.nspace, prte_process_info.nodename); log_path = pmix_os_path(false, prte_process_info.top_session_dir, log_file, NULL); @@ -267,39 +293,6 @@ int prte_ess_base_prted_setup(void) } } - /* Setup the job data object for the daemons */ - /* create and store the job data object */ - jdata = PMIX_NEW(prte_job_t); - PMIX_LOAD_NSPACE(jdata->nspace, PRTE_PROC_MY_NAME->nspace); - prte_set_job_data_object(jdata); - /* set the schizo personality to "prte" by default */ - jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy("prte"); - if (NULL == jdata->schizo) { - pmix_show_help("help-schizo-base.txt", "no-proxy", true, prte_tool_basename, "prte"); - error = "select personality"; - ret = PRTE_ERR_SILENT; - goto error; - } - - /* every job requires at least one app */ - app = PMIX_NEW(prte_app_context_t); - pmix_pointer_array_set_item(jdata->apps, 0, app); - jdata->num_apps++; - - /* create and store a proc object for us */ - proc = PMIX_NEW(prte_proc_t); - PMIX_LOAD_PROCID(&proc->name, PRTE_PROC_MY_NAME->nspace, PRTE_PROC_MY_NAME->rank); - proc->job = jdata; - proc->rank = proc->name.rank; - proc->pid = prte_process_info.pid; - proc->state = PRTE_PROC_STATE_RUNNING; - pmix_pointer_array_set_item(jdata->procs, proc->name.rank, proc); - /* record that the daemon job is running */ - jdata->num_procs = 1; - jdata->state = PRTE_JOB_STATE_RUNNING; - /* obviously, we have "reported" */ - jdata->num_reported = 1; - /* setup the PMIx server - we need this here in case the * communications infrastructure wants to register * information */ @@ -509,12 +502,10 @@ int prte_ess_base_prted_setup(void) return PRTE_SUCCESS; error: - pmix_show_help("help-prte-runtime.txt", "prte_init:startup:internal-failure", true, error, - PRTE_ERROR_NAME(ret), ret); + pmix_show_help("help-prte-runtime.txt", "prte_init:startup:internal-failure", true, + error, PRTE_ERROR_NAME(ret), ret); /* remove our use of the session directory tree */ - prte_session_dir_finalize(PRTE_PROC_MY_NAME); - /* ensure we scrub the session directory tree */ - prte_session_dir_cleanup(PRTE_JOBID_WILDCARD); + PMIX_RELEASE(jdata); return PRTE_ERR_SILENT; } @@ -559,10 +550,6 @@ int prte_ess_base_prted_finalize(void) (void) pmix_mca_base_framework_close(&prte_oob_base_framework); (void) pmix_mca_base_framework_close(&prte_prtereachable_base_framework); (void) pmix_mca_base_framework_close(&prte_state_base_framework); - /* remove our use of the session directory tree */ - prte_session_dir_finalize(PRTE_PROC_MY_NAME); - /* ensure we scrub the session directory tree */ - prte_session_dir_cleanup(PRTE_JOBID_WILDCARD); /* shutdown the pmix server */ pmix_server_finalize(); diff --git a/src/mca/ess/env/ess_env_module.c b/src/mca/ess/env/ess_env_module.c index e6f2d2a61d..bfb24c8335 100644 --- a/src/mca/ess/env/ess_env_module.c +++ b/src/mca/ess/env/ess_env_module.c @@ -13,7 +13,7 @@ * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -73,9 +73,10 @@ static int env_set_name(void); static int rte_init(int argc, char **argv); static int rte_finalize(void); -prte_ess_base_module_t prte_ess_env_module = {.init = rte_init, - .finalize = rte_finalize, - .abort = NULL}; +prte_ess_base_module_t prte_ess_env_module = { + .init = rte_init, + .finalize = rte_finalize +}; static int rte_init(int argc, char **argv) { diff --git a/src/mca/ess/ess.h b/src/mca/ess/ess.h index cc7f20f36b..1214b145ad 100644 --- a/src/mca/ess/ess.h +++ b/src/mca/ess/ess.h @@ -14,7 +14,7 @@ * reserved. * Copyright (c) 2012-2020 Cisco Systems, Inc. All rights reserved * Copyright (c) 2018-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -56,26 +56,12 @@ typedef int (*prte_ess_base_module_init_fn_t)(int argc, char **argv); */ typedef int (*prte_ess_base_module_finalize_fn_t)(void); -/** - * Abort the current application - * - * Aborts currently running application, NOTE: We do NOT call the - * regular C-library "abort" function, even - * though that would have alerted us to the fact that this is - * an abnormal termination, because it would automatically cause - * a core file to be generated. The "report" flag indicates if the - * function should create an appropriate file to alert the local - * orted that termination was abnormal. - */ -typedef void (*prte_ess_base_module_abort_fn_t)(int status, bool report); - /* * the standard module data structure */ struct prte_ess_base_module_3_0_0_t { prte_ess_base_module_init_fn_t init; prte_ess_base_module_finalize_fn_t finalize; - prte_ess_base_module_abort_fn_t abort; }; typedef struct prte_ess_base_module_3_0_0_t prte_ess_base_module_3_0_0_t; typedef struct prte_ess_base_module_3_0_0_t prte_ess_base_module_t; diff --git a/src/mca/ess/hnp/ess_hnp_module.c b/src/mca/ess/hnp/ess_hnp_module.c index aee7e43558..36fef91498 100644 --- a/src/mca/ess/hnp/ess_hnp_module.c +++ b/src/mca/ess/hnp/ess_hnp_module.c @@ -17,7 +17,7 @@ * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2017-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -90,18 +90,19 @@ static int rte_init(int argc, char **argv); static int rte_finalize(void); -static void rte_abort(int status, bool report) __prte_attribute_noreturn__; -prte_ess_base_module_t prte_ess_hnp_module = {.init = rte_init, - .finalize = rte_finalize, - .abort = rte_abort}; +prte_ess_base_module_t prte_ess_hnp_module = { + .init = rte_init, + .finalize = rte_finalize +}; static int rte_init(int argc, char **argv) { int ret; char *error = NULL; char *contact_path; - prte_job_t *jdata; + char *tmp; + prte_job_t *jdata = NULL; prte_node_t *node; prte_proc_t *proc; prte_app_context_t *app; @@ -169,26 +170,76 @@ static int rte_init(int argc, char **argv) goto error; } - /* setup my session directory here as the OOB may need it */ - PMIX_OUTPUT_VERBOSE( - (2, prte_debug_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", - PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - (NULL == prte_process_info.tmpdir_base) ? "UNDEF" : prte_process_info.tmpdir_base, - prte_process_info.nodename)); - /* take a pass thru the session directory code to fillin the - * tmpdir names - don't create anything yet - */ - if (PRTE_SUCCESS != (ret = prte_session_dir(false, PRTE_PROC_MY_NAME))) { - error = "prte_session_dir define"; + /* get the job data object for the daemons */ + jdata = PMIX_NEW(prte_job_t); + PMIX_LOAD_NSPACE(jdata->nspace, PRTE_PROC_MY_NAME->nspace); + ret = prte_set_job_data_object(jdata); + + /* set the schizo personality to "prte" by default */ + jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy("prte"); + if (NULL == jdata->schizo) { + pmix_show_help("help-schizo-base.txt", "no-proxy", true, prte_tool_basename, "prte"); + error = "select personality"; + ret = PRTE_ERR_SILENT; goto error; } - /* clear the session directory just in case there are - * stale directories laying around + + /* mark that the daemons have reported as we are the + * only ones in the system right now, and we definitely + * are running! */ - prte_session_dir_cleanup(PRTE_JOBID_WILDCARD); + jdata->state = PRTE_JOB_STATE_DAEMONS_REPORTED; + + /* every job requires at least one app */ + app = PMIX_NEW(prte_app_context_t); + app->app = strdup(argv[0]); + app->argv = PMIX_ARGV_COPY_COMPAT(argv); + app->job = (struct prte_job_t*)jdata; + pmix_pointer_array_set_item(jdata->apps, 0, app); + jdata->num_apps++; + /* create and store a node object where we are */ + node = PMIX_NEW(prte_node_t); + node->name = strdup(prte_process_info.nodename); + node->index = PRTE_PROC_MY_NAME->rank; + PRTE_FLAG_SET(node, PRTE_NODE_FLAG_LOC_VERIFIED); + pmix_pointer_array_set_item(prte_node_pool, PRTE_PROC_MY_NAME->rank, node); + + /* create and store a proc object for us */ + proc = PMIX_NEW(prte_proc_t); + PMIX_LOAD_PROCID(&proc->name, PRTE_PROC_MY_NAME->nspace, PRTE_PROC_MY_NAME->rank); + proc->pid = prte_process_info.pid; + proc->state = PRTE_PROC_STATE_RUNNING; + PMIX_RETAIN(node); /* keep accounting straight */ + proc->node = node; + pmix_pointer_array_set_item(jdata->procs, PRTE_PROC_MY_NAME->rank, proc); - /* now actually create the directory tree */ - if (PRTE_SUCCESS != (ret = prte_session_dir(true, PRTE_PROC_MY_NAME))) { + /* record that the daemon (i.e., us) is on this node + * NOTE: we do not add the proc object to the node's + * proc array because we are not an application proc. + * Instead, we record it in the daemon field of the + * node object + */ + PMIX_RETAIN(proc); /* keep accounting straight */ + node->daemon = proc; + PRTE_FLAG_SET(node, PRTE_NODE_FLAG_DAEMON_LAUNCHED); + node->state = PRTE_NODE_STATE_UP; + /* get our aliases - will include all the interface aliases captured in prte_init */ + node->aliases = PMIX_ARGV_COPY_COMPAT(prte_process_info.aliases); + /* record that the daemon job is running */ + jdata->num_procs = 1; + jdata->state = PRTE_JOB_STATE_RUNNING; + /* obviously, we have "reported" */ + jdata->num_reported = 1; + jdata->num_daemons_reported = 1; + + /* setup my session directory here as the OOB may need it */ + PMIX_OUTPUT_VERBOSE((2, prte_debug_output, + "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", + PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), + (NULL == prte_process_info.tmpdir_base) ? "UNDEF" : prte_process_info.tmpdir_base, + prte_process_info.nodename)); + /* create the directory tree */ + if (PRTE_SUCCESS != (ret = prte_session_dir(PRTE_PROC_MY_NAME))) { error = "prte_session_dir"; goto error; } @@ -229,6 +280,20 @@ static int rte_init(int argc, char **argv) goto error; } + // set our RML address + prte_oob_base_get_addr(&proc->rml_uri); + prte_process_info.my_hnp_uri = strdup(proc->rml_uri); + /* store it in the local PMIx repo for later retrieval */ + PMIX_VALUE_LOAD(&pval, proc->rml_uri, PMIX_STRING); + if (PMIX_SUCCESS != (pret = PMIx_Store_internal(PRTE_PROC_MY_NAME, PMIX_PROC_URI, &pval))) { + PMIX_ERROR_LOG(pret); + ret = PRTE_ERROR; + PMIX_VALUE_DESTRUCT(&pval); + error = "store uri"; + goto error; + } + PMIX_VALUE_DESTRUCT(&pval); + /* * Runtime Messaging Layer */ @@ -259,81 +324,6 @@ static int rte_init(int argc, char **argv) goto error; } - /* get the job data object for the daemons */ - jdata = PMIX_NEW(prte_job_t); - PMIX_LOAD_NSPACE(jdata->nspace, PRTE_PROC_MY_NAME->nspace); - prte_set_job_data_object(jdata); - - /* set the schizo personality to "prte" by default */ - jdata->schizo = (struct prte_schizo_base_module_t*)prte_schizo_base_detect_proxy("prte"); - if (NULL == jdata->schizo) { - pmix_show_help("help-schizo-base.txt", "no-proxy", true, prte_tool_basename, "prte"); - error = "select personality"; - ret = PRTE_ERR_SILENT; - goto error; - } - - /* mark that the daemons have reported as we are the - * only ones in the system right now, and we definitely - * are running! - */ - jdata->state = PRTE_JOB_STATE_DAEMONS_REPORTED; - - /* every job requires at least one app */ - app = PMIX_NEW(prte_app_context_t); - app->app = strdup(argv[0]); - app->argv = PMIX_ARGV_COPY_COMPAT(argv); - pmix_pointer_array_set_item(jdata->apps, 0, app); - jdata->num_apps++; - /* create and store a node object where we are */ - node = PMIX_NEW(prte_node_t); - node->name = strdup(prte_process_info.nodename); - node->index = PRTE_PROC_MY_NAME->rank; - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_LOC_VERIFIED); - pmix_pointer_array_set_item(prte_node_pool, PRTE_PROC_MY_NAME->rank, node); - - /* create and store a proc object for us */ - proc = PMIX_NEW(prte_proc_t); - PMIX_LOAD_PROCID(&proc->name, PRTE_PROC_MY_NAME->nspace, PRTE_PROC_MY_NAME->rank); - proc->job = jdata; - proc->rank = proc->name.rank; - proc->pid = prte_process_info.pid; - prte_oob_base_get_addr(&proc->rml_uri); - prte_process_info.my_hnp_uri = strdup(proc->rml_uri); - /* store it in the local PMIx repo for later retrieval */ - PMIX_VALUE_LOAD(&pval, proc->rml_uri, PMIX_STRING); - if (PMIX_SUCCESS != (pret = PMIx_Store_internal(PRTE_PROC_MY_NAME, PMIX_PROC_URI, &pval))) { - PMIX_ERROR_LOG(pret); - ret = PRTE_ERROR; - PMIX_VALUE_DESTRUCT(&pval); - error = "store uri"; - goto error; - } - PMIX_VALUE_DESTRUCT(&pval); - proc->state = PRTE_PROC_STATE_RUNNING; - PMIX_RETAIN(node); /* keep accounting straight */ - proc->node = node; - pmix_pointer_array_set_item(jdata->procs, PRTE_PROC_MY_NAME->rank, proc); - - /* record that the daemon (i.e., us) is on this node - * NOTE: we do not add the proc object to the node's - * proc array because we are not an application proc. - * Instead, we record it in the daemon field of the - * node object - */ - PMIX_RETAIN(proc); /* keep accounting straight */ - node->daemon = proc; - PRTE_FLAG_SET(node, PRTE_NODE_FLAG_DAEMON_LAUNCHED); - node->state = PRTE_NODE_STATE_UP; - /* get our aliases - will include all the interface aliases captured in prte_init */ - node->aliases = PMIX_ARGV_COPY_COMPAT(prte_process_info.aliases); - /* record that the daemon job is running */ - jdata->num_procs = 1; - jdata->state = PRTE_JOB_STATE_RUNNING; - /* obviously, we have "reported" */ - jdata->num_reported = 1; - jdata->num_daemons_reported = 1; - if (0 < pmix_output_get_verbosity(prte_ess_base_framework.framework_output)) { pmix_output(0, "ALIASES FOR %s", node->name); if (NULL != node->aliases) { @@ -430,13 +420,10 @@ static int rte_init(int argc, char **argv) /* set the pmix_output hnp file location to be in the * proc-specific session directory. */ - pmix_output_set_output_file_info(prte_process_info.proc_session_dir, "output-", NULL, NULL); - /* save my contact info in a file for others to find */ - if (NULL == prte_process_info.jobfam_session_dir) { - /* has to be set here! */ - PRTE_ERROR_LOG(PRTE_ERR_BAD_PARAM); - goto error; - } + pmix_asprintf(&tmp, "%s/%s", jdata->session_dir, + PMIX_RANK_PRINT(PRTE_PROC_MY_NAME->rank)); + pmix_output_set_output_file_info(tmp, "output-", NULL, NULL); + free(tmp); /* setup I/O forwarding system - must come after we init routes */ if (PRTE_SUCCESS @@ -472,36 +459,22 @@ static int rte_init(int argc, char **argv) pmix_show_help("help-prte-runtime.txt", "prte_init:startup:internal-failure", true, error, PRTE_ERROR_NAME(ret), ret); } - /* remove my contact info file, if we have session directories */ - if (NULL != prte_process_info.jobfam_session_dir) { - contact_path = pmix_os_path(false, prte_process_info.jobfam_session_dir, "contact.txt", - NULL); - unlink(contact_path); - free(contact_path); - } - /* remove our use of the session directory tree */ - prte_session_dir_finalize(PRTE_PROC_MY_NAME); - /* ensure we scrub the session directory tree */ - prte_session_dir_cleanup(PRTE_JOBID_WILDCARD); + if (NULL != jdata) { + /* remove our session directory tree */ + PMIX_RELEASE(jdata); + } return PRTE_ERR_SILENT; } static int rte_finalize(void) { char *contact_path; + prte_job_t *jdata; /* first stage shutdown of the errmgr, deregister the handler but keep * the required facilities until the rml and oob are offline */ prte_errmgr.finalize(); - /* remove my contact info file, if we have session directories */ - if (NULL != prte_process_info.jobfam_session_dir) { - contact_path = pmix_os_path(false, prte_process_info.jobfam_session_dir, "contact.txt", - NULL); - unlink(contact_path); - free(contact_path); - } - /* close frameworks */ (void) pmix_mca_base_framework_close(&prte_filem_base_framework); (void) pmix_mca_base_framework_close(&prte_grpcomm_base_framework); @@ -519,11 +492,6 @@ static int rte_finalize(void) (void) pmix_mca_base_framework_close(&prte_errmgr_base_framework); (void) pmix_mca_base_framework_close(&prte_state_base_framework); - /* remove our use of the session directory tree */ - prte_session_dir_finalize(PRTE_PROC_MY_NAME); - /* ensure we scrub the session directory tree */ - prte_session_dir_cleanup(PRTE_JOBID_WILDCARD); - free(prte_topo_signature); /* shutdown the pmix server */ @@ -534,27 +502,3 @@ static int rte_finalize(void) return PRTE_SUCCESS; } - -static void rte_abort(int status, bool report) -{ - PRTE_HIDE_UNUSED_PARAMS(report); - - pmix_output(0, "ABORT"); - /* do NOT do a normal finalize as this will very likely - * hang the process. We are aborting due to an abnormal condition - * that precludes normal cleanup - * - * We do need to do the following bits to make sure we leave a - * clean environment. Taken from prte_finalize(): - * - Assume errmgr cleans up child processes before we exit. - */ - - /* ensure we scrub the session directory tree */ - prte_session_dir_cleanup(PRTE_JOBID_WILDCARD); - /* - Clean out the global structures - * (not really necessary, but good practice) - */ - prte_proc_info_finalize(); - /* just exit */ - exit(status); -} diff --git a/src/mca/ess/lsf/ess_lsf_module.c b/src/mca/ess/lsf/ess_lsf_module.c index 5e152c8eee..dd24afde44 100644 --- a/src/mca/ess/lsf/ess_lsf_module.c +++ b/src/mca/ess/lsf/ess_lsf_module.c @@ -13,7 +13,7 @@ * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2016-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -50,9 +50,10 @@ static int lsf_set_name(void); static int rte_init(int argc, char **argv); static int rte_finalize(void); -prte_ess_base_module_t prte_ess_lsf_module = {.init = rte_init, - .finalize = rte_finalize, - .abort = NULL}; +prte_ess_base_module_t prte_ess_lsf_module = { + .init = rte_init, + .finalize = rte_finalize +}; /* * Local variables diff --git a/src/mca/ess/slurm/ess_slurm_module.c b/src/mca/ess/slurm/ess_slurm_module.c index aeffbb3d6f..e4348247d0 100644 --- a/src/mca/ess/slurm/ess_slurm_module.c +++ b/src/mca/ess/slurm/ess_slurm_module.c @@ -13,7 +13,7 @@ * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -53,9 +53,10 @@ static int slurm_set_name(void); static int rte_init(int argc, char **argv); static int rte_finalize(void); -prte_ess_base_module_t prte_ess_slurm_module = {.init = rte_init, - .finalize = rte_finalize, - .abort = NULL}; +prte_ess_base_module_t prte_ess_slurm_module = { + .init = rte_init, + .finalize = rte_finalize +}; static int rte_init(int argc, char **argv) { diff --git a/src/mca/ess/tm/ess_tm_module.c b/src/mca/ess/tm/ess_tm_module.c index 036e8be7e8..2043f22c01 100644 --- a/src/mca/ess/tm/ess_tm_module.c +++ b/src/mca/ess/tm/ess_tm_module.c @@ -13,7 +13,7 @@ * Copyright (c) 2017-2020 Intel, Inc. All rights reserved. * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -52,9 +52,10 @@ static int tm_set_name(void); static int rte_init(int argc, char **argv); static int rte_finalize(void); -prte_ess_base_module_t prte_ess_tm_module = {.init = rte_init, - .finalize = rte_finalize, - .abort = NULL}; +prte_ess_base_module_t prte_ess_tm_module = { + .init = rte_init, + .finalize = rte_finalize +}; /* * Local variables diff --git a/src/mca/filem/raw/filem_raw_module.c b/src/mca/filem/raw/filem_raw_module.c index 4944cd7d8f..f97b5553bd 100644 --- a/src/mca/filem/raw/filem_raw_module.c +++ b/src/mca/filem/raw/filem_raw_module.c @@ -5,7 +5,7 @@ * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2015-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -93,17 +93,6 @@ static void recv_ack(int status, pmix_proc_t *sender, pmix_data_buffer_t *buffer prte_rml_tag_t tag, void *cbdata); static void write_handler(int fd, short event, void *cbdata); -static char *filem_session_dir(void) -{ - char *session_dir = prte_process_info.jobfam_session_dir; - if (NULL == session_dir) { - /* if no job family session dir was provided - - * use the job session dir */ - session_dir = prte_process_info.job_session_dir; - } - return session_dir; -} - static int raw_init(void) { PMIX_CONSTRUCT(&incoming_files, pmix_list_t); @@ -607,14 +596,11 @@ static int raw_link_local_files(prte_job_t *jdata, prte_app_context_t *app) pmix_list_item_t *item; char **files = NULL, *bname, *filestring; - /* check my jobfam session directory for files I have received and + /* check my job's session directory for files I have received and * symlink them to the proc-level session directory of each * local process in the job - * - * TODO: @rhc - please check that I've correctly interpret your - * intention here */ - session_dir = filem_session_dir(); + session_dir = jdata->session_dir; if (NULL == session_dir) { /* we were unable to find any suitable directory */ rc = PRTE_ERR_BAD_PARAM; @@ -662,8 +648,8 @@ static int raw_link_local_files(prte_job_t *jdata, prte_app_context_t *app) continue; } /* ignore children we have already handled */ - if (PRTE_FLAG_TEST(proc, PRTE_PROC_FLAG_ALIVE) - || (PRTE_PROC_STATE_INIT != proc->state && PRTE_PROC_STATE_RESTART != proc->state)) { + if (PRTE_FLAG_TEST(proc, PRTE_PROC_FLAG_ALIVE) || + (PRTE_PROC_STATE_INIT != proc->state && PRTE_PROC_STATE_RESTART != proc->state)) { continue; } @@ -672,18 +658,7 @@ static int raw_link_local_files(prte_job_t *jdata, prte_app_context_t *app) PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_NAME_PRINT(&proc->name))); /* get the session dir name in absolute form */ - path = prte_process_info.proc_session_dir; - - /* create it, if it doesn't already exist */ - if (PMIX_SUCCESS != (rc = pmix_os_dirpath_create(path, S_IRWXU))) { - PMIX_ERROR_LOG(rc); - /* doesn't exist with correct permissions, and/or we can't - * create it - either way, we are done - */ - free(files); - rc = prte_pmix_convert_status(rc); - return rc; - } + pmix_asprintf(&path, "%s/%s", session_dir, PMIX_RANK_PRINT(proc->name.rank)); /* cycle thru the incoming files */ for (item = pmix_list_get_first(&incoming_files); @@ -707,6 +682,7 @@ static int raw_link_local_files(prte_job_t *jdata, prte_app_context_t *app) != (rc = create_link(session_dir, path, inbnd->link_pts[j]))) { PRTE_ERROR_LOG(rc); free(files); + free(path); return rc; } } @@ -719,6 +695,7 @@ static int raw_link_local_files(prte_job_t *jdata, prte_app_context_t *app) } } } + free(path); } PMIX_ARGV_FREE_COMPAT(files); return PRTE_SUCCESS; @@ -1013,7 +990,7 @@ static void recv_files(int status, pmix_proc_t *sender, pmix_data_buffer_t *buff incoming->top = strdup(tmp); free(tmp); /* define the full path to where we will put it */ - session_dir = filem_session_dir(); + session_dir = prte_process_info.top_session_dir; incoming->fullpath = pmix_os_path(false, session_dir, file, NULL); diff --git a/src/mca/odls/base/odls_base_default_fns.c b/src/mca/odls/base/odls_base_default_fns.c index d5f7956731..4deee2d6b1 100644 --- a/src/mca/odls/base/odls_base_default_fns.c +++ b/src/mca/odls/base/odls_base_default_fns.c @@ -525,11 +525,6 @@ int prte_odls_base_default_construct_child_list(pmix_data_buffer_t *buffer, pmix PMIX_RETAIN(pptr); pmix_pointer_array_add(pptr->node->procs, pptr); pptr->node->num_procs++; - /* and connect it back to its job object, if not already done */ - if (NULL == pptr->job) { - PMIX_RETAIN(jdata); - pptr->job = jdata; - } } /* reset the mapped flags */ for (n = 0; n < jdata->map->nodes->size; n++) { @@ -578,7 +573,7 @@ int prte_odls_base_default_construct_child_list(pmix_data_buffer_t *buffer, pmix if (PRTE_PROC_IS_MASTER) { /* we don't want/need the extra copy of the prte_job_t, but * we can't just release it as that will NULL the location in - * the prte_job_data array. So set the jobid to INVALID to + * the prte_job_data array. So set its index to -1 to * protect the array, and then release the object to free * the storage */ jdata->index = -1; @@ -730,11 +725,6 @@ int prte_odls_base_default_construct_child_list(pmix_data_buffer_t *buffer, pmix PMIX_RETAIN(pptr); pmix_pointer_array_add(pptr->node->procs, pptr); pptr->node->num_procs++; - /* and connect it back to its job object, if not already done */ - if (NULL == pptr->job) { - PMIX_RETAIN(jdata); - pptr->job = jdata; - } } /* see if it belongs to us */ if (pptr->parent == PRTE_PROC_MY_NAME->rank) { @@ -834,14 +824,15 @@ static int setup_path(prte_app_context_t *app, char **wdir) char dir[MAXPATHLEN]; char *session_dir; bool usercwd = false; + prte_job_t *job; if (prte_get_attribute(&app->attributes, PRTE_APP_SSNDIR_CWD, NULL, PMIX_BOOL)) { /* move us to that location */ - session_dir = prte_process_info.jobfam_session_dir; + job = (prte_job_t*)app->job; + session_dir = job->session_dir; if (NULL == session_dir) { - /* if no job family session dir was provided - - * use the job session dir */ - session_dir = prte_process_info.job_session_dir; + // cannot do it + return PRTE_ERROR; } if (0 != chdir(session_dir)) { return PRTE_ERROR; @@ -987,7 +978,7 @@ void prte_odls_base_spawn_proc(int fd, short sd, void *cbdata) PRTE_FLAG_UNSET(child, PRTE_PROC_FLAG_WAITPID); /* setup the pmix environment */ - PMIX_LOAD_PROCID(&pproc, child->job->nspace, child->name.rank); + PMIX_LOAD_PROCID(&pproc, child->name.nspace, child->name.rank); if (PMIX_SUCCESS != (ret = PMIx_server_setup_fork(&pproc, &cd->env))) { PMIX_ERROR_LOG(ret); rc = PRTE_ERROR; @@ -1815,8 +1806,8 @@ int prte_odls_base_default_kill_local_procs(pmix_pointer_array_t *procs, * job could be given as a WILDCARD value, we must * check for that as well as for equality. */ - if (!PMIX_NSPACE_INVALID(proc->name.nspace) - && !PMIX_CHECK_NSPACE(proc->name.nspace, child->name.nspace)) { + if (!PMIX_NSPACE_INVALID(proc->name.nspace) && + !PMIX_CHECK_NSPACE(proc->name.nspace, child->name.nspace)) { PMIX_OUTPUT_VERBOSE((5, prte_odls_base_framework.framework_output, "%s odls:kill_local_proc child %s is not part of job %s", @@ -1898,8 +1889,6 @@ int prte_odls_base_default_kill_local_procs(pmix_pointer_array_t *procs, continue; CLEANUP: - /* ensure the child's session directory is cleaned up */ - prte_session_dir_finalize(&child->name); /* check for everything complete - this will remove * the child object from our local list */ @@ -1962,13 +1951,12 @@ int prte_odls_base_default_kill_local_procs(pmix_pointer_array_t *procs, cd->child->state = PRTE_PROC_STATE_KILLED_BY_CMD; /* we ordered it to die */ } - /* ensure the child's session directory is cleaned up */ - prte_session_dir_finalize(&cd->child->name); /* check for everything complete - this will remove * the child object from our local list */ - if (!prte_finalizing && PRTE_FLAG_TEST(cd->child, PRTE_PROC_FLAG_IOF_COMPLETE) - && PRTE_FLAG_TEST(cd->child, PRTE_PROC_FLAG_WAITPID)) { + if (!prte_finalizing && + PRTE_FLAG_TEST(cd->child, PRTE_PROC_FLAG_IOF_COMPLETE) && + PRTE_FLAG_TEST(cd->child, PRTE_PROC_FLAG_WAITPID)) { PRTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state); } } diff --git a/src/mca/oob/tcp/oob_tcp_listener.c b/src/mca/oob/tcp/oob_tcp_listener.c index 821a9c36d7..90cf611878 100644 --- a/src/mca/oob/tcp/oob_tcp_listener.c +++ b/src/mca/oob/tcp/oob_tcp_listener.c @@ -16,7 +16,7 @@ * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2015-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -800,7 +800,6 @@ static void connection_event_handler(int incoming_sd, short flags, void *cbdata) pmix_show_help("help-oob-tcp.txt", "accept failed", true, prte_process_info.nodename, prte_socket_errno, strerror(prte_socket_errno), "Out of file descriptors"); - prte_errmgr.abort(PRTE_ERROR_DEFAULT_EXIT_CODE, NULL); return; } diff --git a/src/mca/plm/base/plm_base_receive.c b/src/mca/plm/base/plm_base_receive.c index 5fce679cbc..740fc02764 100644 --- a/src/mca/plm/base/plm_base_receive.c +++ b/src/mca/plm/base/plm_base_receive.c @@ -17,7 +17,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -111,7 +111,8 @@ int prte_plm_base_comm_stop(void) } /* process incoming messages in order of receipt */ -void prte_plm_base_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buffer, +void prte_plm_base_recv(int status, pmix_proc_t *sender, + pmix_data_buffer_t *buffer, prte_rml_tag_t tag, void *cbdata) { prte_plm_cmd_flag_t command; @@ -232,7 +233,8 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buf } /* get the parent's job object */ - if (NULL != (parent = prte_get_job_data_object(nptr->nspace))) { + if (NULL != (parent = prte_get_job_data_object(nptr->nspace)) && + !PMIX_CHECK_NSPACE(parent->nspace, PRTE_PROC_MY_NAME->nspace)) { /* link the spawned job to the spawner */ PMIX_RETAIN(jdata); pmix_list_append(&parent->children, &jdata->super); diff --git a/src/mca/rmaps/base/rmaps_base_support_fns.c b/src/mca/rmaps/base/rmaps_base_support_fns.c index 5aa6f473a9..833aea2262 100644 --- a/src/mca/rmaps/base/rmaps_base_support_fns.c +++ b/src/mca/rmaps/base/rmaps_base_support_fns.c @@ -14,7 +14,7 @@ * All rights reserved. * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2016-2021 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -540,7 +540,6 @@ prte_proc_t *prte_rmaps_base_setup_proc(prte_job_t *jdata, proc = PMIX_NEW(prte_proc_t); /* set the jobid */ PMIX_LOAD_NSPACE(proc->name.nspace, jdata->nspace); - proc->job = jdata; /* flag the proc as ready for launch */ proc->state = PRTE_PROC_STATE_INIT; proc->app_idx = idx; diff --git a/src/mca/rmaps/rank_file/rmaps_rank_file.c b/src/mca/rmaps/rank_file/rmaps_rank_file.c index 1bee7d5aa8..b8316e0a8e 100644 --- a/src/mca/rmaps/rank_file/rmaps_rank_file.c +++ b/src/mca/rmaps/rank_file/rmaps_rank_file.c @@ -19,7 +19,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2022 IBM Corporation. All rights reserved. * - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -783,9 +783,9 @@ static int prte_rmaps_rf_lsf_convert_affinity_to_rankfile(char *affinity_file, c } // session dir + / (1) + lsf_rf. (7) + XXXXXX (6) + \0 (1) - len = strlen(prte_process_info.jobfam_session_dir) + 1 + 7 + 6 + 1; + len = strlen(prte_process_info.top_session_dir) + 1 + 7 + 6 + 1; (*aff_rankfile) = (char*) malloc(sizeof(char) * len); - sprintf(*aff_rankfile, "%s/lsf_rf.XXXXXX", prte_process_info.jobfam_session_dir); + sprintf(*aff_rankfile, "%s/lsf_rf.XXXXXX", prte_process_info.top_session_dir); /* open the file */ fp = fopen(affinity_file, "r"); diff --git a/src/mca/state/base/state_base_fns.c b/src/mca/state/base/state_base_fns.c index 4d529db09f..a67250e689 100644 --- a/src/mca/state/base/state_base_fns.c +++ b/src/mca/state/base/state_base_fns.c @@ -5,7 +5,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -544,12 +544,6 @@ void prte_state_base_track_procs(int fd, short argc, void *cbdata) PMIx_server_deregister_client(proc, opcbfunc, &lock); PRTE_PMIX_WAIT_THREAD(&lock); PRTE_PMIX_DESTRUCT_LOCK(&lock); - - /* Clean up the session directory as if we were the process - * itself. This covers the case where the process died abnormally - * and didn't cleanup its own session directory. - */ - prte_session_dir_finalize(proc); } /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs diff --git a/src/mca/state/dvm/state_dvm.c b/src/mca/state/dvm/state_dvm.c index 5401fc7120..1551f8221b 100644 --- a/src/mca/state/dvm/state_dvm.c +++ b/src/mca/state/dvm/state_dvm.c @@ -4,7 +4,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -28,6 +28,7 @@ #include "src/util/pmix_os_dirpath.h" #include "src/util/pmix_output.h" #include "src/util/proc_info.h" +#include "src/util/session_dir.h" #include "src/mca/errmgr/errmgr.h" #include "src/mca/filem/filem.h" @@ -590,7 +591,7 @@ static void check_complete(int fd, short args, void *cbdata) PMIX_LOAD_PROCID(&pname, jdata->nspace, PMIX_RANK_WILDCARD); prte_pmix_server_clear(&pname); - /* cleanup the procs as these are gone */ + /* cleanup the local procs as these are gone */ for (i = 0; i < prte_local_children->size; i++) { if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(prte_local_children, i))) { continue; @@ -666,6 +667,7 @@ static void check_complete(int fd, short args, void *cbdata) PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); terminate_dvm = true; // flag that the DVM is to terminate PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_NOTIFY_COMPLETED); + PMIX_RELEASE(caddy); return; } @@ -834,15 +836,6 @@ static void check_complete(int fd, short args, void *cbdata) PMIX_DESTRUCT(&procs); } - /* remove the session directory tree */ - if (0 > pmix_asprintf(&tmp, "%s/%u", prte_process_info.jobfam_session_dir, - PRTE_LOCAL_JOBID(jdata->nspace))) { - PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE); - } else { - pmix_os_dirpath_destroy(tmp, true, NULL); - free(tmp); - } - if (jdata->state != PRTE_JOB_STATE_NOTIFIED) { PMIX_OUTPUT_VERBOSE((2, prte_state_base_framework.framework_output, "%s state:dvm:check_job_completed state is terminated - activating notify", @@ -852,6 +845,7 @@ static void check_complete(int fd, short args, void *cbdata) jdata->state = PRTE_JOB_STATE_NOTIFIED; } + PMIX_POST_OBJECT(jdata); PMIX_RELEASE(caddy); } @@ -866,7 +860,9 @@ static void cleanup_job(int sd, short args, void *cbdata) dvm_terminated = true; prte_plm.terminate_orteds(); } - + if (NULL != caddy->jdata) { + PMIX_RELEASE(caddy->jdata); + } PMIX_RELEASE(caddy); } @@ -960,6 +956,7 @@ static void dvm_notify(int sd, short args, void *cbdata) PMIX_ERROR_LOG(ret); PMIX_INFO_FREE(info, ninfo); PMIX_DATA_BUFFER_DESTRUCT(&pbkt); + PMIX_RELEASE(caddy); return; } /* pack the source - it cannot be me as that will cause @@ -969,6 +966,7 @@ static void dvm_notify(int sd, short args, void *cbdata) PMIX_ERROR_LOG(ret); PMIX_INFO_FREE(info, ninfo); PMIX_DATA_BUFFER_DESTRUCT(&pbkt); + PMIX_RELEASE(caddy); return; } /* pack the range */ @@ -976,6 +974,7 @@ static void dvm_notify(int sd, short args, void *cbdata) PMIX_ERROR_LOG(ret); PMIX_INFO_FREE(info, ninfo); PMIX_DATA_BUFFER_DESTRUCT(&pbkt); + PMIX_RELEASE(caddy); return; } /* pack the number of infos */ @@ -983,6 +982,7 @@ static void dvm_notify(int sd, short args, void *cbdata) PMIX_ERROR_LOG(ret); PMIX_INFO_FREE(info, ninfo); PMIX_DATA_BUFFER_DESTRUCT(&pbkt); + PMIX_RELEASE(caddy); return; } /* pack the infos themselves */ @@ -990,6 +990,7 @@ static void dvm_notify(int sd, short args, void *cbdata) PMIX_ERROR_LOG(ret); PMIX_INFO_FREE(info, ninfo); PMIX_DATA_BUFFER_DESTRUCT(&pbkt); + PMIX_RELEASE(caddy); return; } PMIX_INFO_FREE(info, ninfo); @@ -1003,6 +1004,7 @@ static void dvm_notify(int sd, short args, void *cbdata) PMIX_ERROR_LOG(rc); PMIX_DATA_BUFFER_DESTRUCT(&pbkt); PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(caddy); return; } rc = PMIx_Data_copy_payload(reply, &pbkt); @@ -1011,6 +1013,7 @@ static void dvm_notify(int sd, short args, void *cbdata) if (PMIX_SUCCESS != rc) { PMIX_ERROR_LOG(rc); PMIX_DATA_BUFFER_RELEASE(reply); + PMIX_RELEASE(caddy); return; } @@ -1023,6 +1026,7 @@ static void dvm_notify(int sd, short args, void *cbdata) PRTE_ERROR_LOG(rc); PMIX_DATA_BUFFER_RELEASE(reply); PMIX_PROC_FREE(sig.signature, 1); + PMIX_RELEASE(caddy); return; } PMIX_OUTPUT_VERBOSE((2, prte_state_base_framework.framework_output, @@ -1060,11 +1064,12 @@ static void dvm_notify(int sd, short args, void *cbdata) prte_grpcomm.xcast(&sig, PRTE_RML_TAG_DAEMON, reply); PMIX_DATA_BUFFER_RELEASE(reply); PMIX_PROC_FREE(sig.signature, 1); - PMIX_RELEASE(caddy); } // We are done with our use of job data and have notified the other daemons if (notify) { PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_NOTIFIED); } + + PMIX_RELEASE(caddy); } diff --git a/src/mca/state/prted/state_prted.c b/src/mca/state/prted/state_prted.c index 43a9789cbb..e0f039b695 100644 --- a/src/mca/state/prted/state_prted.c +++ b/src/mca/state/prted/state_prted.c @@ -4,8 +4,7 @@ * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2020-2021 IBM Corporation. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2022 Consulting. All rights reserved. - * Copyright (c) 2022-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -476,11 +475,6 @@ static void track_procs(int fd, short argc, void *cbdata) PRTE_FLAG_SET(pdata, PRTE_PROC_FLAG_RECORDED); PRTE_FLAG_UNSET(pdata, PRTE_PROC_FLAG_ALIVE); pdata->state = state; - /* Clean up the session directory as if we were the process - * itself. This covers the case where the process died abnormally - * and didn't cleanup its own session directory. - */ - prte_session_dir_finalize(proc); /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) diff --git a/src/mca/state/state.h b/src/mca/state/state.h index 61194f2cf6..9c6270ff99 100644 --- a/src/mca/state/state.h +++ b/src/mca/state/state.h @@ -4,7 +4,7 @@ * reserved. * Copyright (c) 2017-2019 Intel, Inc. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow diff --git a/src/prted/pmix/pmix_server.c b/src/prted/pmix/pmix_server.c index d723bd39e2..ccce0f98e6 100644 --- a/src/prted/pmix/pmix_server.c +++ b/src/prted/pmix/pmix_server.c @@ -18,7 +18,7 @@ * All rights reserved. * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * Copyright (c) 2023 Triad National Security, LLC. All rights reserved. * $COPYRIGHT$ * @@ -529,10 +529,6 @@ static void lost_connection_hdlr(size_t evhdlr_registration_id, pmix_status_t st PMIX_LIST_FOREACH(tl, &prte_pmix_server_globals.tools, prte_pmix_tool_t) { if (PMIX_CHECK_PROCID(&tl->name, source)) { - /* remove the session directory we created for it */ - if (NULL != tl->nsdir) { - pmix_os_dirpath_destroy(tl->nsdir, true, NULL); - } /* take this tool off the list */ pmix_list_remove_item(&prte_pmix_server_globals.tools, &tl->super); /* release it */ @@ -637,7 +633,7 @@ int pmix_server_init(void) /* tell the server our temp directory */ PMIX_INFO_LIST_ADD(prc, ilist, PMIX_SERVER_TMPDIR, - prte_process_info.jobfam_session_dir, + prte_process_info.top_session_dir, PMIX_STRING); if (PMIX_SUCCESS != prc) { PMIX_INFO_LIST_RELEASE(ilist); @@ -2024,16 +2020,6 @@ PMIX_CLASS_INSTANCE(pmix_server_pset_t, pmix_list_item_t, pscon, psdes); -static void tlcon(prte_pmix_tool_t *p) -{ - p->nsdir = NULL; -} -static void tldes(prte_pmix_tool_t *p) -{ - if (NULL != p->nsdir) { - free(p->nsdir); - } -} PMIX_CLASS_INSTANCE(prte_pmix_tool_t, pmix_list_item_t, - tlcon, tldes); + NULL, NULL); diff --git a/src/prted/pmix/pmix_server_dyn.c b/src/prted/pmix/pmix_server_dyn.c index 992f172fcc..3720158df6 100644 --- a/src/prted/pmix/pmix_server_dyn.c +++ b/src/prted/pmix/pmix_server_dyn.c @@ -18,7 +18,7 @@ * All rights reserved. * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -242,6 +242,7 @@ static void interim(int sd, short args, void *cbdata) for (n = 0; n < cd->napps; n++) { papp = &cd->apps[n]; app = PMIX_NEW(prte_app_context_t); + app->job = (struct prte_job_t*)jdata; app->idx = pmix_pointer_array_add(jdata->apps, app); jdata->num_apps++; if (NULL != papp->cmd) { diff --git a/src/prted/pmix/pmix_server_internal.h b/src/prted/pmix/pmix_server_internal.h index cde95efc7d..4508c34b98 100644 --- a/src/prted/pmix/pmix_server_internal.h +++ b/src/prted/pmix/pmix_server_internal.h @@ -18,7 +18,7 @@ * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -135,7 +135,6 @@ PMIX_CLASS_DECLARATION(prte_pmix_server_op_caddy_t); typedef struct { pmix_list_item_t super; pmix_proc_t name; - char *nsdir; } prte_pmix_tool_t; PMIX_CLASS_DECLARATION(prte_pmix_tool_t); diff --git a/src/prted/pmix/pmix_server_register_fns.c b/src/prted/pmix/pmix_server_register_fns.c index 396efd1dd2..e46fffb60e 100644 --- a/src/prted/pmix/pmix_server_register_fns.c +++ b/src/prted/pmix/pmix_server_register_fns.c @@ -19,7 +19,7 @@ * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017-2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -46,6 +46,7 @@ #include "src/util/error.h" #include "src/util/pmix_os_dirpath.h" #include "src/util/pmix_output.h" +#include "src/util/pmix_printf.h" #include "types.h" #include "src/mca/errmgr/errmgr.h" @@ -53,6 +54,7 @@ #include "src/runtime/prte_globals.h" #include "src/runtime/prte_wait.h" #include "src/util/name_fns.h" +#include "src/util/session_dir.h" #include "src/prted/pmix/pmix_server.h" #include "src/prted/pmix/pmix_server_internal.h" @@ -97,7 +99,8 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) pmix_info_t *iptr; void *next; - pmix_output_verbose(2, prte_pmix_server_globals.output, "%s register nspace for %s", + pmix_output_verbose(2, prte_pmix_server_globals.output, + "%s register nspace for %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_JOBID_PRINT(jdata->nspace)); /* setup the info list */ @@ -319,25 +322,20 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) /* tell the user what we did with FQDN */ PMIX_INFO_LIST_ADD(ret, info, PMIX_HOSTNAME_KEEP_FQDN, &prte_keep_fqdn_hostnames, PMIX_BOOL); - /* pass the top-level session directory - this is our jobfam session dir */ - PMIX_INFO_LIST_ADD(ret, info, PMIX_TMPDIR, prte_process_info.jobfam_session_dir, PMIX_STRING); + /* pass the top-level session directory */ + PMIX_INFO_LIST_ADD(ret, info, PMIX_TMPDIR, prte_process_info.top_session_dir, PMIX_STRING); /* create and pass a job-level session directory */ - if (0 > pmix_asprintf(&tmp, "%s/%u", prte_process_info.jobfam_session_dir, - PRTE_LOCAL_JOBID(jdata->nspace))) { - PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE); - PMIX_INFO_LIST_RELEASE(info); - return PRTE_ERR_OUT_OF_RESOURCE; - } - rc = pmix_os_dirpath_create(prte_process_info.jobfam_session_dir, S_IRWXU); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); + pproc.rank = PMIX_RANK_INVALID; + rc = prte_session_dir(&pproc); + if (PRTE_SUCCESS != rc) { + PRTE_ERROR_LOG(rc); PMIX_INFO_LIST_RELEASE(info); rc = prte_pmix_convert_status(rc); return rc; } - PMIX_INFO_LIST_ADD(ret, info, PMIX_NSDIR, tmp, PMIX_STRING); - free(tmp); + // job session dir will have been stored in the jdata object + PMIX_INFO_LIST_ADD(ret, info, PMIX_NSDIR, jdata->session_dir, PMIX_STRING); /* check for output directives */ fptr = &flag; @@ -537,20 +535,16 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) } if (PRTE_PROC_MY_NAME->rank == node->daemon->name.rank) { /* create and pass a proc-level session directory */ - if (0 > pmix_asprintf(&tmp, "%s/%u/%u", prte_process_info.jobfam_session_dir, - PRTE_LOCAL_JOBID(jdata->nspace), pptr->name.rank)) { - PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE); - PMIX_INFO_LIST_RELEASE(info); - PMIX_INFO_LIST_RELEASE(pmap); - return PRTE_ERR_OUT_OF_RESOURCE; - } - if (PMIX_SUCCESS != (rc = pmix_os_dirpath_create(tmp, S_IRWXU))) { + rc = prte_session_dir(&pptr->name); + if (PRTE_SUCCESS != rc) { PMIX_ERROR_LOG(rc); PMIX_INFO_LIST_RELEASE(info); PMIX_INFO_LIST_RELEASE(pmap); rc = prte_pmix_convert_status(rc); return rc; } + pmix_asprintf(&tmp, "%s/%s", jdata->session_dir, + PMIX_RANK_PRINT(pptr->name.rank)); PMIX_INFO_LIST_ADD(ret, pmap, PMIX_PROCDIR, tmp, PMIX_STRING); free(tmp); } @@ -738,71 +732,27 @@ static void opcbfunc(pmix_status_t status, void *cbdata) /* add any info that the tool couldn't self-assign */ int prte_pmix_server_register_tool(pmix_nspace_t nspace) { - void *ilist; pmix_status_t ret; - char *tmp; - pmix_data_array_t darray; - pmix_info_t *iptr; - size_t ninfo; prte_pmix_lock_t lock; int rc; prte_pmix_tool_t *tl; - PMIX_INFO_LIST_START(ilist); - - PMIX_INFO_LIST_ADD(ret, ilist, PMIX_TMPDIR, - prte_process_info.jobfam_session_dir, PMIX_STRING); - - /* create and pass a job-level session directory */ - if (0 > pmix_asprintf(&tmp, "%s/%u", prte_process_info.jobfam_session_dir, - PRTE_LOCAL_JOBID(nspace))) { - PRTE_ERROR_LOG(PRTE_ERR_OUT_OF_RESOURCE); - return PRTE_ERR_OUT_OF_RESOURCE; - } - rc = pmix_os_dirpath_create(tmp, S_IRWXU); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - free(tmp); - rc = prte_pmix_convert_status(rc); - return rc; - } - PMIX_INFO_LIST_ADD(ret, ilist, PMIX_NSDIR, tmp, PMIX_STRING); - /* record this tool */ tl = PMIX_NEW(prte_pmix_tool_t); PMIX_LOAD_PROCID(&tl->name, nspace, 0); - tl->nsdir = tmp; pmix_list_append(&prte_pmix_server_globals.tools, &tl->super); - /* pass it down */ - PMIX_INFO_LIST_CONVERT(ret, ilist, &darray); - if (PMIX_ERR_EMPTY == ret) { - iptr = NULL; - ninfo = 0; - } else if (PMIX_SUCCESS != ret) { - PMIX_ERROR_LOG(ret); - rc = prte_pmix_convert_status(ret); - PMIX_INFO_LIST_RELEASE(ilist); - return rc; - } else { - iptr = (pmix_info_t *) darray.array; - ninfo = darray.size; - } - PMIX_INFO_LIST_RELEASE(ilist); - PRTE_PMIX_CONSTRUCT_LOCK(&lock); - ret = PMIx_server_register_nspace(nspace, 1, iptr, ninfo, + ret = PMIx_server_register_nspace(nspace, 1, NULL, 0, opcbfunc, &lock); if (PMIX_SUCCESS != ret) { PMIX_ERROR_LOG(ret); rc = prte_pmix_convert_status(ret); - PMIX_INFO_FREE(iptr, ninfo); PRTE_PMIX_DESTRUCT_LOCK(&lock); return rc; } PRTE_PMIX_WAIT_THREAD(&lock); rc = lock.status; PRTE_PMIX_DESTRUCT_LOCK(&lock); - PMIX_INFO_FREE(iptr, ninfo); return rc; } diff --git a/src/prted/prted_comm.c b/src/prted/prted_comm.c index 4a15eb0e24..6466f704d4 100644 --- a/src/prted/prted_comm.c +++ b/src/prted/prted_comm.c @@ -18,7 +18,7 @@ * Copyright (c) 2016-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -501,15 +501,7 @@ void prte_daemon_recv(int status, pmix_proc_t *sender, /* cleanup any pending server ops */ PMIX_LOAD_PROCID(&pname, job, PMIX_RANK_WILDCARD); prte_pmix_server_clear(&pname); - /* remove the session directory tree */ - if (0 > pmix_asprintf(&cmd_str, "%s/%d", prte_process_info.jobfam_session_dir, - PRTE_LOCAL_JOBID(jdata->nspace))) { - ret = PRTE_ERR_OUT_OF_RESOURCE; - goto CLEANUP; - } - pmix_os_dirpath_destroy(cmd_str, true, NULL); - free(cmd_str); - cmd_str = NULL; + PMIX_RELEASE(jdata); break; diff --git a/src/runtime/data_type_support/prte_dt_print_fns.c b/src/runtime/data_type_support/prte_dt_print_fns.c index d8aea41942..5916e4b1e3 100644 --- a/src/runtime/data_type_support/prte_dt_print_fns.c +++ b/src/runtime/data_type_support/prte_dt_print_fns.c @@ -14,7 +14,7 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. * $COPYRIGHT$ * @@ -225,7 +225,7 @@ void prte_node_print(char **output, prte_job_t *jdata, prte_node_t *src) if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(src->procs, j))) { continue; } - if (proc->job != jdata) { + if (!PMIX_CHECK_NSPACE(proc->name.nspace, jdata->nspace)) { continue; } prte_proc_print(&tmp2, jdata, proc); diff --git a/src/runtime/prte_finalize.c b/src/runtime/prte_finalize.c index 68e4be855d..1e746ac517 100644 --- a/src/runtime/prte_finalize.c +++ b/src/runtime/prte_finalize.c @@ -16,7 +16,7 @@ * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2022 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All Rights * reserved. * $COPYRIGHT$ @@ -48,8 +48,13 @@ int prte_finalize(void) { - int rc, n; + int rc, n, i; prte_job_t *jdata = NULL, *child_jdata = NULL, *next_jdata = NULL; + prte_app_context_t *app; + prte_proc_t *p; + pmix_pointer_array_t *array; + prte_node_t *node; + prte_topology_t *topo; PMIX_ACQUIRE_THREAD(&prte_init_lock); if (!prte_initialized) { @@ -70,17 +75,23 @@ int prte_finalize(void) /* release the cache */ PMIX_RELEASE(prte_cache); - /* Release the job hash table - * - * There is the potential for a prte_job_t object to still be in the - * children list of another prte_job_t object, both objects stored in the - * prte_job_data array. If this happens then an assert will be raised - * when the first prte_job_t object is released when iterating over the - * prte_job_data structure. Therefore, we traverse the children list of - * every prte_job_t in the prte_job_data hash, removing all children - * references before iterating over the prte_job_data hash table to - * release the prte_job_t objects. - */ + /* call the finalize function for this environment */ + if (PRTE_SUCCESS != (rc = prte_ess.finalize())) { + return rc; + } + (void) pmix_mca_base_framework_close(&prte_ess_base_framework); + + // clean up the node array + for (n = 0; n < prte_node_pool->size; n++) { + node = (prte_node_t *) pmix_pointer_array_get_item(prte_node_pool, n); + if (NULL == node) { + continue; + } + pmix_pointer_array_set_item(prte_node_pool, n, NULL); + PMIX_RELEASE(node); + } + PMIX_RELEASE(prte_node_pool); + for (n = 0; n < prte_job_data->size; n++) { jdata = (prte_job_t *) pmix_pointer_array_get_item(prte_job_data, n); if (NULL == jdata) { @@ -93,61 +104,42 @@ int prte_finalize(void) { pmix_list_remove_item(&jdata->children, &child_jdata->super); } + /* clean up any app contexts as they refcount the jdata object */ + for (i=0; i < jdata->apps->size; i++) { + app = (prte_app_context_t*)pmix_pointer_array_get_item(jdata->apps, i); + if (NULL != app) { + pmix_pointer_array_set_item(jdata->apps, i, NULL); + PMIX_RELEASE(app); + } + } + // clean up any procs + for (i=0; i < jdata->procs->size; i++) { + p = (prte_proc_t*)pmix_pointer_array_get_item(jdata->procs, i); + if (NULL != p) { + pmix_pointer_array_set_item(jdata->procs, i, NULL); + PMIX_RELEASE(p); + } + } + pmix_pointer_array_set_item(prte_job_data, n, NULL); PMIX_RELEASE(jdata); } PMIX_RELEASE(prte_job_data); - { - pmix_pointer_array_t *array = prte_node_topologies; - int i; - if (array->number_free != array->size) { - array->lowest_free = 0; - array->number_free = array->size; - for (i = 0; i < array->size; i++) { - if (NULL != array->addr[i]) { - prte_topology_t *topo = (prte_topology_t *) array->addr[i]; - topo->topo = NULL; - PMIX_RELEASE(topo); - } - array->addr[i] = NULL; - } + for (n = 0; n < prte_node_topologies->size; n++) { + topo = (prte_topology_t *) pmix_pointer_array_get_item(prte_node_topologies, n); + if (NULL == topo) { + continue; } + pmix_pointer_array_set_item(prte_node_topologies, n, NULL); + PMIX_RELEASE(topo); } PMIX_RELEASE(prte_node_topologies); - { - pmix_pointer_array_t *array = prte_node_pool; - int i; - prte_node_t *node; - if (array->number_free != array->size) { - array->lowest_free = 0; - array->number_free = array->size; - for (i = 0; i < array->size; i++) { - if (NULL != array->addr[i]) { - node = (prte_node_t *) array->addr[i]; - if (NULL != node) { - if (NULL != node->daemon) { - PMIX_RELEASE(node->daemon); - } - PMIX_RELEASE(node); - } - } - array->addr[i] = NULL; - } - } - } - PMIX_RELEASE(prte_node_pool); - /* Close the general debug stream */ pmix_output_close(prte_debug_output); pmix_mca_base_alias_cleanup(); - /* call the finalize function for this environment */ - if (PRTE_SUCCESS != (rc = prte_ess.finalize())) { - return rc; - } - (void) pmix_mca_base_framework_close(&prte_ess_base_framework); prte_proc_info_finalize(); pmix_output_finalize(); diff --git a/src/runtime/prte_globals.c b/src/runtime/prte_globals.c index 2ec09b7681..b277554273 100644 --- a/src/runtime/prte_globals.c +++ b/src/runtime/prte_globals.c @@ -17,7 +17,7 @@ * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017-2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -43,11 +43,14 @@ #include "src/mca/errmgr/errmgr.h" #include "src/mca/rmaps/rmaps.h" #include "src/rml/rml.h" +#include "src/mca/state/state.h" + #include "src/util/pmix_argv.h" #include "src/util/name_fns.h" #include "src/util/pmix_net.h" #include "src/util/pmix_output.h" #include "src/util/proc_info.h" +#include "src/util/session_dir.h" #include "src/runtime/prte_globals.h" #include "src/runtime/runtime.h" @@ -404,6 +407,7 @@ bool prte_nptr_match(prte_node_t *n1, prte_node_t *n2) static void prte_app_context_construct(prte_app_context_t *app_context) { + app_context->job = NULL; app_context->idx = 0; app_context->app = NULL; app_context->num_procs = 0; @@ -465,6 +469,7 @@ static void prte_job_construct(prte_job_t *job) job->personality = NULL; job->schizo = NULL; PMIX_LOAD_NSPACE(job->nspace, NULL); + job->session_dir = NULL; job->index = -1; job->offset = 0; job->apps = PMIX_NEW(pmix_pointer_array_t); @@ -516,14 +521,10 @@ static void prte_job_destruct(prte_job_t *job) return; } - if (prte_debug_flag) { - pmix_output(0, "%s Releasing job data for %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), - PRTE_JOBID_PRINT(job->nspace)); - } - if (NULL != job->personality) { PMIX_ARGV_FREE_COMPAT(job->personality); } + for (n = 0; n < job->apps->size; n++) { if (NULL == (app = (prte_app_context_t *) pmix_pointer_array_get_item(job->apps, n))) { continue; @@ -568,6 +569,7 @@ static void prte_job_destruct(prte_job_t *job) if (NULL == (proc = (prte_proc_t *) pmix_pointer_array_get_item(job->procs, n))) { continue; } + pmix_pointer_array_set_item(job->procs, n, NULL); PMIX_RELEASE(proc); } PMIX_RELEASE(job->procs); @@ -585,6 +587,14 @@ static void prte_job_destruct(prte_job_t *job) PMIX_LIST_DESTRUCT(&job->children); + if (NULL != job->session_dir) { + prte_job_session_dir_finalize(job); + if (NULL != job->session_dir) { + free(job->session_dir); + job->session_dir = NULL; + } + } + if (NULL != prte_job_data && 0 <= job->index) { /* remove the job from the global array */ pmix_pointer_array_set_item(prte_job_data, job->index, NULL); @@ -669,13 +679,12 @@ static void prte_node_destruct(prte_node_t *node) PMIX_LIST_DESTRUCT(&node->attributes); } -PMIX_CLASS_INSTANCE(prte_node_t, pmix_list_item_t, prte_node_construct, prte_node_destruct); +PMIX_CLASS_INSTANCE(prte_node_t, pmix_list_item_t, + prte_node_construct, prte_node_destruct); static void prte_proc_construct(prte_proc_t *proc) { proc->name = *PRTE_NAME_INVALID; - proc->job = NULL; - proc->rank = PMIX_RANK_INVALID; proc->parent = PMIX_RANK_INVALID; proc->pid = 0; proc->local_rank = PRTE_LOCAL_RANK_INVALID; @@ -712,7 +721,8 @@ static void prte_proc_destruct(prte_proc_t *proc) PMIX_LIST_DESTRUCT(&proc->attributes); } -PMIX_CLASS_INSTANCE(prte_proc_t, pmix_list_item_t, prte_proc_construct, prte_proc_destruct); +PMIX_CLASS_INSTANCE(prte_proc_t, pmix_list_item_t, + prte_proc_construct, prte_proc_destruct); static void prte_job_map_construct(prte_job_map_t *map) { @@ -750,7 +760,8 @@ static void prte_job_map_destruct(prte_job_map_t *map) PMIX_RELEASE(map->nodes); } -PMIX_CLASS_INSTANCE(prte_job_map_t, pmix_object_t, prte_job_map_construct, prte_job_map_destruct); +PMIX_CLASS_INSTANCE(prte_job_map_t, pmix_object_t, + prte_job_map_construct, prte_job_map_destruct); static void prte_attr_cons(prte_attribute_t *p) { @@ -762,7 +773,8 @@ static void prte_attr_des(prte_attribute_t *p) { PMIX_VALUE_DESTRUCT(&p->data); } -PMIX_CLASS_INSTANCE(prte_attribute_t, pmix_list_item_t, prte_attr_cons, prte_attr_des); +PMIX_CLASS_INSTANCE(prte_attribute_t, pmix_list_item_t, + prte_attr_cons, prte_attr_des); static void tcon(prte_topology_t *t) { @@ -778,7 +790,8 @@ static void tdes(prte_topology_t *t) free(t->sig); } } -PMIX_CLASS_INSTANCE(prte_topology_t, pmix_object_t, tcon, tdes); +PMIX_CLASS_INSTANCE(prte_topology_t, pmix_object_t, + tcon, tdes); #if PRTE_PICKY_COMPILERS void prte_hide_unused_params(int x, ...) diff --git a/src/runtime/prte_globals.h b/src/runtime/prte_globals.h index e2f795b5ed..7dd57cbe12 100644 --- a/src/runtime/prte_globals.h +++ b/src/runtime/prte_globals.h @@ -17,7 +17,7 @@ * Copyright (c) 2017-2020 IBM Corporation. All rights reserved. * Copyright (c) 2017-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -191,6 +191,7 @@ typedef uint16_t prte_job_controls_t; * defining it - resolves potential circular definition */ struct prte_proc_t; +struct prte_job_t; struct prte_job_map_t; struct prte_schizo_base_module_t; @@ -211,6 +212,8 @@ PRTE_EXPORT PMIX_CLASS_DECLARATION(prte_topology_t); typedef struct { /** Parent object */ pmix_object_t super; + /** the job this app belongs to */ + struct prte_job_t *job; /** Unique index when multiple apps per job */ prte_app_idx_t idx; /** Absolute pathname of argv[0] */ @@ -312,6 +315,8 @@ typedef struct { struct prte_schizo_base_module_t *schizo; /* jobid for this job */ pmix_nspace_t nspace; + // session directory for this job + char *session_dir; int index; // index in the job array where this is stored /* offset to the total number of procs so shared memory * components can potentially connect to any spawned jobs*/ @@ -377,8 +382,6 @@ struct prte_proc_t { pmix_list_item_t super; /* process name */ pmix_proc_t name; - prte_job_t *job; - pmix_rank_t rank; /* the vpid of my parent - the daemon vpid for an app * or the vpid of the parent in the routing tree of * a daemon */ diff --git a/src/runtime/prte_mca_params.c b/src/runtime/prte_mca_params.c index 26d1ea3491..090968a3a6 100644 --- a/src/runtime/prte_mca_params.c +++ b/src/runtime/prte_mca_params.c @@ -17,7 +17,7 @@ * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -55,7 +55,6 @@ static char *prte_tmpdir_base = NULL; static char *prte_local_tmpdir_base = NULL; static char *prte_remote_tmpdir_base = NULL; static char *prte_top_session_dir = NULL; -static char *prte_jobfam_session_dir = NULL; static char *local_setup_slots = NULL; char *prte_signal_string = NULL; diff --git a/src/tools/prte/prte.c b/src/tools/prte/prte.c index 389c49a950..f7a66cc052 100644 --- a/src/tools/prte/prte.c +++ b/src/tools/prte/prte.c @@ -19,7 +19,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 Geoffroy Vallee. All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All Rights * reserved. * Copyright (c) 2022-2023 Triad National Security, LLC. All rights @@ -1376,7 +1376,7 @@ static void abort_signal_callback(int fd) second = false; } else { surekill(); // ensure we attempt to kill everything - pmix_os_dirpath_destroy(prte_process_info.jobfam_session_dir, true, NULL); + pmix_os_dirpath_destroy(prte_process_info.top_session_dir, true, NULL); exit(1); } } @@ -1428,16 +1428,12 @@ static int prep_singleton(const char *name) /* create a proc for the singleton */ proc = PMIX_NEW(prte_proc_t); PMIX_LOAD_PROCID(&proc->name, jdata->nspace, rank); - proc->rank = proc->name.rank; proc->parent = PRTE_PROC_MY_NAME->rank; proc->app_idx = 0; proc->app_rank = rank; proc->local_rank = 0; proc->node_rank = 0; proc->state = PRTE_PROC_STATE_RUNNING; - /* link it to the job */ - PMIX_RETAIN(jdata); - proc->job = jdata; /* link it to the app */ PMIX_RETAIN(proc); pmix_pointer_array_set_item(&app->procs, rank, proc); diff --git a/src/util/proc_info.c b/src/util/proc_info.c index dade232108..0923b962eb 100644 --- a/src/util/proc_info.c +++ b/src/util/proc_info.c @@ -14,7 +14,7 @@ * All rights reserved. * Copyright (c) 2014-2020 Intel, Inc. All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -53,10 +53,10 @@ extern bool prte_keep_fqdn_hostnames; PRTE_EXPORT prte_process_info_t prte_process_info = { - .myproc = {{0}, 0}, - .my_hnp = {{0}, 0}, + .myproc = PMIX_PROC_STATIC_INIT, + .my_hnp = PMIX_PROC_STATIC_INIT, .my_hnp_uri = NULL, - .my_parent = {{0}, 0}, + .my_parent = PMIX_PROC_STATIC_INIT, .hnp_pid = 0, .num_daemons = 1, .num_nodes = 1, @@ -65,15 +65,8 @@ PRTE_EXPORT prte_process_info_t prte_process_info = { .pid = 0, .proc_type = PRTE_PROC_TYPE_NONE, .my_port = 0, - .num_restarts = 0, .tmpdir_base = NULL, .top_session_dir = NULL, - .jobfam_session_dir = NULL, - .job_session_dir = NULL, - .proc_session_dir = NULL, - .sock_stdin = NULL, - .sock_stdout = NULL, - .sock_stderr = NULL, .cpuset = NULL, .shared_fs = false }; @@ -225,13 +218,6 @@ int prte_proc_info(void) PMIX_MCA_BASE_VAR_TYPE_INT, &prte_process_info.num_nodes); - /* get the number of times this proc has restarted */ - prte_process_info.num_restarts = 0; - (void) pmix_mca_base_var_register("prte", "prte", NULL, "num_restarts", - "Number of times this proc has restarted", - PMIX_MCA_BASE_VAR_TYPE_INT, - &prte_process_info.num_restarts); - return PRTE_SUCCESS; } @@ -251,21 +237,6 @@ int prte_proc_info_finalize(void) prte_process_info.top_session_dir = NULL; } - if (NULL != prte_process_info.jobfam_session_dir) { - free(prte_process_info.jobfam_session_dir); - prte_process_info.jobfam_session_dir = NULL; - } - - if (NULL != prte_process_info.job_session_dir) { - free(prte_process_info.job_session_dir); - prte_process_info.job_session_dir = NULL; - } - - if (NULL != prte_process_info.proc_session_dir) { - free(prte_process_info.proc_session_dir); - prte_process_info.proc_session_dir = NULL; - } - if (NULL != prte_process_info.nodename) { free(prte_process_info.nodename); prte_process_info.nodename = NULL; @@ -276,21 +247,6 @@ int prte_proc_info_finalize(void) prte_process_info.cpuset = NULL; } - if (NULL != prte_process_info.sock_stdin) { - free(prte_process_info.sock_stdin); - prte_process_info.sock_stdin = NULL; - } - - if (NULL != prte_process_info.sock_stdout) { - free(prte_process_info.sock_stdout); - prte_process_info.sock_stdout = NULL; - } - - if (NULL != prte_process_info.sock_stderr) { - free(prte_process_info.sock_stderr); - prte_process_info.sock_stderr = NULL; - } - prte_process_info.proc_type = PRTE_PROC_TYPE_NONE; PMIX_ARGV_FREE_COMPAT(prte_process_info.aliases); diff --git a/src/util/proc_info.h b/src/util/proc_info.h index c660867c47..85e53ba5af 100644 --- a/src/util/proc_info.h +++ b/src/util/proc_info.h @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2013-2020 Intel, Inc. All rights reserved. * Copyright (c) 2017-2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2023 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -76,7 +76,6 @@ typedef struct prte_process_info_t { pid_t pid; /**< Local process ID for this process */ prte_proc_type_t proc_type; /**< Type of process */ uint16_t my_port; /**< TCP port for out-of-band comm */ - int num_restarts; /**< number of times this proc has restarted */ /* The session directory has the form * ///, where the prefix * can either be provided by the user via the @@ -85,15 +84,8 @@ typedef struct prte_process_info_t { */ char *tmpdir_base; /**< Base directory of the session dir tree */ char *top_session_dir; /**< Top-most directory of the session tree */ - char *jobfam_session_dir; /**< Session directory for this family of jobs (i.e., share same - mpirun) */ - char *job_session_dir; /**< Session directory for job */ - char *proc_session_dir; /**< Session directory for the process */ bool rm_session_dirs; /**< Session directories will be cleaned up by RM */ - char *sock_stdin; /**< Path name to temp file for stdin. */ - char *sock_stdout; /**< Path name to temp file for stdout. */ - char *sock_stderr; /**< Path name to temp file for stderr. */ char *cpuset; /**< String-representation of bitmap where we are bound */ bool shared_fs; // whether the tmpdir is on a shared file system } prte_process_info_t; diff --git a/src/util/session_dir.h b/src/util/session_dir.h index 75c2404e64..6433b89b97 100644 --- a/src/util/session_dir.h +++ b/src/util/session_dir.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2016-2020 Intel, Inc. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -22,69 +22,6 @@ * * Find and/or create PRTE session directory. * - * The prte_session_dir() function searches for a temporary directory - * that is used by the PRTE system for storing system-critical - * information. For a given system and user, the function attempts to - * find (or create, if not found and create is requested) a directory - * that will be used to independently house information for multiple - * universes, as the user creates them. Thus, the function pursues a - * directory tree of the form: - * - * \par \em [prefix-dir] An absolute path that identifies a temporary - * directory that is read-write-execute accessible to everyone. The - * function first checks to see if the user has specified the [prefix] - * directory on the command line. If so, then the function will use - * that [prefix] if the access permissions are correct, or will return - * an error condition if not - the function will not search for - * alternative locations if the user provides the [prefix] name. - * - * \par If the [prefix] is not provided by the user, the function - * searches for a suitable directory in a specific order, taking the - * first option that meets the access permission requirement, using: - * (a) the "OMPI_PREFIX_ENV" environment variable; (b) the "TMPDIR" - * environment variable; and (c) the "TMP" environment variabley. If - * none of those environmental variables have been defined and/or the - * function was unable to create a suitable directory within any of - * them, then the function tries to use a default location of "/tmp", - * where the "/" represents the top-level directory of the local - * system. If none of these options are successful, the function - * returns an error code. - * - * \par \em [openmpi-sessions]-[user-id]@[host]:[batchid] This serves - * as a concentrator for all PRTE session directories for this - * user on the local system. If it doesn't already exist, this - * directory is created with read-write-execute permissions - * exclusively restricted to the user. If it does exist, the access - * permissions are checked to ensure they are correct - if not, the - * program attempts to correct them. If they can't' be changed to the - * correct values, an error condition is returned. The [host] and - * [batchid] fields are included to provide uniqueness on shared file - * systems and batch schedulers, respectively. - * - * \par Note: The [prefix]/openmpi-sessions-[user-id]@[host]:[batchid] - * directory is left on the system upon termination of an application - * and/or an PRTE universe for future use by the user. Thus, when - * checking a potential location for the directory, the - * prte_session_tree_init() function first checks to see if an - * appropriate directory already exists, and uses it if it does. - * - * \par \em [universe-name] A directory is created for the specified - * universe name. This is the directory that will be used to house all - * information relating to the specific universe. If the directory - * already exists (indicating that the user is joining an existing - * universe), then the function ensures that the user has exclusive - * read-write-execute permissions on the directory. - * - * \par \em [job] A directory is created for the specified job - * name. This will house all information relating to that specific - * job, including directories for each process within that job on this - * host. - * - * \par \em [process] A directory for the specific process, will house - * all information for that process. - * - * \par If \c create is \c true, the directory will be created and the - * proc_info structure will be updated. If proc_info is false, */ #ifndef PRTE_SESSION_DIR_H_HAS_BEEN_INCLUDED @@ -92,60 +29,33 @@ #include "prte_config.h" #include "types.h" +#include "src/runtime/prte_globals.h" BEGIN_C_DECLS -/** @param create A boolean variable that indicates whether or not to - * create the specified directory. If set to "false", - * the function only checks to see if an existing - * directory can be found. This is typically used to - * locate an already existing universe for reconnection - * purposes. If set to "true", then the function - * creates the directory, if possible. - * @param proc Pointer to a process name for which the session - * dir name is desired +/** @param proc Pointer to a process name for which the session + * dir name is desired. Passing: + * + * PRTE_NAME_INVALID - top-level session directory + * will be created. * - * @retval PRTE_SUCCESS The directory was found and/or created with + * PRTE_NAME_WILDCARD - job-level session directory + * will be created + * + * Valid procID - proc-level session directory will + * be created + * + *@retval PRTE_SUCCESS The directory was found and/or created with * the proper permissions. - * @retval OMPI_ERROR The directory cannot be found (if create is - * "false") or created (if create is "true"). + * @retval PRTE_ERROR The directory cannot be found or created */ -PRTE_EXPORT int prte_session_dir(bool create, pmix_proc_t *proc); +PRTE_EXPORT int prte_session_dir(pmix_proc_t *proc); -/* - * Setup session-related directory paths +/** The session_dir_finalize functions perform a cleanup of the + * relevant session directory tree. */ -PRTE_EXPORT int prte_session_setup_base(pmix_proc_t *proc); -PRTE_EXPORT int prte_setup_top_session_dir(void); - -/** The prte_session_dir_finalize() function performs a cleanup of the - * session directory tree. It first removes the session directory for - * the calling process. It then checks to see if the job-level session - * directory is now empty - if so, it removes that level as - * well. Finally, it checks to see if the universe-level session - * directory is now empty - if so, it also removes that level. This - * three-part "last-one-out" procedure ensures that the directory tree - * is properly removed if all processes and applications within a - * universe have completed. - * - * @param None - * @retval PRTE_SUCCESS If the directory tree is properly cleaned up. - * @retval OMPI_ERROR If something prevents the tree from being - * properly cleaned up. - */ -PRTE_EXPORT int prte_session_dir_finalize(pmix_proc_t *proc); - -/** The prte_session_dir_cleanup() function performs a cleanup of the - * session directory tree when a job is aborted. It cleans up all - * process directories for a given job and then backs up the tree. - * - * @param jobid - * @retval OMPI_SUCCESS If the directory tree is properly cleaned up. - * @retval OMPI_ERROR If something prevents the tree from being - * properly cleaned up. - */ -PRTE_EXPORT int prte_session_dir_cleanup(pmix_nspace_t jobid); +PRTE_EXPORT void prte_job_session_dir_finalize(prte_job_t *jdata); END_C_DECLS