From 6d360096aaad98a2c134824dc3c4983b9d0a1d43 Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Mon, 6 Jan 2025 14:33:18 -0700 Subject: [PATCH 1/2] btl/ofi: fixes for multi mpi init/fini scenarios The OFI BTL was caching a context - in the case of normal OFI EPs - which needed to be reset across multiple session init/finalize cycles; the stale cached context was resulting in segfaults. Also, there were some symbols being exported which should not have been. Related to #13019 Signed-off-by: Howard Pritchard (cherry picked from commit a03199c69973ab92e90f360a4e6b7010c4c58534) --- opal/mca/btl/ofi/btl_ofi_context.c | 7 +++++-- opal/mca/btl/ofi/btl_ofi_endpoint.h | 8 ++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/opal/mca/btl/ofi/btl_ofi_context.c b/opal/mca/btl/ofi/btl_ofi_context.c index 8d387c5fa0e..2b9a5fb6905 100644 --- a/opal/mca/btl/ofi/btl_ofi_context.c +++ b/opal/mca/btl/ofi/btl_ofi_context.c @@ -2,6 +2,8 @@ /* * $COPYRIGHT$ * Copyright (c) 2018 Intel Inc. All rights reserved + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -14,10 +16,10 @@ #include "btl_ofi_rdma.h" #if OPAL_HAVE_THREAD_LOCAL -opal_thread_local mca_btl_ofi_context_t *my_context = NULL; +static opal_thread_local mca_btl_ofi_context_t *my_context = NULL; #endif /* OPAL_HAVE_THREAD_LOCAL */ -int init_context_freelists(mca_btl_ofi_context_t *context) +static int init_context_freelists(mca_btl_ofi_context_t *context) { int rc; OBJ_CONSTRUCT(&context->rdma_comp_list, opal_free_list_t); @@ -113,6 +115,7 @@ mca_btl_ofi_context_t *mca_btl_ofi_context_alloc_normal(struct fi_info *info, context->tx_ctx = ep; context->rx_ctx = ep; context->context_id = 0; + my_context = NULL; return context; diff --git a/opal/mca/btl/ofi/btl_ofi_endpoint.h b/opal/mca/btl/ofi/btl_ofi_endpoint.h index e4bd17eb264..f6b420273af 100644 --- a/opal/mca/btl/ofi/btl_ofi_endpoint.h +++ b/opal/mca/btl/ofi/btl_ofi_endpoint.h @@ -15,6 +15,8 @@ * Copyright (c) 2018 Intel, Inc, All rights reserved * Copyright (c) 2020 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,10 +34,6 @@ BEGIN_C_DECLS -#if OPAL_HAVE_THREAD_LOCAL -extern opal_thread_local mca_btl_ofi_context_t *my_context; -#endif /* OPAL_HAVE_THREAD_LOCAL */ - struct mca_btl_base_endpoint_t { opal_list_item_t super; @@ -53,8 +51,6 @@ typedef struct mca_btl_base_endpoint_t mca_btl_base_endpoint_t; typedef mca_btl_base_endpoint_t mca_btl_ofi_endpoint_t; OBJ_CLASS_DECLARATION(mca_btl_ofi_endpoint_t); -int init_context_freelists(mca_btl_ofi_context_t *context); - mca_btl_base_endpoint_t *mca_btl_ofi_endpoint_create(opal_proc_t *proc, struct fid_ep *ep); /* contexts */ From ed9ab731bdedfe6d94640d32d36ebc5c8a0ea98f Mon Sep 17 00:00:00 2001 From: Howard Pritchard Date: Mon, 6 Jan 2025 16:37:00 -0700 Subject: [PATCH 2/2] patcher: make it work better with sessions model Turns out that when Open MPI is configured with --enable-mca-dso and is using the OFI MTL/BTL/common, a problem is brought out with the patcher framework the second time through closing the bml and hence btl frameworks. See issue #13021. This patch fixes this problem. Signed-off-by: Howard Pritchard (cherry picked from commit 860bbd6a1141c0a8a7f184db8d7a7333c8b6f271) --- opal/mca/memory/patcher/memory_patcher_component.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/opal/mca/memory/patcher/memory_patcher_component.c b/opal/mca/memory/patcher/memory_patcher_component.c index f1321ba1ab8..02ddbc7a6e5 100644 --- a/opal/mca/memory/patcher/memory_patcher_component.c +++ b/opal/mca/memory/patcher/memory_patcher_component.c @@ -16,6 +16,7 @@ * Copyright (c) 2016-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2020 IBM Corporation. All rights reserved. + * Copyright (c) 2025 Triad National Security, LLC. All rights reserved. 
* * $COPYRIGHT$ * @@ -73,6 +74,7 @@ static int patcher_register(void); static int patcher_query(int *); static int mca_memory_patcher_priority; +static int was_executed_already = 0; opal_memory_patcher_component_t mca_memory_patcher_component = { .super = @@ -585,7 +587,6 @@ static int patcher_query(int *priority) static int patcher_open(void) { - static int was_executed_already = 0; int rc; if (was_executed_already) { @@ -678,6 +679,8 @@ static int patcher_close(void) { mca_base_framework_close(&opal_patcher_base_framework); + was_executed_already = 0; + /* Note that we don't need to unpatch any symbols here; the patcher framework will take care of all of that for us. */ return OPAL_SUCCESS;