Skip to content

Commit

Permalink
Merge pull request #13018 from hjelmn/wip_uct_improvements
Browse files Browse the repository at this point in the history
btl/uct: reduce number of messages sent when establishing connections
  • Loading branch information
hjelmn authored Feb 4, 2025
2 parents 2514b6e + 41ad9f7 commit a1544c0
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 33 deletions.
7 changes: 6 additions & 1 deletion opal/mca/btl/uct/btl_uct.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2015-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Google, LLC. All rights reserved.
* Copyright (c) 2019-2025 Google, LLC. All rights reserved.
* Copyright (c) 2019 Intel, Inc. All rights reserved.
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
* All Rights reserved.
Expand Down Expand Up @@ -40,6 +40,8 @@
#include "opal/mca/mpool/mpool.h"
#include "opal/mca/pmix/pmix-internal.h"
#include "opal/mca/rcache/base/base.h"
#include "opal/mca/threads/condition.h"
#include "opal/mca/threads/mutex.h"
#include "opal/mca/threads/tsd.h"
#include "opal/util/event.h"
#include <uct/api/uct.h>
Expand Down Expand Up @@ -153,6 +155,9 @@ struct mca_btl_uct_component_t {

/** disable UCX memory hooks */
bool disable_ucx_memory_hooks;

/** connection retry timeout */
unsigned int connection_retry_timeout;
};
typedef struct mca_btl_uct_component_t mca_btl_uct_component_t;

Expand Down
11 changes: 11 additions & 0 deletions opal/mca/btl/uct/btl_uct_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,17 @@ static int mca_btl_uct_component_register(void)
MCA_BASE_VAR_SCOPE_ALL, &mca_btl_uct_component.bind_threads_to_contexts);
#endif

/* timeout between connection message attempts in µs */
mca_btl_uct_component.connection_retry_timeout = 2000;
(void) mca_base_component_var_register(
&mca_btl_uct_component.super.btl_version, "connection_retry_timeout",
"Timeout between attempts to send connection messages for connect-to-"
"endpoint connections. The timeout is measured in µs and is only"
"necessary when using unreliable transports for connections (ex: UD). "
"(default: 2000µs)",
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_uct_component.connection_retry_timeout);

/* for now we want this component to lose to btl/ugni and btl/vader */
module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 1;

Expand Down
82 changes: 50 additions & 32 deletions opal/mca/btl/uct/btl_uct_endpoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
* reserved.
* Copyright (c) 2018 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Google, LLC. All rights reserved.
* Copyright (c) 2019-2025 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -16,6 +16,7 @@
#include "btl_uct.h"
#include "btl_uct_am.h"
#include "btl_uct_device_context.h"
#include "opal/mca/timer/base/base.h"
#include "opal/util/proc.h"

static void mca_btl_uct_endpoint_construct(mca_btl_uct_endpoint_t *endpoint)
Expand Down Expand Up @@ -257,21 +258,17 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl,
return OPAL_SUCCESS;
}

static int mca_btl_uct_endpoint_connect_endpoint(
static int mca_btl_uct_endpoint_send_connection_data(
mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl,
mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint,
uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr)
uint8_t *conn_tl_data, int request_type)
{
size_t request_length = sizeof(mca_btl_uct_conn_req_t)
+ MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len;
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
mca_btl_uct_tl_t *conn_tl = uct_btl->conn_tl;
mca_btl_uct_device_context_t *conn_tl_context = conn_tl->uct_dev_contexts[0];
mca_btl_uct_conn_req_t *request = alloca(request_length);
mca_btl_uct_connection_ep_t *conn_ep = endpoint->conn_ep;
uct_device_addr_t *device_addr = NULL;
uct_iface_addr_t *iface_addr;
ucs_status_t ucs_status;
int rc;

assert(NULL != conn_tl);

Expand Down Expand Up @@ -302,15 +299,50 @@ static int mca_btl_uct_endpoint_connect_endpoint(
ucs_status));
return OPAL_ERROR;
}
} else {
OBJ_RETAIN(conn_ep);
}

size_t request_length = sizeof(mca_btl_uct_conn_req_t)
+ MCA_BTL_UCT_TL_ATTR(tl, tl_context->context_id).ep_addr_len;
mca_btl_uct_conn_req_t *request = alloca(request_length);

/* fill in common request parameters */
request->proc_name = OPAL_PROC_MY_NAME;
request->context_id = tl_context->context_id;
request->tl_index = tl->tl_index;
request->type = !!(ep_addr);
request->type = request_type;

/* fill in connection request */
ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr);
if (UCS_OK != ucs_status) {
/* this is a fatal error */
OBJ_RELEASE(endpoint->conn_ep);
uct_ep_destroy(tl_endpoint->uct_ep);
tl_endpoint->uct_ep = NULL;
return OPAL_ERROR;
}

/* let the remote side know that the connection has been established and
* wait for the message to be sent */
int rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request,
request_length);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
OBJ_RELEASE(endpoint->conn_ep);
uct_ep_destroy(tl_endpoint->uct_ep);
tl_endpoint->uct_ep = NULL;
return OPAL_ERROR;
}

tl_endpoint->last_connection_req = opal_timer_base_get_usec();

return OPAL_SUCCESS;
}

static int mca_btl_uct_endpoint_connect_endpoint(
mca_btl_uct_module_t *uct_btl, mca_btl_base_endpoint_t *endpoint, mca_btl_uct_tl_t *tl,
mca_btl_uct_device_context_t *tl_context, mca_btl_uct_tl_endpoint_t *tl_endpoint,
uint8_t *tl_data, uint8_t *conn_tl_data, void *ep_addr)
{
ucs_status_t ucs_status;

if (NULL == tl_endpoint->uct_ep) {
BTL_VERBOSE(("allocating endpoint for peer %s and sending connection data",
Expand Down Expand Up @@ -338,29 +370,15 @@ static int mca_btl_uct_endpoint_connect_endpoint(
}
}

/* fill in connection request */
ucs_status = uct_ep_get_address(tl_endpoint->uct_ep, (uct_ep_addr_t *) request->ep_addr);
if (UCS_OK != ucs_status) {
/* this is a fatal error */
OBJ_RELEASE(endpoint->conn_ep);
uct_ep_destroy(tl_endpoint->uct_ep);
tl_endpoint->uct_ep = NULL;
return OPAL_ERROR;
}

/* let the remote side know that the connection has been established and
* wait for the message to be sent */
rc = mca_btl_uct_endpoint_send_conn_req(uct_btl, endpoint, conn_tl_context, request,
request_length);
if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
OBJ_RELEASE(endpoint->conn_ep);
uct_ep_destroy(tl_endpoint->uct_ep);
tl_endpoint->uct_ep = NULL;
return OPAL_ERROR;
opal_timer_t now = opal_timer_base_get_usec();
if ((now - tl_endpoint->last_connection_req) < mca_btl_uct_component.connection_retry_timeout && !ep_addr) {
return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS
: OPAL_ERR_OUT_OF_RESOURCE;
}

return (tl_endpoint->flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY) ? OPAL_SUCCESS
: OPAL_ERR_OUT_OF_RESOURCE;
int rc = mca_btl_uct_endpoint_send_connection_data(uct_btl, endpoint, tl, tl_context, tl_endpoint,
conn_tl_data, /*request_type=*/!!ep_addr);
return (OPAL_SUCCESS == rc) ? OPAL_ERR_OUT_OF_RESOURCE : rc;
}

int mca_btl_uct_endpoint_connect(mca_btl_uct_module_t *uct_btl, mca_btl_uct_endpoint_t *endpoint,
Expand Down
6 changes: 6 additions & 0 deletions opal/mca/btl/uct/btl_uct_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/*
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2025 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -14,6 +15,8 @@

# include "opal/mca/btl/btl.h"

#include "opal/mca/timer/base/base.h"

/* forward declarations */
struct mca_btl_uct_module_t;
struct mca_btl_base_endpoint_t;
Expand Down Expand Up @@ -100,6 +103,9 @@ struct mca_btl_uct_tl_endpoint_t {

/** UCT endpoint handle */
uct_ep_h uct_ep;

/** Time of last connection message. */
opal_timer_t last_connection_req;
};

typedef struct mca_btl_uct_tl_endpoint_t mca_btl_uct_tl_endpoint_t;
Expand Down

0 comments on commit a1544c0

Please sign in to comment.