Skip to content

Commit

Permalink
btl/uct: correctly set the completion status before using uct complet…
Browse files Browse the repository at this point in the history
…ions

According to UCT documentation the status field must be preset to UCS_OK before
the completion is used. My assumption here is that the field is not filled in in
all cases leaving the value as potentially garbage. This CL addresses the issue
but setting the status in the constructor for both RDMA completions and frags.
The status is then reset on completion callback.

Signed-off-by: Nathan Hjelm <hjelmn@google.com>
  • Loading branch information
hjelmn authored and jsquyres committed Jan 10, 2025
1 parent 0bccfcd commit d586292
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 29 deletions.
29 changes: 2 additions & 27 deletions opal/mca/btl/uct/btl_uct_frag.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/*
* Copyright (c) 2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2025 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -11,29 +12,6 @@

#include "btl_uct_frag.h"

static void mca_btl_uct_frag_completion_compat(uct_completion_t *uct_comp, ucs_status_t status)
{
mca_btl_uct_uct_completion_t *comp = (mca_btl_uct_uct_completion_t
*) ((uintptr_t) uct_comp
- offsetof(mca_btl_uct_uct_completion_t,
uct_comp));

BTL_VERBOSE(("frag operation complete. frag = %p. status = %d", (void *) comp->frag, status));

comp->status = status;
opal_fifo_push(&comp->dev_context->completion_fifo, &comp->super.super);
}

#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
static void mca_btl_uct_frag_completion(uct_completion_t *uct_comp) {
mca_btl_uct_frag_completion_compat(uct_comp, uct_comp->status);
}
#else
static void mca_btl_uct_frag_completion(uct_completion_t *uct_comp, ucs_status_t status) {
mca_btl_uct_frag_completion_compat(uct_comp, status);
}
#endif

static void mca_btl_uct_base_frag_constructor(mca_btl_uct_base_frag_t *frag)
{
mca_btl_uct_reg_t *reg = (mca_btl_uct_reg_t *) frag->base.super.registration;
Expand All @@ -42,14 +20,11 @@ static void mca_btl_uct_base_frag_constructor(mca_btl_uct_base_frag_t *frag)
memset((char *) frag + sizeof(frag->base), 0, sizeof(*frag) - sizeof(frag->base));

OBJ_CONSTRUCT(&frag->comp, mca_btl_uct_uct_completion_t);
frag->comp.frag = frag;

frag->base.des_segments = frag->segments;
frag->base.des_segment_count = 1;

frag->comp.uct_comp.func = mca_btl_uct_frag_completion;
frag->comp.uct_comp.count = 1;
frag->comp.frag = frag;

frag->segments[0].seg_addr.pval = frag->base.super.ptr;
frag->uct_iov.buffer = frag->base.super.ptr;
frag->uct_iov.stride = 0;
Expand Down
16 changes: 14 additions & 2 deletions opal/mca/btl/uct/btl_uct_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/*
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2025 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -18,15 +19,21 @@ static void mca_btl_uct_uct_completion_compat(uct_completion_t *uct_comp, ucs_st
- offsetof(mca_btl_uct_uct_completion_t,
uct_comp));

BTL_VERBOSE(("network operation complete. status = %d", status));
BTL_VERBOSE(("network operation complete. frag = %p, status = %d", (void *) comp->frag, status));

comp->status = status;
opal_fifo_push(&comp->dev_context->completion_fifo, &comp->super.super);
}

#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
static void mca_btl_uct_uct_completion(uct_completion_t *uct_comp) {
mca_btl_uct_uct_completion_compat(uct_comp, uct_comp->status);
ucs_status_t status = uct_comp->status;
/* reset the status now as the completion can not be accessed after calling
* mca_btl_uct_uct_completion_compat (may get returned) */
uct_comp->status = UCS_OK;

mca_btl_uct_uct_completion_compat(uct_comp, status);
/* do not access the completion struture here */
}
#else
static void mca_btl_uct_uct_completion(uct_completion_t *uct_comp, ucs_status_t status) {
Expand All @@ -38,6 +45,11 @@ static void mca_btl_uct_uct_completion_construct(mca_btl_uct_uct_completion_t *c
{
comp->frag = NULL;
comp->uct_comp.func = mca_btl_uct_uct_completion;
comp.uct_comp.count = 1;

#if UCT_API >= ((1L<<UCT_MAJOR_BIT)|(10L << UCT_MINOR_BIT))
comp->uct_comp->status = UCS_OK;
#endif
}

OBJ_CLASS_INSTANCE(mca_btl_uct_uct_completion_t, opal_free_list_item_t,
Expand Down

0 comments on commit d586292

Please sign in to comment.