From 95f7141781fd3187612325366f4c276626217260 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Mon, 13 Jan 2025 23:15:36 -0800 Subject: [PATCH] btl/sm: rewrite of fast box (per-peer receive buffers) While investigating possibly lost btl/sm messages, I realized the code is difficult to follow and that it is not always clear what I was attempting to do. It is not clear whether there is an actual problem, but the rewrite is worth committing. This change does the following: - Separate the fast box metadata out from the fast box receive data. These parts are logically separate, so there is no need to keep adjusting the offset to account for the metadata (the start of the buffer was offset 64, and is now 0). - Use modulo math instead of toggling an extra bit to determine full vs. empty. To keep this fast, the modulo is computed as a bitwise-and with a mask, and the fast box size has been limited to a power of two. This simplifies the math and leaves only one special case to cover (end overflow -- end less than start). - General cleanup of the code to improve readability. Signed-off-by: Nathan Hjelm --- opal/mca/btl/sm/btl_sm_component.c | 11 +- opal/mca/btl/sm/btl_sm_fbox.h | 372 +++++++++++++++-------------- opal/mca/btl/sm/btl_sm_module.c | 4 +- opal/mca/btl/sm/btl_sm_types.h | 38 +-- 4 files changed, 225 insertions(+), 200 deletions(-) diff --git a/opal/mca/btl/sm/btl_sm_component.c b/opal/mca/btl/sm/btl_sm_component.c index 9815adf8d4e..a0ec85e1f40 100644 --- a/opal/mca/btl/sm/btl_sm_component.c +++ b/opal/mca/btl/sm/btl_sm_component.c @@ -21,7 +21,7 @@ * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. * Copyright (c) 2018 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2019-2021 Google, Inc. All rights reserved. + * Copyright (c) 2019-2025 Google, Inc. All rights reserved. * Copyright (c) 2021 Nanook Consulting. All rights reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved. * Copyright (c) 2022 Computer Architecture and VLSI Systems (CARV) @@ -36,6 +36,7 @@ #include "opal/mca/btl/base/btl_base_error.h" #include "opal/mca/threads/mutex.h" +#include "opal/util/bit_ops.h" #include "opal/util/output.h" #include "opal/util/printf.h" @@ -182,7 +183,7 @@ static int mca_btl_sm_component_register(void) mca_btl_sm_component.fbox_size = 4096; (void) mca_base_component_var_register(&mca_btl_sm_component.super.btl_version, "fbox_size", - "Size of per-peer fast transfer buffers (default: 4k)", + "Size of per-peer fast transfer buffers. Must be a power of two (default: 4k)", MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_LOCAL, @@ -324,8 +325,10 @@ mca_btl_sm_component_init(int *num_btls, bool enable_progress_threads, bool enab component->segment_size = (2 << 20); } - component->fbox_size = (component->fbox_size + MCA_BTL_SM_FBOX_ALIGNMENT_MASK) - & ~MCA_BTL_SM_FBOX_ALIGNMENT_MASK; + if (component->fbox_size & (component->fbox_size - 1)) { + BTL_VERBOSE(("fast box size must be a power of two, rounding up to the next power of two.")); + component->fbox_size = opal_next_poweroftwo_inclusive(component->fbox_size); + } if (component->segment_size > (1ul << MCA_BTL_SM_OFFSET_BITS)) { component->segment_size = 2ul << MCA_BTL_SM_OFFSET_BITS; diff --git a/opal/mca/btl/sm/btl_sm_fbox.h b/opal/mca/btl/sm/btl_sm_fbox.h index d05806f1046..b81bb951950 100644 --- a/opal/mca/btl/sm/btl_sm_fbox.h +++ b/opal/mca/btl/sm/btl_sm_fbox.h @@ -4,7 +4,7 @@ * reserved. * Copyright (c) 2018 Triad National Security, LLC. 
All rights * reserved. - * Copyright (c) 2020 Google, LLC. All rights reserveed. + * Copyright (c) 2020-2025 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,11 +17,32 @@ #include "opal/mca/btl/sm/btl_sm_types.h" #include "opal/mca/btl/sm/btl_sm_virtual.h" +#include "opal/util/minmax.h" #define MCA_BTL_SM_POLL_COUNT 31 #define MCA_BTL_SM_FBOX_ALIGNMENT 32 #define MCA_BTL_SM_FBOX_ALIGNMENT_MASK (MCA_BTL_SM_FBOX_ALIGNMENT - 1) +typedef union mca_btl_sm_fbox_hdr_t { + struct { + /* NTH: on 32-bit platforms loading/unloading the header may be completed + * in multiple instructions. To ensure that seq is never loaded before tag + * and the tag is never read before seq put them in the same 32-bits of the + * header. */ + /** message size */ + uint32_t size; + /** message tag */ + uint16_t tag; + /** sequence number */ + uint16_t seq; + } data; + struct { + uint32_t value0; + uint32_t value1; + } data_i32; + uint64_t ival; +} mca_btl_sm_fbox_hdr_t; + /** * An abstraction that represents a connection to a endpoint process. * An instance of mca_ptl_base_endpoint_t is associated w/ each process @@ -31,11 +52,10 @@ static inline void mca_btl_sm_endpoint_setup_fbox_recv(struct mca_btl_base_endpoint_t *endpoint, void *base) { - endpoint->fbox_in.startp = (uint32_t *) base; - endpoint->fbox_in.start = MCA_BTL_SM_FBOX_ALIGNMENT; + endpoint->fbox_in.metadata = (mca_btl_sm_fbox_metadata_t *) base; + endpoint->fbox_in.start = endpoint->fbox_in.metadata->start; endpoint->fbox_in.seq = 0; - opal_atomic_wmb(); - endpoint->fbox_in.buffer = base; + endpoint->fbox_in.buffer = (unsigned char *)(endpoint->fbox_in.metadata + 1); } static inline void mca_btl_sm_endpoint_setup_fbox_send(struct mca_btl_base_endpoint_t *endpoint, @@ -43,52 +63,40 @@ static inline void mca_btl_sm_endpoint_setup_fbox_send(struct mca_btl_base_endpo { void *base = fbox->ptr; - endpoint->fbox_out.start = MCA_BTL_SM_FBOX_ALIGNMENT; - endpoint->fbox_out.end = MCA_BTL_SM_FBOX_ALIGNMENT; - endpoint->fbox_out.startp = (uint32_t *) base; - endpoint->fbox_out.startp[0] = MCA_BTL_SM_FBOX_ALIGNMENT; + endpoint->fbox_out.start = 0; + endpoint->fbox_out.end = 0; endpoint->fbox_out.seq = 0; endpoint->fbox_out.fbox = fbox; + endpoint->fbox_out.metadata = (mca_btl_sm_fbox_metadata_t *) base; + endpoint->fbox_out.metadata->start = 0; + + endpoint->fbox_out.buffer = (unsigned char *)(endpoint->fbox_out.metadata + 1); + /* zero out the first header in the fast box */ - memset((char *) base + MCA_BTL_SM_FBOX_ALIGNMENT, 0, MCA_BTL_SM_FBOX_ALIGNMENT); + ((mca_btl_sm_fbox_hdr_t *)endpoint->fbox_out.buffer)->ival = 0; opal_atomic_wmb(); - endpoint->fbox_out.buffer = base; } -typedef union mca_btl_sm_fbox_hdr_t { - struct { - /* NTH: on 32-bit platforms loading/unloading the header may be completed - * in multiple instructions. To ensure that seq is never loaded before tag - * and the tag is never read before seq put them in the same 32-bits of the - * header. */ - /** message size */ - uint32_t size; - /** message tag */ - uint16_t tag; - /** sequence number */ - uint16_t seq; - } data; - struct { - uint32_t value0; - uint32_t value1; - } data_i32; - uint64_t ival; -} mca_btl_sm_fbox_hdr_t; - #define MCA_BTL_SM_FBOX_HDR(x) ((mca_btl_sm_fbox_hdr_t *) (x)) -#define MCA_BTL_SM_FBOX_OFFSET_MASK 0x7fffffff -#define MCA_BTL_SM_FBOX_HB_MASK 0x80000000 - -/* if the two offsets are equal and the high bit matches the buffer is empty else the buffer is - * full. 
note that start will never be end - 1 so this simplified conditional will always produce - * the correct result */ -#define BUFFER_FREE(s, e, hbm, size) (((s + !hbm) > (e)) ? (s) - (e) : (size - (e))) +static inline unsigned int mca_btl_sm_fbox_out_free(mca_btl_sm_fbox_out_t *fbox_out, unsigned int fbox_size) { + unsigned int fbox_offset_mask = fbox_size - 1; + unsigned int masked_end = fbox_out->end & fbox_offset_mask; + unsigned int free; + + /* start pointer will always trail the end pointer. check for rollover of the end pointer where + * the start pointer has not yet rolled over. */ + if (OPAL_UNLIKELY(fbox_out->end <= fbox_size && fbox_out->start > fbox_size)) { + unsigned int masked_start = fbox_out->start & fbox_offset_mask; + free = masked_start - masked_end; + } else { + free = fbox_size - (fbox_out->end - fbox_out->start); + } -/** macro for checking if the high bit is set */ -#define MCA_BTL_SM_FBOX_OFFSET_HBS(v) (!!((v) &MCA_BTL_SM_FBOX_HB_MASK)) + return opal_min(free, fbox_size - masked_end); +} void mca_btl_sm_poll_handle_frag(mca_btl_sm_hdr_t *hdr, mca_btl_base_endpoint_t *endpoint); @@ -96,105 +104,125 @@ static inline void mca_btl_sm_fbox_set_header(mca_btl_sm_fbox_hdr_t *hdr, uint16 uint16_t seq, uint32_t size) { mca_btl_sm_fbox_hdr_t tmp = {.data = {.tag = tag, .seq = seq, .size = size}}; - /* clear out existing tag/seq */ - hdr->data_i32.value1 = 0; - opal_atomic_wmb(); - hdr->data_i32.value0 = size; opal_atomic_wmb(); - hdr->data_i32.value1 = tmp.data_i32.value1; + hdr->ival = tmp.ival; } static inline mca_btl_sm_fbox_hdr_t mca_btl_sm_fbox_read_header(mca_btl_sm_fbox_hdr_t *hdr) { mca_btl_sm_fbox_hdr_t tmp = {.data_i32 = {.value1 = hdr->data_i32.value1}}; - ; + opal_atomic_rmb(); tmp.data_i32.value0 = hdr->data_i32.value0; return tmp; } -/* attempt to reserve a contiguous segment from the remote ep */ -static inline bool mca_btl_sm_fbox_sendi(mca_btl_base_endpoint_t *ep, unsigned char tag, - void *restrict header, const size_t header_size, - void *restrict payload, const size_t payload_size) -{ +static inline unsigned int mca_btl_sm_fbox_align(unsigned int size) { + return (size + MCA_BTL_SM_FBOX_ALIGNMENT_MASK) & ~MCA_BTL_SM_FBOX_ALIGNMENT_MASK; +} + +static inline unsigned char *mca_btl_sm_fbox_reserve_locked(mca_btl_base_endpoint_t *ep, unsigned int data_size) { const unsigned int fbox_size = mca_btl_sm_component.fbox_size; - size_t size = header_size + payload_size; - unsigned int start, end, buffer_free; - size_t data_size = size; - unsigned char *dst, *data; - bool hbs, hbm; + const unsigned int fbox_offset_mask = fbox_size - 1; + unsigned int buffer_free; + unsigned char *dst; + size_t aligned_entry_size; /* don't try to use the per-peer buffer for messages that will fill up more than 25% of the * buffer */ - if (OPAL_UNLIKELY(NULL == ep->fbox_out.buffer || size > (fbox_size >> 2))) { - return false; + if (OPAL_UNLIKELY(NULL == ep->fbox_out.buffer || data_size > (fbox_size >> 2))) { + return NULL; } - OPAL_THREAD_LOCK(&ep->lock); - - /* the high bit helps determine if the buffer is empty or full */ - hbs = MCA_BTL_SM_FBOX_OFFSET_HBS(ep->fbox_out.end); - hbm = MCA_BTL_SM_FBOX_OFFSET_HBS(ep->fbox_out.start) == hbs; + assert ((fbox_size & fbox_offset_mask) == 0); - /* read current start and end offsets and check for free space */ - start = ep->fbox_out.start & MCA_BTL_SM_FBOX_OFFSET_MASK; - end = ep->fbox_out.end & MCA_BTL_SM_FBOX_OFFSET_MASK; - buffer_free = BUFFER_FREE(start, end, hbm, fbox_size); + buffer_free = mca_btl_sm_fbox_out_free(&ep->fbox_out, 
fbox_size); /* need space for the fragment + the header */ - size = (size + sizeof(mca_btl_sm_fbox_hdr_t) + MCA_BTL_SM_FBOX_ALIGNMENT_MASK) - & ~MCA_BTL_SM_FBOX_ALIGNMENT_MASK; + aligned_entry_size = mca_btl_sm_fbox_align(data_size + sizeof(mca_btl_sm_fbox_hdr_t)); - dst = ep->fbox_out.buffer + end; + dst = ep->fbox_out.buffer + (ep->fbox_out.end & fbox_offset_mask); - if (OPAL_UNLIKELY(buffer_free < size)) { + if (OPAL_UNLIKELY(buffer_free < aligned_entry_size)) { /* check if we need to free up space for this fragment */ BTL_VERBOSE(("not enough room for a fragment of size %u. in use buffer segment: {start: " - "%x, end: %x, high bit matches: %d}", - (unsigned) size, start, end, (int) hbm)); + "%x, end: %x}", + (unsigned) aligned_entry_size, ep->fbox_out.start, ep->fbox_out.end)); /* read the current start pointer from the remote peer and recalculate the available buffer * space */ - start = ep->fbox_out.start = ep->fbox_out.startp[0]; - - /* recalculate how much buffer space is available */ - start &= MCA_BTL_SM_FBOX_OFFSET_MASK; - hbm = MCA_BTL_SM_FBOX_OFFSET_HBS(ep->fbox_out.start) == hbs; - buffer_free = BUFFER_FREE(start, end, hbm, fbox_size); - + ep->fbox_out.start = ep->fbox_out.metadata->start; opal_atomic_rmb(); + buffer_free = mca_btl_sm_fbox_out_free(&ep->fbox_out, fbox_size); + /* if this is the end of the buffer and the fragment doesn't fit then mark the remaining * buffer space to be skipped and check if the fragment can be written at the beginning of * the buffer. */ - if (OPAL_UNLIKELY(buffer_free > 0 && buffer_free < size && start <= end)) { - BTL_VERBOSE(("message will not fit in remaining buffer space. skipping to beginning")); + if (OPAL_UNLIKELY(buffer_free > 0 && buffer_free < aligned_entry_size && + ((ep->fbox_out.end + buffer_free) & fbox_offset_mask) == 0)) { +#if OPAL_ENABLE_DEBUG + unsigned int old_end = ep->fbox_out.end; +#endif + unsigned int remaining = buffer_free; + + BTL_VERBOSE(("space needed for message: %" PRIsize_t ", remaining space in buffer: " + "%u, checking for space at beginning of buffer", + aligned_entry_size, remaining)); + + ep->fbox_out.end += remaining; + buffer_free = mca_btl_sm_fbox_out_free(&ep->fbox_out, fbox_size); + if (OPAL_UNLIKELY(buffer_free < aligned_entry_size)) { + /* not writing the skip token so give this space back */ + ep->fbox_out.end -= remaining; + return NULL; + } + + MCA_BTL_SM_FBOX_HDR(ep->fbox_out.buffer)->ival = 0; + opal_atomic_wmb(); + BTL_VERBOSE(("writing a skip token at offset %u", old_end)); + /* space is available. go ahead and mark remaining space to skip */ mca_btl_sm_fbox_set_header(MCA_BTL_SM_FBOX_HDR(dst), 0xff, ep->fbox_out.seq++, - buffer_free - sizeof(mca_btl_sm_fbox_hdr_t)); - - end = MCA_BTL_SM_FBOX_ALIGNMENT; - /* toggle the high bit */ - hbs = !hbs; - /* toggle the high bit match */ - buffer_free = BUFFER_FREE(start, end, !hbm, fbox_size); - dst = ep->fbox_out.buffer + end; + remaining - sizeof(mca_btl_sm_fbox_hdr_t)); + dst = ep->fbox_out.buffer; } - if (OPAL_UNLIKELY(buffer_free < size)) { - ep->fbox_out.end = (hbs << 31) | end; - opal_atomic_wmb(); - OPAL_THREAD_UNLOCK(&ep->lock); - return false; + if (buffer_free < aligned_entry_size) { + return NULL; } } - BTL_VERBOSE(("writing fragment of size %u to offset %u {start: 0x%x, end: 0x%x (hbs: %d)} of " + BTL_VERBOSE(("writing fragment of size %u {start: 0x%x, end: 0x%x} of " "peer's buffer. 
free = %u", - (unsigned int) size, end, start, end, hbs, buffer_free)); + (unsigned int) aligned_entry_size, ep->fbox_out.start, ep->fbox_out.end, buffer_free)); + + ep->fbox_out.end += aligned_entry_size; + + /* zero out the next entry's header so a stale tag can never be mistaken for a valid fragment */ + if (buffer_free > aligned_entry_size) { + MCA_BTL_SM_FBOX_HDR(ep->fbox_out.buffer + (ep->fbox_out.end & fbox_offset_mask))->ival = 0; + opal_atomic_wmb(); + } + + return dst; +} + +/* attempt to reserve a contiguous segment from the remote ep */ +static inline bool mca_btl_sm_fbox_sendi(mca_btl_base_endpoint_t *ep, unsigned char tag, + void *restrict header, const size_t header_size, + void *restrict payload, const size_t payload_size) +{ + size_t data_size = header_size + payload_size; + + OPAL_THREAD_LOCK(&ep->lock); + unsigned char *dst = mca_btl_sm_fbox_reserve_locked(ep, (unsigned int) data_size); + OPAL_THREAD_UNLOCK(&ep->lock); + if (OPAL_UNLIKELY(NULL == dst)) { + return false; + } - data = dst + sizeof(mca_btl_sm_fbox_hdr_t); + unsigned char *data = dst + sizeof(mca_btl_sm_fbox_hdr_t); memcpy(data, header, header_size); if (payload) { @@ -202,110 +230,94 @@ static inline bool mca_btl_sm_fbox_sendi(mca_btl_base_endpoint_t *ep, unsigned c memcpy(data + header_size, payload, payload_size); } - end += size; + opal_atomic_wmb(); + /* the data is in place. write the complete header to make the fragment visible to the peer */ + mca_btl_sm_fbox_set_header(MCA_BTL_SM_FBOX_HDR(dst), tag, ep->fbox_out.seq++, + (uint32_t) data_size); - if (OPAL_UNLIKELY(fbox_size == end)) { - /* toggle the high bit */ - hbs = !hbs; - /* reset the end pointer to the beginning of the buffer */ - end = MCA_BTL_SM_FBOX_ALIGNMENT; - } else if (buffer_free > size) { - MCA_BTL_SM_FBOX_HDR(ep->fbox_out.buffer + end)->ival = 0; + return true; +} + +static inline bool mca_btl_sm_poll_fbox(mca_btl_base_endpoint_t *ep) +{ + const unsigned int fbox_offset_mask = mca_btl_sm_component.fbox_size - 1; + unsigned int start_offset = ep->fbox_in.start & fbox_offset_mask; + const mca_btl_sm_fbox_hdr_t hdr = mca_btl_sm_fbox_read_header( + MCA_BTL_SM_FBOX_HDR(ep->fbox_in.buffer + start_offset)); + + /* check for a valid tag and sequence number */ + if (0 == hdr.data.tag || hdr.data.seq != ep->fbox_in.seq) { + return false; } - /* write out part of the header now. the tag will be written when the data is available */ - mca_btl_sm_fbox_set_header(MCA_BTL_SM_FBOX_HDR(dst), tag, ep->fbox_out.seq++, data_size); + ++ep->fbox_in.seq; - /* align the buffer */ - ep->fbox_out.end = ((uint32_t) hbs << 31) | end; - opal_atomic_wmb(); - OPAL_THREAD_UNLOCK(&ep->lock); + /* force all prior reads to complete before continuing */ + opal_atomic_rmb(); + + BTL_VERBOSE( + ("got frag from %d with header {.tag = %d, .size = %d, .seq = %u} from offset %u", + ep->peer_smp_rank, hdr.data.tag, hdr.data.size, hdr.data.seq, start_offset)); + + /* the 0xff tag indicates we should skip the rest of the buffer */ + if (OPAL_LIKELY((0xfe & hdr.data.tag) != 0xfe)) { + mca_btl_base_segment_t segment; + const mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger + + hdr.data.tag; + mca_btl_base_receive_descriptor_t desc = {.endpoint = ep, + .des_segments = &segment, + .des_segment_count = 1, + .tag = hdr.data.tag, + .cbdata = reg->cbdata}; + + /* fragment fits entirely in the remaining buffer space. some + * btl users do not handle fragmented data so we can't split + * the fragment without introducing another copy here. this + * limitation has not appeared to cause any performance + * degradation. 
*/ + segment.seg_len = hdr.data.size; + segment.seg_addr.pval = (void *) (ep->fbox_in.buffer + start_offset + sizeof(hdr)); + + /* call the registered callback function */ + reg->cbfunc(&mca_btl_sm.super, &desc); + } else if (OPAL_LIKELY(0xfe == hdr.data.tag)) { + /* process fragment header */ + fifo_value_t *value = (fifo_value_t *) (ep->fbox_in.buffer + start_offset + sizeof(hdr)); + mca_btl_sm_hdr_t *sm_hdr = relative2virtual(*value); + mca_btl_sm_poll_handle_frag(sm_hdr, ep); + } + + ep->fbox_in.start += mca_btl_sm_fbox_align(hdr.data.size + sizeof(hdr)); return true; } -static inline bool mca_btl_sm_check_fboxes(void) +static inline int mca_btl_sm_check_fboxes(void) { - const unsigned int fbox_size = mca_btl_sm_component.fbox_size; - bool processed = false; + int total_processed = 0; for (unsigned int i = 0; i < mca_btl_sm_component.num_fbox_in_endpoints; ++i) { mca_btl_base_endpoint_t *ep = mca_btl_sm_component.fbox_in_endpoints[i]; - unsigned int start = ep->fbox_in.start & MCA_BTL_SM_FBOX_OFFSET_MASK; - /* save the current high bit state */ - bool hbs = MCA_BTL_SM_FBOX_OFFSET_HBS(ep->fbox_in.start); - int poll_count; - - for (poll_count = 0; poll_count <= MCA_BTL_SM_POLL_COUNT; ++poll_count) { - const mca_btl_sm_fbox_hdr_t hdr = mca_btl_sm_fbox_read_header( - MCA_BTL_SM_FBOX_HDR(ep->fbox_in.buffer + start)); - - /* check for a valid tag a sequence number */ - if (0 == hdr.data.tag || hdr.data.seq != ep->fbox_in.seq) { + int frag_count = 0; + for (int j = 0 ; j < MCA_BTL_SM_POLL_COUNT ; ++j) { + if (!mca_btl_sm_poll_fbox(ep)) { break; } - - ++ep->fbox_in.seq; - - /* force all prior reads to complete before continuing */ - opal_atomic_rmb(); - - BTL_VERBOSE( - ("got frag from %d with header {.tag = %d, .size = %d, .seq = %u} from offset %u", - ep->peer_smp_rank, hdr.data.tag, hdr.data.size, hdr.data.seq, start)); - - /* the 0xff tag indicates we should skip the rest of the buffer */ - if (OPAL_LIKELY((0xfe & hdr.data.tag) != 0xfe)) { - mca_btl_base_segment_t segment; - const mca_btl_active_message_callback_t *reg = mca_btl_base_active_message_trigger - + hdr.data.tag; - mca_btl_base_receive_descriptor_t desc = {.endpoint = ep, - .des_segments = &segment, - .des_segment_count = 1, - .tag = hdr.data.tag, - .cbdata = reg->cbdata}; - - /* fragment fits entirely in the remaining buffer space. some - * btl users do not handle fragmented data so we can't split - * the fragment without introducing another copy here. this - * limitation has not appeared to cause any performance - * degradation. 
*/ - segment.seg_len = hdr.data.size; - segment.seg_addr.pval = (void *) (ep->fbox_in.buffer + start + sizeof(hdr)); - - /* call the registered callback function */ - reg->cbfunc(&mca_btl_sm.super, &desc); - } else if (OPAL_LIKELY(0xfe == hdr.data.tag)) { - /* process fragment header */ - fifo_value_t *value = (fifo_value_t *) (ep->fbox_in.buffer + start + sizeof(hdr)); - mca_btl_sm_hdr_t *sm_hdr = relative2virtual(*value); - mca_btl_sm_poll_handle_frag(sm_hdr, ep); - } - - start = (start + hdr.data.size + sizeof(hdr) + MCA_BTL_SM_FBOX_ALIGNMENT_MASK) - & ~MCA_BTL_SM_FBOX_ALIGNMENT_MASK; - if (OPAL_UNLIKELY(fbox_size == start)) { - /* jump to the beginning of the buffer */ - start = MCA_BTL_SM_FBOX_ALIGNMENT; - /* toggle the high bit */ - hbs = !hbs; - } + ++frag_count; } - if (poll_count) { - BTL_VERBOSE(("left off at offset %u (hbs: %d)", start, hbs)); + if (frag_count) { + BTL_VERBOSE(("finished processing at offset %x", ep->fbox_in.start)); - /* save where we left off */ /* let the sender know where we stopped */ opal_atomic_mb(); - ep->fbox_in.start = ep->fbox_in.startp[0] = ((uint32_t) hbs << 31) | start; - processed = true; + ep->fbox_in.metadata->start = ep->fbox_in.start; + total_processed += frag_count; } } - /* return the number of fragments processed */ - return processed; + return total_processed; } static inline void mca_btl_sm_try_fbox_setup(mca_btl_base_endpoint_t *ep, mca_btl_sm_hdr_t *hdr) @@ -326,7 +338,7 @@ static inline void mca_btl_sm_try_fbox_setup(mca_btl_base_endpoint_t *ep, mca_bt mca_btl_sm_endpoint_setup_fbox_send(ep, fbox); hdr->flags |= MCA_BTL_SM_FLAG_SETUP_FBOX; - hdr->fbox_base = virtual2relative((char *) ep->fbox_out.buffer); + hdr->fbox_base = virtual2relative((char *) ep->fbox_out.metadata); } else { opal_atomic_add_fetch_32(&ep->fifo->fbox_available, 1); } diff --git a/opal/mca/btl/sm/btl_sm_module.c b/opal/mca/btl/sm/btl_sm_module.c index c07a82c960b..3cb69c8d3da 100644 --- a/opal/mca/btl/sm/btl_sm_module.c +++ b/opal/mca/btl/sm/btl_sm_module.c @@ -91,8 +91,10 @@ static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl, int n) return OPAL_ERR_OUT_OF_RESOURCE; } + /* Fast box buffers are prepended with a metadata section. */ rc = opal_free_list_init(&component->sm_fboxes, sizeof(opal_free_list_item_t), 8, - OBJ_CLASS(opal_free_list_item_t), mca_btl_sm_component.fbox_size, + OBJ_CLASS(opal_free_list_item_t), mca_btl_sm_component.fbox_size + + sizeof (mca_btl_sm_fbox_metadata_t), opal_cache_line_size, 0, mca_btl_sm_component.fbox_max, 4, component->mpool, 0, NULL, NULL, NULL); if (OPAL_SUCCESS != rc) { diff --git a/opal/mca/btl/sm/btl_sm_types.h b/opal/mca/btl/sm/btl_sm_types.h index 51b14446e20..35953ca7b34 100644 --- a/opal/mca/btl/sm/btl_sm_types.h +++ b/opal/mca/btl/sm/btl_sm_types.h @@ -17,7 +17,7 @@ * Copyright (c) 2015 Mellanox Technologies. All rights reserved. * Copyright (c) 2018 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2020 Google, LLC. All rights reserved. + * Copyright (c) 2020-2025 Google, LLC. All rights reserved. * * Copyright (c) 2019-2020 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ @@ -49,24 +49,32 @@ struct mca_btl_sm_modex_t { typedef struct mca_btl_sm_modex_t mca_btl_sm_modex_t; +typedef struct mca_btl_sm_fbox_metadata { + uint32_t start; /**< start offset published by the receiver */ + uint8_t padding[26]; /**< pad so the receive buffer that follows stays aligned */ +} mca_btl_sm_fbox_metadata_t; + +typedef struct mca_btl_sm_fbox_out { + unsigned char *buffer; /**< send buffer: starting address of the peer's inbound fast box */ + mca_btl_sm_fbox_metadata_t *metadata; /**< shared block where the peer publishes its start offset */ + unsigned int start, end; + uint16_t seq; + opal_free_list_item_t *fbox; /**< fast-box free list item */ +} mca_btl_sm_fbox_out_t; + +typedef struct mca_btl_sm_fbox_in { + unsigned char *buffer; /**< receive buffer: starting address of this process's inbound fast box */ + mca_btl_sm_fbox_metadata_t *metadata; /**< shared block where we publish our start offset */ + unsigned int start; + uint16_t seq; +} mca_btl_sm_fbox_in_t; + typedef struct mca_btl_base_endpoint_t { opal_list_item_t super; /* per peer buffers */ - struct { - unsigned char *buffer; /**< starting address of peer's fast box out */ - uint32_t *startp; - unsigned int start; - uint16_t seq; - } fbox_in; - - struct { - unsigned char *buffer; /**< starting address of peer's fast box in */ - uint32_t *startp; /**< pointer to location storing start offset */ - unsigned int start, end; - uint16_t seq; - opal_free_list_item_t *fbox; /**< fast-box free list item */ - } fbox_out; + mca_btl_sm_fbox_in_t fbox_in; + mca_btl_sm_fbox_out_t fbox_out; uint16_t peer_smp_rank; /**< my peer's SMP process rank. Used for accessing * SMP specific data structures. */
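
Note on the new offset math: the scheme is easier to see in isolation. Below is a minimal, self-contained sketch of the free-space computation, assuming only what the patch establishes: the fast box size is a power of two and the start offset always trails the end offset. The names fbox_out_sketch and fbox_contiguous_free are illustrative stand-ins, not symbols from the patch, and the sketch leans on unsigned modulo-2^32 subtraction rather than the explicit end-overflow branch in mca_btl_sm_fbox_out_free().

    #include <assert.h>
    #include <stdio.h>

    /* illustrative stand-in for the send-side fast box state */
    struct fbox_out_sketch {
        unsigned int start; /* free-running count of bytes consumed by the receiver */
        unsigned int end;   /* free-running count of bytes written by the sender */
    };

    /* contiguous bytes writable at the current end offset. size must be a
     * power of two. unsigned subtraction is modulo 2^32, so end - start is
     * the number of in-flight bytes even after the counters wrap. */
    static unsigned int fbox_contiguous_free(const struct fbox_out_sketch *out,
                                             unsigned int size)
    {
        const unsigned int mask = size - 1;
        unsigned int in_flight = out->end - out->start;
        unsigned int total_free = size - in_flight;
        unsigned int to_buffer_end = size - (out->end & mask);
        return total_free < to_buffer_end ? total_free : to_buffer_end;
    }

    int main(void)
    {
        struct fbox_out_sketch out = {.start = 0, .end = 0};

        assert(fbox_contiguous_free(&out, 4096) == 4096); /* empty */
        out.end = 4064;
        assert(fbox_contiguous_free(&out, 4096) == 32);   /* capped at the wrap point */
        out.start = 4096;
        out.end = 8192;
        assert(fbox_contiguous_free(&out, 4096) == 0);    /* full: one lap of data in flight */
        printf("fast box offset math ok\n");
        return 0;
    }

When the space left at the tail is nonzero but too small for a fragment while the head of the buffer is free, the reservation path does not simply fail: it writes a skip token (tag 0xff) covering the tail so the receiver advances straight to offset 0, then retries the reservation at the beginning of the buffer.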
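
Note on the header publication order: the union layout is load-bearing. tag and seq share the upper 32 bits of the header so that even a split 32-bit load can never observe one without the other, and the barriers in mca_btl_sm_fbox_set_header()/mca_btl_sm_fbox_read_header() order the header word against the payload. A hedged rendering of the same protocol in portable C11 atomics follows; hdr_publish and hdr_snapshot are illustrative names, and the plain loads/stores mirror the OPAL barrier style (a strict C11 version would use relaxed atomic accesses for the shared words).

    #include <stdatomic.h>
    #include <stdint.h>

    /* same layout as mca_btl_sm_fbox_hdr_t */
    typedef union hdr_sketch {
        struct { uint32_t size; uint16_t tag; uint16_t seq; } data;
        struct { uint32_t value0, value1; } i32;
        uint64_t ival;
    } hdr_sketch_t;

    /* writer: the payload (copied in before this call) must be globally
     * visible before the header word that announces the fragment */
    static void hdr_publish(hdr_sketch_t *slot, uint32_t size, uint16_t tag, uint16_t seq)
    {
        hdr_sketch_t tmp = {.data = {.size = size, .tag = tag, .seq = seq}};
        atomic_thread_fence(memory_order_release); /* stands in for opal_atomic_wmb() */
        slot->ival = tmp.ival; /* tag and seq always land in a single 32-bit store */
    }

    /* reader: sample tag+seq first; the fence keeps the size load, and the
     * payload reads that follow it, from being hoisted above the check */
    static hdr_sketch_t hdr_snapshot(const hdr_sketch_t *slot)
    {
        hdr_sketch_t tmp = {.i32 = {.value1 = slot->i32.value1}};
        atomic_thread_fence(memory_order_acquire); /* stands in for opal_atomic_rmb() */
        tmp.i32.value0 = slot->i32.value0;
        return tmp;
    }

A fragment is accepted only when the sampled tag is nonzero and the sequence number matches the expected per-peer value, which is why the reservation path zeroes the header of the next entry before publishing the current one.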