Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

osc/rdma: Fixes when running with btl/tcp #8756

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions ompi/mca/osc/rdma/osc_rdma_active_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,8 @@ int ompi_osc_rdma_start_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "start group size %d", sync->num_peers);

sync->type = OMPI_OSC_RDMA_SYNC_TYPE_PSCW;

if (0 == ompi_group_size (group)) {
/* nothing more to do. this is an empty start epoch */
OPAL_THREAD_UNLOCK(&module->lock);
Expand All @@ -393,8 +395,6 @@ int ompi_osc_rdma_start_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t

opal_atomic_wmb ();

sync->type = OMPI_OSC_RDMA_SYNC_TYPE_PSCW;

/* prevent us from entering a passive-target, fence, or another pscw access epoch until
* the matching complete is called */
sync->epoch_active = true;
Expand Down Expand Up @@ -466,17 +466,19 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
sync->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
sync->epoch_active = false;

/* phase 2 cleanup group */
OBJ_RELEASE(group);

peers = sync->peer_list.peers;
if (NULL == peers) {
/* empty peer list */
OPAL_THREAD_UNLOCK(&(module->lock));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to release the lock after OBJ_RELEASE(group), right before the return statement? Any chance multiple threads would compete to decrement the group's reference count?

Maybe this is totally irrelevant, I do not know this codebase. In that case just ignore my comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that seems reasonable to me. I'll make the change. Thanks

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The user provided group is unnecessary at this point, releasing it as early as possible seems correct.

In fact, the user provided group is only necessary early on while building the peers array. As we refcount each peer proc explicitly, technically we don't even need to keep a pointer to the group, because this leads to double refcounting the peers. From a performance perspective, I would rather keep and refcount only the group than refcounting each individual procs (because the procs are already refcounted by the group). We could save 2*peers atomic operations per epoch.

OBJ_RELEASE(group);
if(MPI_GROUP_EMPTY != group) {
OBJ_RELEASE(group);
}
return OMPI_SUCCESS;
}

/* phase 2 cleanup group */
OBJ_RELEASE(group);

sync->peer_list.peers = NULL;

OPAL_THREAD_UNLOCK(&(module->lock));
Expand Down
38 changes: 18 additions & 20 deletions ompi/mca/osc/rdma/osc_rdma_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
my_peer->state = (uint64_t) (uintptr_t) module->state;

if (module->use_cpu_atomics) {
if (module->use_cpu_atomics || my_peer->rank == my_rank) {
/* all peers are local or it is safe to mix cpu and nic atomics */
my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
} else {
Expand Down Expand Up @@ -581,7 +581,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
int my_rank = ompi_comm_rank (module->comm);
int global_size = ompi_comm_size (module->comm);
ompi_osc_rdma_region_t *state_region;
struct _local_data *temp;
struct _local_data *temp = NULL;
char *data_file;
int page_size = opal_getpagesize();

Expand All @@ -596,7 +596,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s

if (!module->single_node) {
for (int i = 0 ; i < module->btls_in_use ; ++i) {
module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB);
}
}

Expand Down Expand Up @@ -624,13 +624,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
size += OPAL_ALIGN_PAD_AMOUNT(size, page_size);
}

do {
temp = calloc (local_size, sizeof (temp[0]));
if (NULL == temp) {
ret = OMPI_ERR_OUT_OF_RESOURCE;
break;
}
temp = calloc (local_size, sizeof (temp[0]));
if (NULL == temp) {
return OMPI_ERR_OUT_OF_RESOURCE;
}

do {
temp[local_rank].rank = my_rank;
temp[local_rank].size = size;

Expand Down Expand Up @@ -777,7 +776,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
ex_peer = (ompi_osc_rdma_peer_extended_t *) peer;

/* set up peer state */
if (module->use_cpu_atomics) {
if (module->use_cpu_atomics || peer->rank == my_rank) {
/* all peers are local or it is safe to mix cpu and nic atomics */
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE;
peer->state = (osc_rdma_counter_t) peer_state;
Expand All @@ -788,10 +787,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
}
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
if (i > 0) {
peer->state_endpoint = local_leader->state_endpoint;
peer->state_btl_index = local_leader->state_btl_index;
}
peer->state_endpoint = local_leader->data_endpoint; // data_endpoint initialized in ompi_osc_rdma_new_peer();
peer->state_btl_index = local_leader->data_btl_index;
}

if (my_rank == peer_rank) {
Expand All @@ -808,7 +805,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
ompi_osc_module_add_peer (module, peer);

if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) {
if (module->use_cpu_atomics && peer_rank == my_rank) {
if (peer_rank == my_rank) {
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
}
/* nothing more to do */
Expand All @@ -824,15 +821,14 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
ex_peer->size = temp[i].size;
}

if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) {
if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor)) {
/* base is local and cpu atomics are available */
if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
ex_peer->super.base = (uintptr_t) module->segment_base + offset;
} else {
ex_peer->super.base = (uintptr_t) *base;
}

peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
offset += temp[i].size;
} else {
ex_peer->super.base = peer_region->base;
Expand All @@ -841,6 +837,10 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
ex_peer->super.base_handle = (mca_btl_base_registration_handle_t *) peer_region->btl_handle_data;
}
}

if(my_rank == peer_rank) {
peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
}
}
} while (0);

Expand Down Expand Up @@ -914,10 +914,8 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
{
mca_btl_base_selected_module_t *item;
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
int btls_found = 0;

btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
if (NULL == btls_to_use) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names);
return OMPI_ERR_UNREACH;
Expand Down
7 changes: 0 additions & 7 deletions ompi/mca/osc/rdma/osc_rdma_dynamic.c
Original file line number Diff line number Diff line change
Expand Up @@ -428,10 +428,6 @@ static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module,
}
peer->regions = temp;

/* lock the region */
ompi_osc_rdma_lock_acquire_shared (module, &peer->super, 1, offsetof (ompi_osc_rdma_state_t, regions_lock),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you say a word why you removed the lock? AFAICS, the owner of the region takes an exclusive lock in ompi_osc_rdma_attach and the shared lock here is needed to ensure the consistency of the region information.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm seeing a hang in this code path. From what I can see, because this process already has an exclusive lock on this peer, it can't acquire the shared lock (it thinks someone else owns it), and will loop forever trying to acquire this lock. To me this looks like a double-lock scenario. @hjelmn can probably say if that is intended or not; perhaps there is a different fix needed for this case.

OMPI_OSC_RDMA_LOCK_EXCLUSIVE);

source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, regions);
ret = ompi_osc_get_data_blocking (module, peer->super.state_btl_index, peer->super.state_endpoint,
source_address, peer->super.state_handle, peer->regions, region_len);
Expand All @@ -440,9 +436,6 @@ static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module,
return ret;
}

/* release the region lock */
ompi_osc_rdma_lock_release_shared (module, &peer->super, -1, offsetof (ompi_osc_rdma_state_t, regions_lock));

/* update cached region ids */
peer->region_id = region_id;
peer->region_count = region_count;
Expand Down
14 changes: 9 additions & 5 deletions ompi/mca/osc/rdma/osc_rdma_lock.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,12 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t

if (OPAL_SUCCESS != ret) {
if (OPAL_LIKELY(1 == ret)) {
*result = ((int64_t *) pending_op->op_buffer)[0];
ret = OMPI_SUCCESS;
ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer,
pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS);
if(false == pending_op->op_complete) {
*result = ((int64_t *) pending_op->op_buffer)[0];
ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer,
pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS);
}
} else {
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
Expand Down Expand Up @@ -161,8 +163,10 @@ static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
if (OPAL_LIKELY(1 == ret)) {
if (cbfunc) {
cbfunc (cbdata, cbcontext, OMPI_SUCCESS);
if(false == pending_op->op_complete) {
if (cbfunc) {
cbfunc (cbdata, cbcontext, OMPI_SUCCESS);
}
}
ret = OMPI_SUCCESS;
}
Expand Down
1 change: 1 addition & 0 deletions ompi/mca/osc/rdma/osc_rdma_peer.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul
}
}

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "WARNING - couldn't find a connection");
/* unlikely but can happen when creating a peer for self */
return OMPI_ERR_UNREACH;
}
Expand Down
10 changes: 5 additions & 5 deletions opal/mca/btl/base/btl_base_am_rdma.c
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,7 @@ static int mca_btl_base_am_rdma_respond(mca_btl_base_module_t *btl,
send_descriptor->des_cbfunc = NULL;

int ret = btl->btl_send(btl, endpoint, send_descriptor, mca_btl_base_rdma_resp_tag());
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
if (OPAL_UNLIKELY(ret < 0)) {
*descriptor = send_descriptor;
}
return ret;
Expand All @@ -635,7 +635,7 @@ mca_btl_base_am_rmda_rdma_complete(mca_btl_base_module_t *btl,
operation->is_completed = true;
int ret = mca_btl_base_am_rdma_respond(operation->btl, operation->endpoint,
&operation->descriptor, NULL, &operation->hdr);
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
if (OPAL_UNLIKELY(ret < 0)) {
BTL_VERBOSE(
("could not send a response. queueing the response for later. endpoint=%p, ret=%d",
endpoint, ret));
Expand Down Expand Up @@ -781,7 +781,7 @@ static int mca_btl_base_am_rdma_progress(void)
int ret = descriptor->btl->btl_send(descriptor->btl, descriptor->endpoint,
descriptor->descriptor,
mca_btl_base_rdma_tag(context->type));
if (OPAL_SUCCESS == ret) {
if (ret <= 1) {
opal_list_remove_item(&default_module.queued_initiator_descriptors,
&descriptor->super);
}
Expand Down Expand Up @@ -932,7 +932,7 @@ static void mca_btl_base_am_process_rdma(mca_btl_base_module_t *btl,
abort();
}

if (OPAL_SUCCESS != ret) {
if (ret < 0) {
mca_btl_base_rdma_queue_operation(btl, desc->endpoint, descriptor, 0, hdr, operation);
}
}
Expand Down Expand Up @@ -995,7 +995,7 @@ static void mca_btl_base_am_process_atomic(mca_btl_base_module_t *btl,

mca_btl_base_descriptor_t *descriptor = NULL;
int ret = mca_btl_base_am_rdma_respond(btl, desc->endpoint, &descriptor, &atomic_response, hdr);
if (OPAL_SUCCESS != ret) {
if (ret < 0) {
mca_btl_base_rdma_queue_operation(btl, desc->endpoint, descriptor, atomic_response, hdr,
NULL);
}
Expand Down
Loading