Parallel rank0 deadlock fixes (HDFGroup#1183)
* Fix several places where rank 0 can skip past collective MPI operations on failure

* Committing clang-format changes

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
jhendersonHDF and github-actions[bot] committed Mar 25, 2022
1 parent 3742a45 commit f6efaf8
Showing 8 changed files with 164 additions and 67 deletions.
12 changes: 12 additions & 0 deletions release_docs/RELEASE.txt
@@ -262,6 +262,18 @@ Bug Fixes since HDF5-1.12.1 release

(JTH - 2021/11/16, HDFFV-10501/HDFFV-10562)

+ - Fixed several potential MPI deadlocks in library failure conditions

+ In the parallel library, there were several places where MPI rank 0
+ could end up skipping past collective MPI operations when a failure
+ occurs in rank 0-specific processing. This would lead to deadlocks
+ where rank 0 completes an operation while other ranks wait in the
+ collective operation. These places have been rewritten to have rank 0
+ push an error and try to clean up after the failure, then continue to
+ participate in the collective operation to the best of its ability.

+ (JTH - 2021/11/09)

- Fixed an issue with collective metadata reads being permanently disabled
after a dataset chunk lookup operation. This would usually cause a
mismatched MPI_Bcast and MPI_ERR_TRUNCATE issue in the library for
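The fix pattern described in the release note above is easiest to see outside the library. Below is a minimal, self-contained sketch (not HDF5 code; rank0_work and the sentinel value are illustrative stand-ins) of why an early error exit on rank 0 hangs the other ranks in a collective call, and how recording the error and still participating avoids the deadlock:

#include <mpi.h>
#include <stdio.h>

/* Stand-in for some rank 0-specific work that can fail */
static int rank0_work(int *value)
{
    *value = 42;
    return -1; /* pretend the work failed */
}

int main(int argc, char **argv)
{
    int rank, value = 0, failed = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        if (rank0_work(&value) < 0) {
            /* Record the failure but fall through to the collective call.
             * Returning early here (the old behavior) would leave every
             * other rank blocked in the MPI_Bcast below.
             */
            failed = 1;
            value  = -1; /* sentinel the other ranks can recognize */
        }
    }

    /* Every rank participates, whether or not rank 0 succeeded */
    MPI_Bcast(&value, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (value < 0)
        fprintf(stderr, "rank %d: rank 0 reported a failure\n", rank);

    MPI_Finalize();
    return failed;
}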
11 changes: 8 additions & 3 deletions src/H5AC.c
@@ -1666,9 +1666,14 @@ H5AC_unprotect(H5F_t *f, const H5AC_class_t *type, haddr_t addr, void *thing, un
if (H5AC__log_dirtied_entry((H5AC_info_t *)thing) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "can't log dirtied entry")

- if (deleted && aux_ptr->mpi_rank == 0)
- if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
+ if (deleted && aux_ptr->mpi_rank == 0) {
+ if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0) {
+ /* If we fail to log the deleted entry, push an error but still
+ * participate in a possible sync point ahead
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
+ }
+ }
} /* end if */
#endif /* H5_HAVE_PARALLEL */

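The H5AC.c hunk above is the first of several that swap HGOTO_ERROR for HDONE_ERROR on the rank 0 path. Roughly speaking, HGOTO_ERROR pushes an error and jumps to the function's cleanup label, while HDONE_ERROR pushes the error but lets execution continue. The sketch below uses simplified stand-ins (not the real macros from H5Eprivate.h, which also push onto the HDF5 error stack) to show why that control-flow difference matters ahead of a collective operation:

#include <mpi.h>
#include <stdio.h>

/* Simplified stand-ins for the two HDF5 error macros; only the
 * control flow is sketched here. */
#define SKETCH_HGOTO_ERROR(msg)   /* record the error, jump to cleanup */ \
    do {                                                                  \
        fprintf(stderr, "error: %s\n", msg);                              \
        ret_value = -1;                                                   \
        goto done;                                                        \
    } while (0)

#define SKETCH_HDONE_ERROR(msg)   /* record the error, keep executing */  \
    do {                                                                  \
        fprintf(stderr, "error: %s\n", msg);                              \
        ret_value = -1;                                                   \
    } while (0)

/* Rank 0 does some rank 0-only work, then ALL ranks must reach the
 * barrier that stands in for the cache's sync point. */
static int sync_point(MPI_Comm comm, int rank)
{
    int ret_value = 0;

    if (rank == 0) {
        int rank0_step_failed = 1; /* pretend the rank 0-only step failed */

        if (rank0_step_failed)
            /* Using SKETCH_HGOTO_ERROR here would skip the barrier and
             * hang ranks 1..N-1; recording the error and continuing does not. */
            SKETCH_HDONE_ERROR("rank 0-only step failed");
    }

    /* collective part of the sync point -- every rank must call this */
    if (MPI_SUCCESS != MPI_Barrier(comm))
        SKETCH_HGOTO_ERROR("MPI_Barrier failed"); /* safe: all ranks make this call */

done:
    return ret_value;
}

int main(int argc, char **argv)
{
    int rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (sync_point(MPI_COMM_WORLD, rank) < 0)
        fprintf(stderr, "rank %d: sync point reported a failure\n", rank);

    MPI_Finalize();
    return 0;
}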
97 changes: 61 additions & 36 deletions src/H5ACmpio.c
@@ -305,8 +305,10 @@ H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, unsigned *num_entries_ptr, had
* are used to receiving from process 0, and also load it
* into a buffer for transmission.
*/
- if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
+ if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
+ }
HDassert(chk_num_entries == num_entries);
HDassert(haddr_buf_ptr != NULL);

@@ -429,18 +431,23 @@ H5AC__broadcast_clean_list(H5AC_t *cache_ptr)

/* allocate a buffer to store the list of entry base addresses in */
buf_size = sizeof(haddr_t) * num_entries;
- if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")

- /* Set up user data for callback */
- udata.aux_ptr = aux_ptr;
- udata.addr_buf_ptr = addr_buf_ptr;
- udata.u = 0;

- /* Free all the clean list entries, building the address list in the callback */
- /* (Callback also removes the matching entries from the dirtied list) */
- if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
+ if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")
+ }
+ else {
+ /* Set up user data for callback */
+ udata.aux_ptr = aux_ptr;
+ udata.addr_buf_ptr = addr_buf_ptr;
+ udata.u = 0;

+ /* Free all the clean list entries, building the address list in the callback */
+ /* (Callback also removes the matching entries from the dirtied list) */
+ if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
+ }
+ }

/* Now broadcast the list of cleaned entries */
if (MPI_SUCCESS !=
@@ -1449,8 +1456,10 @@ H5AC__receive_haddr_list(MPI_Comm mpi_comm, unsigned *num_entries_ptr, haddr_t *

/* allocate buffers to store the list of entry base addresses in */
buf_size = sizeof(haddr_t) * num_entries;
- if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
- HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
+ if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
+ }

/* Now receive the list of candidate entries */
if (MPI_SUCCESS !=
@@ -1801,10 +1810,14 @@ H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f)

if (evictions_enabled) {
/* construct candidate list -- process 0 only */
- if (aux_ptr->mpi_rank == 0)
+ if (aux_ptr->mpi_rank == 0) {
+ /* If constructing candidate list fails, push an error but still participate
+ * in collective operations during following candidate list propagation
+ */
if (H5AC__construct_candidate_list(cache_ptr, aux_ptr, H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) <
0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
+ }

/* propagate and apply candidate list -- all processes */
if (H5AC__propagate_and_apply_candidate_list(f) < 0)
@@ -1900,15 +1913,21 @@ H5AC__rsp__p0_only__flush(H5F_t *f)
aux_ptr->write_permitted = FALSE;

/* Check for error on the write operation */
- if (result < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")

- /* this code exists primarily for the test bed -- it allows us to
- * enforce POSIX semantics on the server that pretends to be a
- * file system in our parallel tests.
- */
- if (aux_ptr->write_done)
- (aux_ptr->write_done)();
+ if (result < 0) {
+ /* If write operation fails, push an error but still participate
+ * in collective operations during following cache entry
+ * propagation
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
+ }
+ else {
+ /* this code exists primarily for the test bed -- it allows us to
+ * enforce POSIX semantics on the server that pretends to be a
+ * file system in our parallel tests.
+ */
+ if (aux_ptr->write_done)
+ (aux_ptr->write_done)();
+ }
} /* end if */

/* Propagate cleaned entries to other ranks. */
@@ -2020,15 +2039,21 @@ H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f)
aux_ptr->write_permitted = FALSE;

/* Check for error on the write operation */
- if (result < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")

- /* this call exists primarily for the test code -- it is used
- * to enforce POSIX semantics on the process used to simulate
- * reads and writes in t_cache.c.
- */
- if (aux_ptr->write_done)
- (aux_ptr->write_done)();
+ if (result < 0) {
+ /* If write operation fails, push an error but still participate
+ * in collective operations during following cache entry
+ * propagation
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
+ }
+ else {
+ /* this call exists primarily for the test code -- it is used
+ * to enforce POSIX semantics on the process used to simulate
+ * reads and writes in t_cache.c.
+ */
+ if (aux_ptr->write_done)
+ (aux_ptr->write_done)();
+ }
} /* end if */

if (H5AC__propagate_flushed_and_still_clean_entries_list(f) < 0)
42 changes: 35 additions & 7 deletions src/H5C.c
@@ -2319,9 +2319,14 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
H5MM_memcpy(((uint8_t *)entry_ptr->image_ptr) + entry_ptr->size, H5C_IMAGE_SANITY_VALUE,
H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
- if (0 == mpi_rank)
- if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
+ if (0 == mpi_rank) {
+ if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0) {
+ /* If image generation fails, push an error but
+ * still participate in the following MPI_Bcast
+ */
+ HDONE_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
+ }
+ }
} /* end if */
HDassert(entry_ptr->image_ptr);

@@ -7202,8 +7207,20 @@ H5C__load_entry(H5F_t *f,
#ifdef H5_HAVE_PARALLEL
if (!coll_access || 0 == mpi_rank) {
#endif /* H5_HAVE_PARALLEL */
- if (H5F_block_read(f, type->mem_type, addr, len, image) < 0)
- HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")

+ if (H5F_block_read(f, type->mem_type, addr, len, image) < 0) {

+ #ifdef H5_HAVE_PARALLEL
+ if (coll_access) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDmemset(image, 0, len);
+ HDONE_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+ }
+ else
+ #endif
+ HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
+ }

#ifdef H5_HAVE_PARALLEL
} /* end if */
/* if the collective metadata read optimization is turned on,
@@ -7250,8 +7267,19 @@ H5C__load_entry(H5F_t *f,
* loaded thing, go get the on-disk image again (the extra portion).
*/
if (H5F_block_read(f, type->mem_type, addr + len, actual_len - len, image + len) <
- 0)
- HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ 0) {

+ #ifdef H5_HAVE_PARALLEL
+ if (coll_access) {
+ /* Push an error, but still participate in following MPI_Bcast */
+ HDmemset(image + len, 0, actual_len - len);
+ HDONE_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ }
+ else
+ #endif
+ HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
+ }

#ifdef H5_HAVE_PARALLEL
}
/* If the collective metadata read optimization is turned on,
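The H5C__load_entry hunks above pair the HDONE_ERROR with an HDmemset of the image buffer, so that when the rank 0 read fails the subsequent broadcast still sends well-defined bytes. A minimal sketch of that shape follows (the file handling and the extra status broadcast are illustrative, not the HDF5 I/O layer):

#include <mpi.h>
#include <stdio.h>
#include <string.h>

#define IMAGE_LEN 4096

/* Stand-in for the rank 0-only block read; returns -1 on failure */
static int read_image_block(unsigned char *buf, size_t len)
{
    FILE *fp = fopen("metadata_image.bin", "rb"); /* illustrative file name */

    if (fp == NULL)
        return -1;
    if (fread(buf, 1, len, fp) != len) {
        fclose(fp);
        return -1;
    }
    fclose(fp);
    return 0;
}

int main(int argc, char **argv)
{
    unsigned char image[IMAGE_LEN];
    int           rank, read_failed = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        if (read_image_block(image, IMAGE_LEN) < 0) {
            /* Record the error locally, zero the buffer so the broadcast
             * still carries defined bytes, and fall through -- bailing out
             * here would hang the other ranks in the MPI_Bcast calls below. */
            memset(image, 0, IMAGE_LEN);
            read_failed = 1;
        }
    }

    /* Collective: all ranks must make these calls, success or not */
    MPI_Bcast(image, IMAGE_LEN, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
    MPI_Bcast(&read_failed, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (read_failed)
        fprintf(stderr, "rank %d: metadata image read failed on rank 0\n", rank);

    MPI_Finalize();
    return read_failed;
}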
8 changes: 2 additions & 6 deletions src/H5CX.c
@@ -1392,9 +1392,7 @@ H5CX_set_apl(hid_t *acspl_id, const H5P_libclass_t *libclass,

/* If parallel is enabled and the file driver used is the MPI-IO
* VFD, issue an MPI barrier for easier debugging if the API function
- * calling this is supposed to be called collectively. Note that this
- * happens only when the environment variable H5_COLL_BARRIER is set
- * to non 0.
+ * calling this is supposed to be called collectively.
*/
if (H5_coll_api_sanity_check_g) {
MPI_Comm mpi_comm; /* File communicator */
@@ -1451,9 +1449,7 @@ H5CX_set_loc(hid_t

/* If parallel is enabled and the file driver used is the MPI-IO
* VFD, issue an MPI barrier for easier debugging if the API function
- * calling this is supposed to be called collectively. Note that this
- * happens only when the environment variable H5_COLL_BARRIER is set
- * to non 0.
+ * calling this is supposed to be called collectively.
*/
if (H5_coll_api_sanity_check_g) {
MPI_Comm mpi_comm; /* File communicator */
3 changes: 3 additions & 0 deletions src/H5Cimage.c
@@ -997,6 +997,9 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr)
#endif /* H5_HAVE_PARALLEL */

/* Read the buffer (if serial access, or rank 0 of parallel access) */
+ /* NOTE: if this block read is being performed on rank 0 only, throwing
+ * an error here will cause other ranks to hang in the following MPI_Bcast.
+ */
if (H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len,
cache_ptr->image_buffer) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block")
13 changes: 10 additions & 3 deletions src/H5Dcontig.c
@@ -268,9 +268,16 @@ H5D__contig_fill(const H5D_io_info_t *io_info)
if (using_mpi) {
/* Write the chunks out from only one process */
/* !! Use the internal "independent" DXPL!! -QAK */
- if (H5_PAR_META_WRITE == mpi_rank)
- if (H5D__contig_write_one(&ioinfo, offset, size) < 0)
- HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
+ if (H5_PAR_META_WRITE == mpi_rank) {
+ if (H5D__contig_write_one(&ioinfo, offset, size) < 0) {
+ /* If writing fails, push an error and stop writing, but
+ * still participate in following MPI_Barrier.
+ */
+ blocks_written = TRUE;
+ HDONE_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
+ break;
+ }
+ }

/* Indicate that blocks are being written */
blocks_written = TRUE;
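The H5Dcontig.c hunk above works slightly differently: the failing rank stops writing but sets blocks_written first, so it still takes the MPI_Barrier that the other ranks, which always set the flag, will be waiting in. A rough standalone sketch of that arrangement, with illustrative names:

#include <mpi.h>
#include <stdio.h>
#include <stdbool.h>

#define N_BLOCKS    8
#define WRITER_RANK 0 /* stand-in for H5_PAR_META_WRITE */

/* Stand-in for the rank 0-only fill-value write; fails on block 3 here */
static int write_fill_block(int block)
{
    return (block == 3) ? -1 : 0;
}

int main(int argc, char **argv)
{
    int  rank, block;
    bool blocks_written = false;
    int  write_failed   = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    for (block = 0; block < N_BLOCKS; block++) {
        if (rank == WRITER_RANK) {
            if (write_fill_block(block) < 0) {
                /* Record the error and stop writing, but make sure this rank
                 * still takes the barrier below -- otherwise the other ranks,
                 * which all set blocks_written, would wait forever. */
                blocks_written = true;
                write_failed   = 1;
                break;
            }
        }

        /* every rank flags that blocks are being written */
        blocks_written = true;
    }

    /* Collective sync, taken only when blocks were written (all ranks agree) */
    if (blocks_written)
        MPI_Barrier(MPI_COMM_WORLD);

    if (write_failed)
        fprintf(stderr, "rank %d: fill value write failed\n", rank);

    MPI_Finalize();
    return write_failed;
}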
45 changes: 33 additions & 12 deletions src/H5FDmpio.c
@@ -860,14 +860,19 @@ H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR
file->mpi_size = mpi_size;

/* Only processor p0 will get the filesize and broadcast it. */
- if (mpi_rank == 0)
+ if (mpi_rank == 0) {
+ /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(fh, &file_size)))
- HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
+ file_size = (MPI_Offset)-1;
+ }

/* Broadcast file size */
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&file_size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm)))
HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)

+ if (file_size < 0)
+ HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)

/* Determine if the file should be truncated */
if (file_size && (flags & H5F_ACC_TRUNC)) {
/* Truncate the file */
@@ -1264,10 +1269,14 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
rank0_bcast = TRUE;

/* Read on rank 0 Bcast to other ranks */
- if (file->mpi_rank == 0)
+ if (file->mpi_rank == 0) {
+ /* If MPI_File_read_at fails, push an error, but continue
+ * to participate in following MPI_Bcast */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
+ }

if (MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
} /* end if */
@@ -1311,11 +1320,21 @@ H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNU
if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
/* How many bytes were actually read? */
#if MPI_VERSION >= 3
- if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) {
#else
- if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read)))
+ if (MPI_SUCCESS != (mpi_code = MPI_Get_elements(&mpi_stat, MPI_BYTE, &bytes_read))) {
#endif
- HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ if (rank0_bcast && file->mpi_rank == 0) {
+ /* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy,
+ * push an error, but continue to participate in the following
+ * MPI_Bcast.
+ */
+ bytes_read = -1;
+ HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ }
+ else
+ HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
+ }
} /* end if */

/* If the rank0-bcast feature was used, broadcast the # of bytes read to
@@ -1695,17 +1714,19 @@ H5FD__mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5_ATTR
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)

/* Only processor p0 will get the filesize and broadcast it. */
- /* (Note that throwing an error here will cause non-rank 0 processes
- * to hang in following Bcast. -QAK, 3/17/2018)
- */
- if (0 == file->mpi_rank)
+ if (0 == file->mpi_rank) {
+ /* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(file->f, &size)))
- HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)
+ size = (MPI_Offset)-1;
+ }

/* Broadcast file size */
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)

+ if (size < 0)
+ HMPI_GOTO_ERROR(FAIL, "MPI_File_get_size failed", mpi_code)

if (H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset")

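The H5FD__mpio_open and H5FD__mpio_truncate hunks above use a third variant: rank 0 turns its local MPI_File_get_size failure into a sentinel value (-1) that is broadcast as usual, and every rank then raises the error after the collective call. A condensed sketch of that pattern, with an illustrative file name and wrapper function:

#include <mpi.h>
#include <stdio.h>

/* Rank 0 queries the size and broadcasts it; a negative value is the agreed
 * sentinel meaning "rank 0 failed", so every rank can raise the error and no
 * rank is left waiting in the broadcast. */
static int get_size_collectively(MPI_File fh, MPI_Comm comm, int rank, MPI_Offset *size_out)
{
    MPI_Offset size = 0;

    if (rank == 0) {
        if (MPI_SUCCESS != MPI_File_get_size(fh, &size))
            size = (MPI_Offset)-1; /* signal the failure instead of bailing out */
    }

    /* collective: every rank must get here */
    if (MPI_SUCCESS != MPI_Bcast(&size, 1, MPI_OFFSET, 0, comm))
        return -1;

    if (size < 0) /* rank 0's MPI_File_get_size failed */
        return -1;

    *size_out = size;
    return 0;
}

int main(int argc, char **argv)
{
    MPI_File   fh;
    MPI_Offset size;
    int        rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (MPI_SUCCESS != MPI_File_open(MPI_COMM_WORLD, "example.h5", MPI_MODE_RDONLY,
                                     MPI_INFO_NULL, &fh))
        MPI_Abort(MPI_COMM_WORLD, 1);

    if (get_size_collectively(fh, MPI_COMM_WORLD, rank, &size) < 0)
        fprintf(stderr, "rank %d: could not determine file size\n", rank);
    else if (rank == 0)
        printf("file size: %lld bytes\n", (long long)size);

    MPI_File_close(&fh);
    MPI_Finalize();
    return 0;
}

The sentinel works because MPI_Offset is a signed type and a real file size is never negative, so no valid result can be mistaken for the error marker.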
