Parallel rank0 deadlock fixes #1183

Merged: 3 commits, Jan 22, 2022
Changes from 2 commits
12 changes: 12 additions & 0 deletions release_docs/RELEASE.txt
@@ -1039,6 +1039,18 @@ Bug Fixes since HDF5-1.12.0 release
===================================
Library
-------
- Fixed several potential MPI deadlocks in library failure conditions

In the parallel library, there were several places where MPI rank 0
could end up skipping past collective MPI operations when some failure
occurs in rank 0-specific processing. This would lead to deadlocks
where rank 0 completes an operation while other ranks wait in the
collective operation. These places have been rewritten to have rank 0
push an error and try to clean up after the failure, then continue to
participate in the collective operation to the best of its ability.

(JTH - 2021/11/09)

- Fixed an issue with collective metadata reads being permanently disabled
after a dataset chunk lookup operation. This would usually cause a
mismatched MPI_Bcast and MPI_ERR_TRUNCATE issue in the library for
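For context, the failure mode described in the RELEASE.txt note above can be reduced to a minimal MPI sketch (illustrative only, not HDF5 code): if rank 0 bails out before a collective call that the other ranks have already entered, they wait forever; the fix pattern is to record the error and still make the matching collective call.

/* Minimal illustration of the rank 0 deadlock pattern (not HDF5 code).
 * Build with an MPI compiler, e.g. mpicc. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
    int      mpi_rank    = 0;
    int      err         = 0; /* local error flag on rank 0 */
    unsigned num_entries = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);

    if (mpi_rank == 0) {
        /* Pretend some rank 0-specific processing failed */
        err = 1;

        /* BROKEN: returning here would leave the other ranks blocked in
         * the MPI_Bcast below -- this is the deadlock being fixed.
         *
         *     if (err) { MPI_Finalize(); return EXIT_FAILURE; }
         *
         * FIXED: record the error (the library pushes it on the error
         * stack) and fall through so the collective call still happens. */
        if (err)
            fprintf(stderr, "rank 0: error recorded, still participating\n");
    }

    /* All ranks, including a failed rank 0, reach the collective call */
    MPI_Bcast(&num_entries, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);

    MPI_Finalize();
    return err ? EXIT_FAILURE : EXIT_SUCCESS;
}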
11 changes: 8 additions & 3 deletions src/H5AC.c
@@ -1636,9 +1636,14 @@ H5AC_unprotect(H5F_t *f, const H5AC_class_t *type, haddr_t addr, void *thing, un
if (H5AC__log_dirtied_entry((H5AC_info_t *)thing) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "can't log dirtied entry")

if (deleted && aux_ptr->mpi_rank == 0)
if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")
if (deleted && aux_ptr->mpi_rank == 0) {
if (H5AC__log_deleted_entry((H5AC_info_t *)thing) < 0) {
/* If we fail to log the deleted entry, push an error but still
* participate in a possible sync point ahead
*/
HDONE_ERROR(H5E_CACHE, H5E_CANTUNPROTECT, FAIL, "H5AC__log_deleted_entry() failed")

Contributor:
I was of the impression that HDONE_ERROR is only used after the done: tag. Is this correct?

Collaborator (Author):
In general, yes. However, the main upside to the macro is just that it doesn't do the typical goto. It still pushes an error message to the stack, sets the err_occurred variable and sets the return value. The HERROR macro only pushes an error message to the stack, without setting those needed variables. The HCOMMON_ERROR macro might be a good alternative, but the header comment for it says that it shouldn't need to be used outside H5Eprivate.h.

Contributor:
If I understand what is going on here, you are trying to flag an error but not jump to done so as to participate in the upcoming sync point.

Using HDONE_ERROR doesn't seem the right way to do this.

For starters, we have comments expressly stating that HDONE_ERROR is only to be used after the done: flag. Breaking this rule without explanation is bound to cause confusion.

I note that we do this already in H5Fint.c in H5F_flush_phase1, H5F_flush_phase2, and H5F_dest. However, we add the comment "/* Push error, but keep going */". At a minimum, we should have this. It would be good to explain why.

One could also argue that we should only be using HDONE_ERROR in this context in parallel builds. I'm not sure that this is a good idea, but it might be worth thinking about.

Collaborator (Author) @jhendersonHDF, Nov 23, 2021:
In general, I tried to add a comment to all these places to the same effect, mentioning that we push an error but keep participating in the collective operation. If any of those comments aren't too clear on that, please let me know so I can clarify. I do agree that HDONE_ERROR isn't necessarily meant for this, but I believe an alternative macro would be implemented exactly the same, just with a different name.

Perhaps "DONE_ERROR" was not the best name, because there should be no harm from calling it outside the done: tag. Seems to me like the reasoning was just the fact that calling HGOTO_ERROR after the done: tag would of course result in an infinite loop.

}
}
} /* end if */
#endif /* H5_HAVE_PARALLEL */

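For readers unfamiliar with the macros discussed in the H5AC.c thread above, here is a rough sketch of the semantics described there (simplified, and not the actual definitions from H5Eprivate.h, which also record the error class and major/minor codes): HGOTO_ERROR pushes an error, sets the return value, and jumps to done:, while HDONE_ERROR does the same bookkeeping without the jump, which is what lets rank 0 keep moving toward the collective call.

/* Simplified sketch of the error-macro semantics described above
 * (illustrative only; the real macros live in H5Eprivate.h). */
#include <stdio.h>

#define SKETCH_SUCCEED 0
#define SKETCH_FAIL    (-1)

/* HGOTO_ERROR-like: push an error, set the return value, jump to done: */
#define SKETCH_HGOTO_ERROR(ret_val, msg)                                      \
    do {                                                                      \
        fprintf(stderr, "error: %s\n", msg); /* stands in for error push */   \
        ret_value = (ret_val);                                                \
        goto done;                                                            \
    } while (0)

/* HDONE_ERROR-like: same bookkeeping, but no goto, so execution continues
 * toward the collective operation (the real macro also sets err_occurred). */
#define SKETCH_HDONE_ERROR(ret_val, msg)                                      \
    do {                                                                      \
        fprintf(stderr, "error: %s\n", msg);                                  \
        ret_value = (ret_val);                                                \
    } while (0)

/* Usage shape mirroring the diff above (hypothetical function) */
int
rank0_work_sketch(int alloc_failed, int write_failed)
{
    int ret_value = SKETCH_SUCCEED;

    if (alloc_failed)
        /* fatal for this rank: jump straight to cleanup */
        SKETCH_HGOTO_ERROR(SKETCH_FAIL, "allocation failed");

    if (write_failed)
        /* push the error but keep going so the collective call below runs */
        SKETCH_HDONE_ERROR(SKETCH_FAIL, "rank 0 write failed");

    /* ... the matching collective operation would go here ... */

done:
    return ret_value;
}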
97 changes: 61 additions & 36 deletions src/H5ACmpio.c
@@ -304,8 +304,10 @@ H5AC__broadcast_candidate_list(H5AC_t *cache_ptr, unsigned *num_entries_ptr, had
* are used to receiving from process 0, and also load it
* into a buffer for transmission.
*/
if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")
if (H5AC__copy_candidate_list_to_buffer(cache_ptr, &chk_num_entries, &haddr_buf_ptr) < 0) {
/* Push an error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate buffer.")

Contributor:
call to HDONE_ERROR before done: flag?

Contributor:
Also, if we can't copy the candidate list to buffer, I expect we will crash and burn regardless. Thus I don't see much point in participating in the Bcast.

Collaborator (Author):
Yes, it will almost certainly crash. On the other hand, it will probably crash in a useful way from a debugging perspective. The alternative would be a hang, which seemed the worse of the two options, but I could be convinced either way.

}
HDassert(chk_num_entries == num_entries);
HDassert(haddr_buf_ptr != NULL);

@@ -428,18 +430,23 @@ H5AC__broadcast_clean_list(H5AC_t *cache_ptr)

/* allocate a buffer to store the list of entry base addresses in */
buf_size = sizeof(haddr_t) * num_entries;
if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")

/* Set up user data for callback */
udata.aux_ptr = aux_ptr;
udata.addr_buf_ptr = addr_buf_ptr;
udata.u = 0;

/* Free all the clean list entries, building the address list in the callback */
/* (Callback also removes the matching entries from the dirtied list) */
if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
if (NULL == (addr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
/* Push an error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for addr buffer")

Collaborator (Author):
This is a case where the MPI implementation may crash when MPI_Bcast is called with a NULL addr_buf_ptr. At least, this seems to be the case for OpenMPI. However, if memory allocation fails here, there isn't a whole lot else we can do, and rank 0 still needs to participate in the Bcast in the same manner as the other ranks.

Contributor:
Per comment above, I don't see much point in worrying about the Bcast if rank 0 can't create the buffer.

If you want to recover, the better solution would be to allocate the buffer before we Bcast the number of entries in it, and pretend entry_count is zero if the malloc fails. That said, if rank 0 can't allocate a smallish buffer, the computation is hosed regardless.
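A sketch of the alternative the reviewer describes (hypothetical helper and type names, not code from this PR): allocate before broadcasting the entry count, and fall back to a count of zero if the allocation fails, so every rank consistently skips the address broadcast.

/* Sketch of the reviewer's suggestion (hypothetical names; haddr_sketch_t
 * stands in for haddr_t). */
#include <mpi.h>
#include <stdlib.h>

typedef unsigned long long haddr_sketch_t;

int
broadcast_clean_list_sketch(MPI_Comm comm, int mpi_rank, unsigned num_entries)
{
    haddr_sketch_t *addr_buf = NULL;
    int             ret      = 0;

    if (mpi_rank == 0 && num_entries > 0) {
        addr_buf = malloc(num_entries * sizeof(*addr_buf));
        if (NULL == addr_buf) {
            /* Pretend the list is empty so the address broadcast is skipped
             * consistently on every rank; record the error for the caller. */
            num_entries = 0;
            ret         = -1;
        }
        /* ... rank 0 would fill addr_buf from its clean list here ... */
    }

    /* Collective on all ranks: everyone learns the (possibly zeroed) count */
    MPI_Bcast(&num_entries, 1, MPI_UNSIGNED, 0, comm);

    if (num_entries > 0) {
        if (mpi_rank != 0)
            addr_buf = malloc(num_entries * sizeof(*addr_buf));
        /* (a complete version would need its own fallback here, since every
         *  rank must still make this matching MPI_Bcast call) */
        MPI_Bcast(addr_buf, (int)num_entries, MPI_UNSIGNED_LONG_LONG, 0, comm);
    }

    free(addr_buf);
    return ret;
}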

}
else {
/* Set up user data for callback */
udata.aux_ptr = aux_ptr;
udata.addr_buf_ptr = addr_buf_ptr;
udata.u = 0;

/* Free all the clean list entries, building the address list in the callback */
/* (Callback also removes the matching entries from the dirtied list) */
if (H5SL_free(aux_ptr->c_slist_ptr, H5AC__broadcast_clean_list_cb, &udata) < 0) {
/* Push an error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_CACHE, H5E_CANTFREE, FAIL, "Can't build address list for clean entries")
}
}

/* Now broadcast the list of cleaned entries */
if (MPI_SUCCESS !=
@@ -1448,8 +1455,10 @@ H5AC__receive_haddr_list(MPI_Comm mpi_comm, unsigned *num_entries_ptr, haddr_t *

/* allocate buffers to store the list of entry base addresses in */
buf_size = sizeof(haddr_t) * num_entries;
if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size)))
HGOTO_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
if (NULL == (haddr_buf_ptr = (haddr_t *)H5MM_malloc(buf_size))) {
/* Push an error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_CACHE, H5E_CANTALLOC, FAIL, "memory allocation failed for haddr buffer")
}

/* Now receive the list of candidate entries */
if (MPI_SUCCESS !=
@@ -1800,10 +1809,14 @@ H5AC__rsp__dist_md_write__flush_to_min_clean(H5F_t *f)

if (evictions_enabled) {
/* construct candidate list -- process 0 only */
if (aux_ptr->mpi_rank == 0)
if (aux_ptr->mpi_rank == 0) {
/* If constructing candidate list fails, push an error but still participate
* in collective operations during following candidate list propagation
*/
if (H5AC__construct_candidate_list(cache_ptr, aux_ptr, H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN) <
0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't construct candidate list.")
}

/* propagate and apply candidate list -- all processes */
if (H5AC__propagate_and_apply_candidate_list(f) < 0)
@@ -1899,15 +1912,21 @@ H5AC__rsp__p0_only__flush(H5F_t *f)
aux_ptr->write_permitted = FALSE;

/* Check for error on the write operation */
if (result < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")

/* this code exists primarily for the test bed -- it allows us to
* enforce POSIX semantics on the server that pretends to be a
* file system in our parallel tests.
*/
if (aux_ptr->write_done)
(aux_ptr->write_done)();
if (result < 0) {
/* If write operation fails, push an error but still participate
* in collective operations during following cache entry
* propagation
*/
HDONE_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "Can't flush.")
}
else {
/* this code exists primarily for the test bed -- it allows us to
* enforce POSIX semantics on the server that pretends to be a
* file system in our parallel tests.
*/
if (aux_ptr->write_done)
(aux_ptr->write_done)();
}
} /* end if */

/* Propagate cleaned entries to other ranks. */
@@ -2019,15 +2038,21 @@ H5AC__rsp__p0_only__flush_to_min_clean(H5F_t *f)
aux_ptr->write_permitted = FALSE;

/* Check for error on the write operation */
if (result < 0)
HGOTO_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")

/* this call exists primarily for the test code -- it is used
* to enforce POSIX semantics on the process used to simulate
* reads and writes in t_cache.c.
*/
if (aux_ptr->write_done)
(aux_ptr->write_done)();
if (result < 0) {
/* If write operation fails, push an error but still participate
* in collective operations during following cache entry
* propagation
*/
HDONE_ERROR(H5E_CACHE, H5E_SYSTEM, FAIL, "H5C_flush_to_min_clean() failed.")
}
else {
/* this call exists primarily for the test code -- it is used
* to enforce POSIX semantics on the process used to simulate
* reads and writes in t_cache.c.
*/
if (aux_ptr->write_done)
(aux_ptr->write_done)();
}
} /* end if */

if (H5AC__propagate_flushed_and_still_clean_entries_list(f) < 0)
42 changes: 35 additions & 7 deletions src/H5C.c
@@ -2307,9 +2307,14 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
H5MM_memcpy(((uint8_t *)entry_ptr->image_ptr) + entry_ptr->size, H5C_IMAGE_SANITY_VALUE,
H5C_IMAGE_EXTRA_SPACE);
#endif /* H5C_DO_MEMORY_SANITY_CHECKS */
if (0 == mpi_rank)
if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
if (0 == mpi_rank) {
if (H5C__generate_image(f, cache_ptr, entry_ptr) < 0) {
/* If image generation fails, push an error but
* still participate in the following MPI_Bcast
*/
HDONE_ERROR(H5E_CACHE, H5E_CANTGET, NULL, "can't generate entry's image")
}
}
} /* end if */
HDassert(entry_ptr->image_ptr);

@@ -7182,8 +7187,20 @@ H5C__load_entry(H5F_t *f,
#ifdef H5_HAVE_PARALLEL
if (!coll_access || 0 == mpi_rank) {
#endif /* H5_HAVE_PARALLEL */
if (H5F_block_read(f, type->mem_type, addr, len, image) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")

if (H5F_block_read(f, type->mem_type, addr, len, image) < 0) {

#ifdef H5_HAVE_PARALLEL
if (coll_access) {
/* Push an error, but still participate in following MPI_Bcast */
HDmemset(image, 0, len);
HDONE_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
}
else
#endif
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, NULL, "Can't read image*")
}

#ifdef H5_HAVE_PARALLEL
} /* end if */
/* if the collective metadata read optimization is turned on,
@@ -7230,8 +7247,19 @@
* loaded thing, go get the on-disk image again (the extra portion).
*/
if (H5F_block_read(f, type->mem_type, addr + len, actual_len - len, image + len) <
0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
0) {

#ifdef H5_HAVE_PARALLEL
if (coll_access) {
/* Push an error, but still participate in following MPI_Bcast */
HDmemset(image + len, 0, actual_len - len);
HDONE_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
}
else
#endif
HGOTO_ERROR(H5E_CACHE, H5E_CANTLOAD, NULL, "can't read image")
}

#ifdef H5_HAVE_PARALLEL
}
/* If the collective metadata read optimization is turned on,
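Reduced to a standalone sketch, the collective-read pattern in the H5C.c hunks above looks roughly like this (hypothetical names; read_block() is a stub standing in for H5F_block_read): under collective access, a failed rank 0 hands the other ranks a zeroed image and records the error instead of jumping to done, so the broadcast they are waiting in still completes.

/* Sketch of the collective-read pattern above (hypothetical names). */
#include <mpi.h>
#include <string.h>

/* trivial stand-in for H5F_block_read(); 0 on success, <0 on error */
static int
read_block(unsigned long long addr, size_t len, void *buf)
{
    (void)addr; (void)len; (void)buf;
    return 0;
}

int
load_image_sketch(MPI_Comm comm, int mpi_rank, int coll_access,
                  unsigned long long addr, size_t len, unsigned char *image)
{
    int ret = 0;

    if (!coll_access || 0 == mpi_rank) {
        if (read_block(addr, len, image) < 0) {
            if (coll_access) {
                /* Rank 0 must still feed the broadcast below, so give the
                 * other ranks a zeroed image and record the failure. */
                memset(image, 0, len);
                ret = -1;
            }
            else
                return -1; /* independent access: safe to bail out early */
        }
    }

    if (coll_access)
        /* every rank, including a failed rank 0, reaches the collective call */
        MPI_Bcast(image, (int)len, MPI_BYTE, 0, comm);

    return ret;
}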
8 changes: 2 additions & 6 deletions src/H5CX.c
@@ -1397,9 +1397,7 @@ H5CX_set_apl(hid_t *acspl_id, const H5P_libclass_t *libclass,

/* If parallel is enabled and the file driver used is the MPI-IO
* VFD, issue an MPI barrier for easier debugging if the API function
* calling this is supposed to be called collectively. Note that this
* happens only when the environment variable H5_COLL_BARRIER is set
* to non 0.

Collaborator (Author):
Just a minor documentation issue here I noticed while fixing these other issues: The H5_COLL_BARRIER environment variable doesn't exist

* calling this is supposed to be called collectively.
*/
if (H5_coll_api_sanity_check_g) {
MPI_Comm mpi_comm; /* File communicator */
@@ -1456,9 +1454,7 @@ H5CX_set_loc(hid_t

/* If parallel is enabled and the file driver used is the MPI-IO
* VFD, issue an MPI barrier for easier debugging if the API function
* calling this is supposed to be called collectively. Note that this
* happens only when the environment variable H5_COLL_BARRIER is set
* to non 0.
* calling this is supposed to be called collectively.
*/
if (H5_coll_api_sanity_check_g) {
MPI_Comm mpi_comm; /* File communicator */
3 changes: 3 additions & 0 deletions src/H5Cimage.c
@@ -997,6 +997,9 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr)
#endif /* H5_HAVE_PARALLEL */

/* Read the buffer (if serial access, or rank 0 of parallel access) */
/* NOTE: if this block read is being performed on rank 0 only, throwing
* an error here will cause other ranks to hang in the following MPI_Bcast.
*/

Collaborator (Author):
For now, I'm just leaving a note here as properly fixing the potential hang is a little tricky and very messy with #ifdefs

if (H5F_block_read(f, H5FD_MEM_SUPER, cache_ptr->image_addr, cache_ptr->image_len,
cache_ptr->image_buffer) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_READERROR, FAIL, "Can't read metadata cache image block")
13 changes: 10 additions & 3 deletions src/H5Dcontig.c
@@ -268,9 +268,16 @@ H5D__contig_fill(const H5D_io_info_t *io_info)
if (using_mpi) {
/* Write the chunks out from only one process */
/* !! Use the internal "independent" DXPL!! -QAK */
if (H5_PAR_META_WRITE == mpi_rank)
if (H5D__contig_write_one(&ioinfo, offset, size) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
if (H5_PAR_META_WRITE == mpi_rank) {
if (H5D__contig_write_one(&ioinfo, offset, size) < 0) {
/* If writing fails, push an error and stop writing, but
* still participate in following MPI_Barrier.
*/
blocks_written = TRUE;
HDONE_ERROR(H5E_DATASET, H5E_CANTINIT, FAIL, "unable to write fill value to dataset")
break;
}
}

/* Indicate that blocks are being written */
blocks_written = TRUE;
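The H5Dcontig.c change above follows the same pattern with an MPI_Barrier; a rough sketch (hypothetical names; write_fill() is a stub standing in for H5D__contig_write_one, and the barrier itself sits outside the hunk shown): the metadata-writing rank records the failure, stops writing, and still reaches the barrier the other ranks execute.

/* Sketch of the fill-value write pattern above (hypothetical names). */
#include <mpi.h>
#include <stdbool.h>

#define META_WRITE_RANK 0 /* stand-in for H5_PAR_META_WRITE */

/* stub standing in for H5D__contig_write_one(); 0 on success, <0 on error */
static int
write_fill(void)
{
    return 0;
}

int
contig_fill_sketch(MPI_Comm comm, int mpi_rank, int nblocks)
{
    bool blocks_written = false;
    int  ret            = 0;
    int  i;

    for (i = 0; i < nblocks; i++) {
        if (META_WRITE_RANK == mpi_rank) {
            if (write_fill() < 0) {
                /* record the error and stop writing, but do not return:
                 * every rank still has to reach the barrier below */
                blocks_written = true;
                ret            = -1;
                break;
            }
        }

        /* all ranks note that blocks are being written */
        blocks_written = true;
    }

    if (blocks_written)
        /* collective: the other ranks wait here until the writer arrives */
        MPI_Barrier(comm);

    return ret;
}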
12 changes: 10 additions & 2 deletions src/H5Dmpio.c
@@ -2360,8 +2360,16 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")

if (mpi_rank == 0) {
if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) {
size_t u;

/* Clear total chunk address array */
for (u = 0; u < (size_t)fm->layout->u.chunk.nchunks; u++)
total_chunk_addr_array[u] = HADDR_UNDEF;

/* Push error, but still participate in following MPI_Bcast */
HDONE_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't get chunk address")
}
} /* end if */

/* Broadcasting the MPI_IO option info. and chunk address info. */
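The H5Dmpio.c change works the same way for the chunk-address broadcast in H5D__sort_chunk; a sketch (hypothetical names; build_addr_map() is a stub standing in for H5D__chunk_addrmap, and HADDR_UNDEF_SKETCH for HADDR_UNDEF): a failed rank 0 broadcasts an array of undefined addresses rather than leaving the other ranks stuck in MPI_Bcast.

/* Sketch of the chunk-address broadcast pattern above (hypothetical names). */
#include <mpi.h>
#include <stddef.h>

#define HADDR_UNDEF_SKETCH ((unsigned long long)-1) /* stand-in for HADDR_UNDEF */

/* stub standing in for H5D__chunk_addrmap(); 0 on success, <0 on error */
static int
build_addr_map(unsigned long long *addrs, size_t nchunks)
{
    size_t u;

    for (u = 0; u < nchunks; u++)
        addrs[u] = (unsigned long long)u; /* placeholder addresses */
    return 0;
}

int
sort_chunk_sketch(MPI_Comm comm, int mpi_rank, size_t nchunks,
                  unsigned long long *chunk_addrs)
{
    int ret = 0;

    if (mpi_rank == 0) {
        if (build_addr_map(chunk_addrs, nchunks) < 0) {
            size_t u;

            /* hand the other ranks a well-defined (if useless) array and
             * record the error -- the broadcast below must still happen */
            for (u = 0; u < nchunks; u++)
                chunk_addrs[u] = HADDR_UNDEF_SKETCH;
            ret = -1;
        }
    }

    /* collective on all ranks, whether or not rank 0 succeeded */
    MPI_Bcast(chunk_addrs, (int)nchunks, MPI_UNSIGNED_LONG_LONG, 0, comm);

    return ret;
}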