Skip to content

Commit

Permalink
uses blocking comm if openmpi version is less than 4.1
Browse files Browse the repository at this point in the history
On two different systems, using OpenMPI 4.0.x results in deadlocks in our distributed solver test. With versions 4.1.[34] the deadlock disappears, and neither Intel MPI nor MVAPICH2 shows a deadlock.
  • Loading branch information
MarcelKoch committed Oct 11, 2022
1 parent 69c8f65 commit 699d4da
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 0 deletions.
15 changes: 15 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -207,13 +207,28 @@ else()
endif()

# MPI configuration.
#
# GINKGO_HAVE_GPU_AWARE_MPI       ON only if MPI is built and the user set
#                                 GINKGO_FORCE_GPU_AWARE_MPI.
# GINKGO_FORCE_SPMV_BLOCKING_COMM ON if the MPI launcher identifies itself as
#                                 OpenMPI older than 4.1 (or as OpenMPI with a
#                                 version we cannot parse); it is forwarded to
#                                 config.hpp through #cmakedefine.
set(GINKGO_HAVE_GPU_AWARE_MPI OFF)
set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF)
if(GINKGO_BUILD_MPI)
    find_package(MPI REQUIRED)
    if(GINKGO_FORCE_GPU_AWARE_MPI)
        set(GINKGO_HAVE_GPU_AWARE_MPI ON)
    endif()

    # Ask the launcher for its version banner. Only run it when FindMPI
    # actually located an executable; otherwise treat the banner as empty.
    set(mpiexec_output "")
    if(MPIEXEC_EXECUTABLE)
        execute_process(
            COMMAND "${MPIEXEC_EXECUTABLE}" --version
            OUTPUT_VARIABLE mpiexec_output
            OUTPUT_STRIP_TRAILING_WHITESPACE)
    endif()

    # Case-insensitive match for "OpenMPI", "Open MPI", "open-mpi", ...
    string(REGEX MATCH "[oO][pP][eE][nN].?[mM][pP][iI]" openmpi_match "${mpiexec_output}")
    if(openmpi_match)
        # The backslash must be doubled: inside a quoted CMake argument a
        # single "\." collapses to "." and would match any character.
        string(REGEX MATCH "[0-9]+(\\.[0-9]+)+" openmpi_version "${mpiexec_output}")
        # An unparsable version is handled conservatively, like an old one.
        if("${openmpi_version}" STREQUAL "" OR openmpi_version VERSION_LESS "4.1")
            message(WARNING
                "OpenMPI v4.0.x has a bug that forces us to use blocking communication in our distributed "
                "matrix class. To enable faster, non-blocking communication, consider updating your OpenMPI version or "
                "switch to a different vendor.")
            set(GINKGO_FORCE_SPMV_BLOCKING_COMM ON)
        endif()
    endif()
endif()

# Try to find the third party packages before using our subdirectories
Expand Down
8 changes: 8 additions & 0 deletions core/distributed/matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,18 @@ mpi::request Matrix<ValueType, LocalIndexType, GlobalIndexType>::communicate(
auto recv_ptr = use_host_buffer ? host_recv_buffer_->get_values()
: recv_buffer_->get_values();
exec->synchronize();
#ifdef GINKGO_FORCE_SPMV_BLOCKING_COMM
comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec, send_ptr,
send_sizes_.data(), send_offsets_.data(), type.get(),
recv_ptr, recv_sizes_.data(), recv_offsets_.data(),
type.get());
return {};
#else
return comm.i_all_to_all_v(
use_host_buffer ? exec->get_master() : exec, send_ptr,
send_sizes_.data(), send_offsets_.data(), type.get(), recv_ptr,
recv_sizes_.data(), recv_offsets_.data(), type.get());
#endif
}


Expand Down
6 changes: 6 additions & 0 deletions include/ginkgo/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -99,4 +99,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// clang-format on


/* Do we need to use blocking communication in our SpMV?
 *
 * Defined at configure time when the CMake variable of the same name is ON,
 * which the build system sets after detecting an OpenMPI version older than
 * 4.1. When defined, the distributed matrix's communicate() uses the blocking
 * all_to_all_v instead of the non-blocking i_all_to_all_v.
 */
// clang-format off
#cmakedefine GINKGO_FORCE_SPMV_BLOCKING_COMM
// clang-format on


#endif // GKO_INCLUDE_CONFIG_H

0 comments on commit 699d4da

Please sign in to comment.