From 69794af912ee563aa11ed1f5e49ff9bdd5962559 Mon Sep 17 00:00:00 2001
From: Jean-Luc Fattebert
Date: Fri, 9 Sep 2022 15:19:43 -0400
Subject: [PATCH] Couple with ELPA

---
 CMakeLists.txt                                     |  19 ++
 build.sh                                           |   6 +
 cmake/FindELPA.cmake                               |  15 ++
 scripts/build_olcf_summit_gnu_elpa_essl.sh         |  51 +++++
 .../bml_diagonalize_distributed2d.h                |  12 ++
 .../bml_diagonalize_distributed2d_typed.c          | 204 +++++++++++++++++-
 tests/C-tests/bml_test.c                           |   2 +-
 7 files changed, 303 insertions(+), 6 deletions(-)
 create mode 100644 cmake/FindELPA.cmake
 create mode 100755 scripts/build_olcf_summit_gnu_elpa_essl.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c1fee275..a2f2492fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -382,6 +382,25 @@ if(BML_MAGMA)
   endif()
 endif()
 
+set(BML_ELPA FALSE CACHE BOOL "Whether to use ELPA library")
+if(BML_ELPA)
+  message(STATUS "Search for ELPA in directory ${ELPA_DIR}\n")
+  find_package(ELPA REQUIRED)
+
+  if(${ELPA_FOUND})
+    message(STATUS "ELPA was found:\n"
+      "   ELPA_INCLUDE_DIRS: ${ELPA_INCLUDE_DIRS}\n"
+      "   ELPA_LIBRARY_DIRS: ${ELPA_LIBRARY_DIRS}\n"
+      "   ELPA_LIBRARIES: ${ELPA_LIBRARIES}"
+    )
+    add_definitions(-DBML_USE_ELPA)
+    include_directories(${ELPA_INCLUDE_DIRS})
+    link_directories(${CUDAToolkit_LIBRARY_DIR})
+    link_directories(${ELPA_LIBRARY_DIRS})
+    list(APPEND LINK_LIBRARIES ${ELPA_LIBRARIES})
+  endif()
+endif()
+
 set(BML_SCALAPACK FALSE CACHE BOOL "Whether to use ScaLAPACK library")
 if(BML_SCALAPACK)
   add_definitions(-DBML_USE_SCALAPACK)
diff --git a/build.sh b/build.sh
index ac01ab6e7..b3b336cc8 100755
--- a/build.sh
+++ b/build.sh
@@ -94,6 +94,8 @@ EOF
     echo "BML_XSMM            Build with XSMM (default is ${BML_XSMM})"
     echo "BML_SCALAPACK       Build with SCALAPACK (default is ${BML_SCALAPACK})"
     echo "SCALAPACK_LIBRARIES ScaLapack libraries (default is '${SCALAPACK_LIBRARIES}')"
+    echo "BML_ELPA            Build with ELPA (default is ${BML_ELPA})"
+    echo "ELPA_DIR            ELPA directory (default is ${ELPA_DIR})"
     echo "BML_ELLBLOCK_MEMPOOL Use ellblock memory pool (default is ${BML_ELLBLOCK_MEMPOOL}"
     echo "CUDA_TOOLKIT_ROOT_DIR Path to CUDA dir (default is '${CUDA_TOOLKIT_ROOT_DIR}')"
     echo "INTEL_OPT           {yes, no} (default is ${INTEL_OPT})"
@@ -125,6 +127,8 @@ set_defaults() {
     : ${BLAS_LIBRARIES:=}
     : ${LAPACK_LIBRARIES:=}
     : ${SCALAPACK_LIBRARIES:=}
+    : ${BML_ELPA:=no}
+    : ${ELPA_DIR:=}
     : ${BML_TESTING:=yes}
     : ${BML_VALGRIND:=no}
     : ${BML_COVERAGE:=no}
@@ -214,6 +218,7 @@ configure() {
         -DCMAKE_C_COMPILER="${CC}" \
         -DCMAKE_CXX_COMPILER="${CXX}" \
         -DCMAKE_Fortran_COMPILER="${FC}" \
+        -DCMAKE_PREFIX_PATH="${ELPA_DIR}" \
         ${CMAKE_C_FLAGS:+-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}"} \
         ${CMAKE_CXX_FLAGS:+-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}"} \
         ${CMAKE_Fortran_FLAGS:+-DCMAKE_Fortran_FLAGS="${CMAKE_Fortran_FLAGS}"} \
@@ -221,6 +226,7 @@ configure() {
         -DBLAS_LIBRARIES="${BLAS_LIBRARIES}" \
         -DLAPACK_LIBRARIES="${LAPACK_LIBRARIES}" \
         -DSCALAPACK_LIBRARIES="${SCALAPACK_LIBRARIES}" \
+        -DBML_ELPA="${BML_ELPA}" \
         -DBML_OPENMP="${BML_OPENMP}" \
         -DMKL_GPU="${MKL_GPU}" \
         -DBML_MPI="${BML_MPI}" \
diff --git a/cmake/FindELPA.cmake b/cmake/FindELPA.cmake
new file mode 100644
index 000000000..1ec2bace2
--- /dev/null
+++ b/cmake/FindELPA.cmake
@@ -0,0 +1,15 @@
+# - Find the ELPA library
+#
+# Usage:
+#   find_package(ELPA [REQUIRED] [QUIET])
+#
+# It sets the following variables:
+#   ELPA_FOUND ... true if elpa is found on the system
+#   ELPA_LIBRARY_DIRS ... full path to elpa library
+#   ELPA_INCLUDE_DIRS ... elpa include directory
+#   ELPA_LIBRARIES ... elpa libraries
+
+
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(ELPA REQUIRED elpa IMPORTED_TARGET)
+
diff --git a/scripts/build_olcf_summit_gnu_elpa_essl.sh b/scripts/build_olcf_summit_gnu_elpa_essl.sh
new file mode 100755
index 000000000..182fade14
--- /dev/null
+++ b/scripts/build_olcf_summit_gnu_elpa_essl.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+module load cmake
+module load cuda
+module load gcc/10.2.0
+module load essl
+module load magma
+module load netlib-scalapack
+module load netlib-lapack
+
+rm -r build
+rm -r install
+
+MY_PATH=$(pwd)
+
+# get jsrun with full path
+JSRUN=$(which jsrun)
+echo ${JSRUN}
+
+export MAGMA_ROOT=${MAGMA_ROOT:="${OLCF_MAGMA_ROOT}"}
+export CC=${CC:=mpicc}
+export FC=${FC:=mpif90}
+export CXX=${CXX:=mpiCC}
+export BML_OPENMP=${BML_OPENMP:=yes}
+export BML_MPI=${BML_MPI:=yes}
+export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=no}
+export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
+export BML_TESTING=${BML_TESTING:=yes}
+export BML_MAGMA=${BML_MAGMA:=yes}
+export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Debug}
+
+# set BLAS explicitly, otherwise cmake will pick the serial version of essl
+export BLAS_LIBRARIES=${BLAS_LIBRARIES:="$OLCF_ESSL_ROOT/lib64/libesslsmp.so"}
+# since essl does not contain all the lapack functions needed, we still need lapack
+export LAPACK_LIBRARIES=${LAPACK_LIBRARIES:="$OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.so"}
+export BML_SCALAPACK=${BML_SCALAPACK:=yes}
+export SCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES:="-L$OLCF_NETLIB_SCALAPACK_ROOT/lib -lscalapack"}
+
+export BML_CUDA=${BML_CUDA:=yes}
+export BML_ELPA=${BML_ELPA:=yes}
+export ELPA_DIR=${ELPA_DIR:=/ccs/proj/csc304/elpa}
+export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-lgfortran"}
+
+# use jsrun to run tests on a compute node
+export BML_NONMPI_PRECOMMAND=${BML_NONMPI_PRECOMMAND:=${JSRUN}}
+export BML_NONMPI_PRECOMMAND_ARGS=${BML_NONMPI_PRECOMMAND_ARGS:="-n1;-a1;-g1;-c7;--smpiargs=off"}
+
+export BML_MPIEXEC_EXECUTABLE=${BML_MPIEXEC_EXECUTABLE:=${JSRUN}}
+export BML_MPIEXEC_NUMPROCS_FLAG=${BML_MPIEXEC_NUMPROCS_FLAG:="-n"}
+export BML_MPIEXEC_PREFLAGS=${BML_MPIEXEC_PREFLAGS:="-a1;-c4;-bpacked:2;-g1"}
+
+./build.sh install
diff --git a/src/C-interface/distributed2d/bml_diagonalize_distributed2d.h b/src/C-interface/distributed2d/bml_diagonalize_distributed2d.h
index e3f14e41f..a3055e2cd 100644
--- a/src/C-interface/distributed2d/bml_diagonalize_distributed2d.h
+++ b/src/C-interface/distributed2d/bml_diagonalize_distributed2d.h
@@ -28,4 +28,16 @@ void bml_diagonalize_distributed2d_double_complex(
     void *eigenvalues,
     bml_matrix_distributed2d_t * eigenvectors);
 
+#ifdef BML_USE_ELPA
+void bml_diagonalize_distributed2d_elpa_single_real(
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors);
+
+void bml_diagonalize_distributed2d_elpa_double_real(
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors);
+#endif
+
 #endif
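For orientation, here is a minimal sketch, not part of the patch, of how the double_real entry point declared above could be driven directly. The wrapper name diagonalize_with_elpa and the caller-side setup are hypothetical; in practice the generic bml_diagonalize_distributed2d dispatcher added further below is the intended interface.

#ifdef BML_USE_ELPA
#include <stdlib.h>
#include "bml_diagonalize_distributed2d.h"

/* Hypothetical driver: A and eigenvectors are N x N double_real
 * distributed2d matrices created elsewhere by the caller. */
void
diagonalize_with_elpa(
    bml_matrix_distributed2d_t * A,
    bml_matrix_distributed2d_t * eigenvectors,
    int N)
{
    /* eigenvalues come back in a caller-owned host array */
    double *eigenvalues = calloc(N, sizeof(double));
    bml_diagonalize_distributed2d_elpa_double_real(A, eigenvalues,
                                                   eigenvectors);
    /* ... use eigenvalues and eigenvectors ... */
    free(eigenvalues);
}
#endif

Note that the eigenvalues buffer is a plain host array even in GPU builds: as the typed implementation below shows, the spectrum is copied back from device memory before being returned.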
diff --git a/src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c b/src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c
index 209765e54..6d008c8d9 100644
--- a/src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c
+++ b/src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c
@@ -14,6 +14,16 @@
 #include "../bml_transpose.h"
 #include "../bml_copy.h"
 
+#ifdef BML_USE_ELPA
+#include <elpa/elpa.h>
+#include <assert.h>
+#include "../dense/bml_allocate_dense.h"
+#ifdef BML_USE_MAGMA
+#include "../../typed.h"
+#include "magma_v2.h"
+#endif
+#endif
+
 #include <complex.h>
 #include <mpi.h>
@@ -139,13 +149,13 @@ void PZHEEVD(
  * \param eigenvalues Eigenvalues of A
  * \param eigenvectors Eigenvectors of A
  */
+#ifdef BML_USE_SCALAPACK
 void TYPED_FUNC(
-    bml_diagonalize_distributed2d) (
+    bml_diagonalize_distributed2d_scalapack) (
     bml_matrix_distributed2d_t * A,
     void *eigenvalues,
     bml_matrix_distributed2d_t * eigenvectors)
 {
-#ifdef BML_USE_SCALAPACK
     REAL_T *typed_eigenvalues = (REAL_T *) eigenvalues;
     // distributed2d format uses a row block distribution
     char order = 'R';
@@ -288,11 +298,195 @@
                         A->M / A->npcols, sequential);
         bml_deallocate(&zmat);
     }
-    // transpose eigenvectors to have them stored row-major
-    bml_transpose(eigenvectors->matrix);
+    return;
+}
+#endif
+
+#ifdef BML_USE_ELPA
+// Yu, V.; Moussa, J.; Kus, P.; Marek, A.; Messmer, P.; Yoon, M.; Lederer, H.; Blum, V.,
+// "GPU-Acceleration of the ELPA2 Distributed Eigensolver for Dense Symmetric
+// and Hermitian Eigenproblems", Computer Physics Communications, 262, 2021
+void TYPED_FUNC(
+    bml_diagonalize_distributed2d_elpa) (
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors)
+{
+    char order = 'R';
+    int np_rows = A->nprows;
+    int np_cols = A->npcols;
+    int my_prow = A->myprow;
+    int my_pcol = A->mypcol;
+    int my_blacs_ctxt = Csys2blacs_handle(A->comm);
+    Cblacs_gridinit(&my_blacs_ctxt, &order, np_rows, np_cols);
+    Cblacs_gridinfo(my_blacs_ctxt, &np_rows, &np_cols, &my_prow, &my_pcol);
+
+    int na = A->N;
+    int na_rows = na / np_rows;
+    int na_cols = na / np_cols;
+    if (na_rows * np_rows != na)
+    {
+        LOG_ERROR("Number of MPI tasks/row should divide matrix size\n");
+    }
+    //printf("Matrix size: %d\n", na);
+    //printf("Number of MPI process rows: %d\n", np_rows);
+    //printf("Number of MPI process cols: %d\n", np_cols);
+
+    if (elpa_init(ELPA_API_VERSION) != ELPA_OK)
+    {
+        LOG_ERROR("Error: ELPA API version not supported");
+    }
+
+    int error_elpa;
+    elpa_t handle = elpa_allocate(&error_elpa);
+    /* Set parameters */
+    elpa_set(handle, "na", (int) na, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "nev", (int) na, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "local_nrows", (int) na_rows, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    // use one block/MPI task, so set the block size to the number of local rows
+    elpa_set(handle, "nblk", (int) na_rows, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(A->comm)),
+             &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "process_row", (int) my_prow, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "process_col", (int) my_pcol, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    int success = elpa_setup(handle);
+    assert(success == ELPA_OK);
+
+    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "gpu", 1, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    bml_matrix_t *Alocal = A->matrix;
+
+    bml_matrix_t *zmat = NULL;
+    bml_matrix_t *amat = NULL;
+    if (bml_get_type(Alocal) == dense)
+    {
+        amat = bml_copy_new(Alocal);
+        zmat = eigenvectors->matrix;
+    }
+    else
+    {
+        LOG_INFO("WARNING: convert local matrices to dense...\n");
+        // convert local matrix to dense
+        amat = bml_convert(Alocal, dense, A->matrix_precision,
+                           -1, sequential);
+        zmat = bml_convert(eigenvectors->matrix, dense, A->matrix_precision,
+                           -1, sequential);
+    }
+
+    // transpose to satisfy the column-major ELPA convention
+    // (the global matrix is assumed symmetric, so no communication is needed)
+    if (A->myprow != A->mypcol)
+        bml_transpose(amat);
+
+    REAL_T *z = bml_get_data_ptr(zmat);
+    assert(z != NULL);
+    REAL_T *a = bml_get_data_ptr(amat);
+    assert(a != NULL);
+
+    /* Solve the eigenvalue problem */
+    // interface: see elpa_generic.h
+    // handle  handle of the ELPA object, which defines the problem
+    // a       device pointer to matrix a in GPU memory
+    // ev      on return: pointer to eigenvalues in GPU memory
+    // q       on return: pointer to eigenvectors in GPU memory
+    // error   on return: the error code, which can be queried with elpa_strerr()
+    LOG_DEBUG("Call ELPA eigensolver");
+#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
+    float *ev;
+    magma_int_t ret = magma_smalloc(&ev, na);
+#else
+    double *ev;
+    magma_int_t ret = magma_dmalloc(&ev, na);
+#endif
+    assert(ret == MAGMA_SUCCESS);
+#if defined(SINGLE_REAL)
+    elpa_eigenvectors_float(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(DOUBLE_REAL)
+    elpa_eigenvectors_double(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(SINGLE_COMPLEX)
+    elpa_eigenvectors_float_complex(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(DOUBLE_COMPLEX)
+    elpa_eigenvectors_double_complex(handle, a, ev, z, &error_elpa);
+#endif
+
+    assert(error_elpa == ELPA_OK);
+    // copy eigenvalues to CPU
+    LOG_DEBUG("copy eigenvalues to CPU");
+#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
+    float *tmp = malloc(na * sizeof(float));
+    magma_sgetvector(na, ev, 1, tmp, 1, bml_queue());
+#endif
+#if defined(DOUBLE_REAL) || defined(DOUBLE_COMPLEX)
+    double *tmp = malloc(na * sizeof(double));
+    magma_dgetvector(na, ev, 1, tmp, 1, bml_queue());
+#endif
+    magma_queue_sync(bml_queue());
+
+    REAL_T *ev_ptr = eigenvalues;
+    for (int i = 0; i < A->N; i++)
+        ev_ptr[i] = (REAL_T) tmp[i];
+    free(tmp);
+
+    magma_free(ev);
+
+    bml_deallocate(&amat);
+    if (bml_get_type(Alocal) != dense)
+    {
+        bml_deallocate(&(eigenvectors->matrix));
+        eigenvectors->matrix =
+            bml_convert(zmat, bml_get_type(Alocal), A->matrix_precision,
+                        A->M / A->npcols, sequential);
+        bml_deallocate(&zmat);
+    }
+
+    elpa_deallocate(handle, &error_elpa);
+}
+#endif
+
+void TYPED_FUNC(
+    bml_diagonalize_distributed2d) (
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors)
+{
+#ifdef BML_USE_ELPA
+    TYPED_FUNC(bml_diagonalize_distributed2d_elpa) (A, eigenvalues,
+                                                    eigenvectors);
+#else
+#ifdef BML_USE_SCALAPACK
+    TYPED_FUNC(bml_diagonalize_distributed2d_scalapack) (A, eigenvalues,
+                                                         eigenvectors);
 #else
     LOG_ERROR
         ("Build with ScaLAPACK required for distributed2d diagonalization\n");
 #endif
-    return;
+#endif
+    // transpose eigenvectors to have them stored row-major
+    bml_transpose(eigenvectors->matrix);
 }
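Read on its own, the ELPA sequence above amounts to: init, allocate a handle, describe the matrix layout, elpa_setup, pick a solver, solve, deallocate. Below is a minimal host-memory sketch of that lifecycle, assuming a CPU build of ELPA (the patch's GPU path hands MAGMA device pointers to the same calls) and the same one-block-per-task layout; the grid variables and the column-major local arrays a and q are assumed prepared by the caller as in the patch.

#include <elpa/elpa.h>
#include <mpi.h>
#include <assert.h>

/* Solve a dense symmetric eigenproblem with ELPA, host pointers only.
 * a:  local na_rows x na_cols block of A (column-major)
 * ev: all na eigenvalues (replicated on every task)
 * q:  local block of the eigenvectors */
static void
solve_with_elpa(
    double *a, double *ev, double *q,
    int na, int na_rows, int na_cols,
    MPI_Comm comm, int my_prow, int my_pcol)
{
    int err;
    if (elpa_init(ELPA_API_VERSION) != ELPA_OK)
        return;
    elpa_t handle = elpa_allocate(&err);
    elpa_set(handle, "na", na, &err);
    elpa_set(handle, "nev", na, &err);
    elpa_set(handle, "local_nrows", na_rows, &err);
    elpa_set(handle, "local_ncols", na_cols, &err);
    elpa_set(handle, "nblk", na_rows, &err);    /* one block per task */
    elpa_set(handle, "mpi_comm_parent", (int) MPI_Comm_c2f(comm), &err);
    elpa_set(handle, "process_row", my_prow, &err);
    elpa_set(handle, "process_col", my_pcol, &err);
    int ok = elpa_setup(handle);
    assert(ok == ELPA_OK);
    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &err);
    elpa_eigenvectors_double(handle, a, ev, q, &err);
    assert(err == ELPA_OK);
    elpa_deallocate(handle, &err);
    elpa_uninit(&err);
}

The patch itself never calls elpa_uninit (the library stays initialized across diagonalizations); it appears here only to make the standalone sketch self-contained.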
diff --git a/tests/C-tests/bml_test.c b/tests/C-tests/bml_test.c
index 98b917627..999bf5648 100644
--- a/tests/C-tests/bml_test.c
+++ b/tests/C-tests/bml_test.c
@@ -187,7 +187,7 @@ main(
     MPI_Init(&argc, &argv);
     bml_init(MPI_COMM_WORLD);
     printf("with MPI\n");
-    int N = 14;
+    int N = 64;
 #else
     bml_init();
     int N = 13;
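One note on this last hunk: raising the MPI test size from 14 to 64 presumably keeps N evenly divisible by the process grids exercised in testing, which the ELPA path above requires (it errors out when np_rows does not divide N). A small self-contained illustration of that constraint; the 4x4 grid is only an example case.

#include <stdio.h>

/* Mirror of the check in bml_diagonalize_distributed2d_elpa: with one
 * ELPA block per MPI task, np_rows must divide the global size na
 * (the na / np_cols local columns are implicitly assumed even, too). */
static int
layout_ok(int na, int np_rows)
{
    int na_rows = na / np_rows;
    return na_rows * np_rows == na;
}

int
main(void)
{
    printf("N=14 on a 4x4 grid: %s\n", layout_ok(14, 4) ? "ok" : "invalid");
    printf("N=64 on a 4x4 grid: %s\n", layout_ok(64, 4) ? "ok" : "invalid");
    return 0;
}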