Commit 69794af: Couple with ELPA

jeanlucf22 committed Nov 4, 2022
1 parent 5599297 commit 69794af

Showing 7 changed files with 303 additions and 6 deletions.
19 changes: 19 additions & 0 deletions CMakeLists.txt
@@ -382,6 +382,25 @@ if(BML_MAGMA)
endif()
endif()

set(BML_ELPA FALSE CACHE BOOL "Whether to use ELPA library")
if(BML_ELPA)
message(STATUS "Searching for ELPA in directory ${ELPA_DIR}\n")
find_package(ELPA REQUIRED)

if(${ELPA_FOUND})
message(STATUS "ELPA was found:\n"
" ELPA_INCLUDE_DIRS: ${ELPA_INCLUDE_DIRS}\n"
" ELPA_LIBRARY_DIRS: ${ELPA_LIBRARY_DIRS}\n"
" ELPA_LIBRARIES: ${ELPA_LIBRARIES}"
)
add_definitions(-DBML_USE_ELPA)
include_directories(${ELPA_INCLUDE_DIRS})
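# assumption: an ELPA build with GPU support also needs the CUDA runtime library path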
link_directories(${CUDAToolkit_LIBRARY_DIR})
link_directories(${ELPA_LIBRARY_DIRS})
list(APPEND LINK_LIBRARIES ${ELPA_LIBRARIES})
endif()
endif()

set(BML_SCALAPACK FALSE CACHE BOOL "Whether to use ScaLAPACK library")
if(BML_SCALAPACK)
add_definitions(-DBML_USE_SCALAPACK)
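With this block in place, ELPA support stays opt-in: configuring with -DBML_ELPA=TRUE and a CMAKE_PREFIX_PATH that points at the ELPA installation defines BML_USE_ELPA and links the ELPA libraries. build.sh below forwards ELPA_DIR as CMAKE_PREFIX_PATH for exactly this purpose.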
6 changes: 6 additions & 0 deletions build.sh
@@ -94,6 +94,8 @@ EOF
echo "BML_XSMM Build with XSMM (default is ${BML_XSMM})"
echo "BML_SCALAPACK Build with SCALAPACK (default is ${BML_SCALAPACK})"
echo "SCALAPACK_LIBRARIES ScaLapack libraries (default is '${SCALAPACK_LIBRARIES}')"
echo "BML_ELPA Build with ELPA (default is ${BML_ELPA})"
echo "ELPA_DIR ELPA directory (default is ${ELPA_DIR})"
echo "BML_ELLBLOCK_MEMPOOL Use ellblock memory pool (default is ${BML_ELLBLOCK_MEMPOOL}"
echo "CUDA_TOOLKIT_ROOT_DIR Path to CUDA dir (default is '${CUDA_TOOLKIT_ROOT_DIR}')"
echo "INTEL_OPT {yes, no} (default is ${INTEL_OPT})"
@@ -125,6 +127,8 @@ set_defaults() {
: ${BLAS_LIBRARIES:=}
: ${LAPACK_LIBRARIES:=}
: ${SCALAPACK_LIBRARIES:=}
: ${BML_ELPA:=no}
: ${ELPA_DIR:=}
: ${BML_TESTING:=yes}
: ${BML_VALGRIND:=no}
: ${BML_COVERAGE:=no}
@@ -214,13 +218,15 @@ configure() {
-DCMAKE_C_COMPILER="${CC}" \
-DCMAKE_CXX_COMPILER="${CXX}" \
-DCMAKE_Fortran_COMPILER="${FC}" \
-DCMAKE_PREFIX_PATH="${ELPA_DIR}" \
${CMAKE_C_FLAGS:+-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}"} \
${CMAKE_CXX_FLAGS:+-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}"} \
${CMAKE_Fortran_FLAGS:+-DCMAKE_Fortran_FLAGS="${CMAKE_Fortran_FLAGS}"} \
-DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
-DBLAS_LIBRARIES="${BLAS_LIBRARIES}" \
-DLAPACK_LIBRARIES="${LAPACK_LIBRARIES}" \
-DSCALAPACK_LIBRARIES="${SCALAPACK_LIBRARIES}" \
-DBML_ELPA="${BML_ELPA}" \
-DBML_OPENMP="${BML_OPENMP}" \
-DMKL_GPU="${MKL_GPU}" \
-DBML_MPI="${BML_MPI}" \
15 changes: 15 additions & 0 deletions cmake/FindELPA.cmake
@@ -0,0 +1,15 @@
# - Find the ELPA library
#
# Usage:
# find_package(ELPA [REQUIRED] [QUIET] )
#
# It sets the following variables:
# ELPA_FOUND ... true if elpa is found on the system
# ELPA_LIBRARY_DIRS ... full path to elpa library
# ELPA_INCLUDE_DIRS ... elpa include directory
# ELPA_LIBRARIES ... elpa libraries


find_package(PkgConfig REQUIRED)
pkg_check_modules(ELPA REQUIRED elpa IMPORTED_TARGET)
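Since FindELPA.cmake delegates discovery entirely to pkg-config, the .pc file installed by ELPA has to be findable, either through PKG_CONFIG_PATH or through CMAKE_PREFIX_PATH, which CMake's PkgConfig module consults by default; this is why build.sh passes ELPA_DIR as -DCMAKE_PREFIX_PATH.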

51 changes: 51 additions & 0 deletions scripts/build_olcf_summit_gnu_elpa_essl.sh
@@ -0,0 +1,51 @@
#!/bin/bash
module load cmake
module load cuda
module load gcc/10.2.0
module load essl
module load magma
module load netlib-scalapack
module load netlib-lapack

rm -r build
rm -r install

MY_PATH=$(pwd)

# get the full path to jsrun
JSRUN=$(which jsrun)
echo ${JSRUN}

export MAGMA_ROOT=${MAGMA_ROOT:="${OLCF_MAGMA_ROOT}"}
export CC=${CC:=mpicc}
export FC=${FC:=mpif90}
export CXX=${CXX:=mpiCC}
export BML_OPENMP=${BML_OPENMP:=yes}
export BML_MPI=${BML_MPI:=yes}
export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=no}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_TESTING=${BML_TESTING:=yes}
export BML_MAGMA=${BML_MAGMA:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Debug}

# set BLAS explicitly, otherwise CMake will pick the serial version of ESSL
export BLAS_LIBRARIES=${BLAS_LIBRARIES:="$OLCF_ESSL_ROOT/lib64/libesslsmp.so"}
# ESSL does not contain all the LAPACK functions needed, so we still need LAPACK
export LAPACK_LIBRARIES=${LAPACK_LIBRARIES:="$OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.so"}
export BML_SCALAPACK=${BML_SCALAPACK:=yes}
export SCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES:="-L$OLCF_NETLIB_SCALAPACK_ROOT/lib -lscalapack"}

export BML_CUDA=${BML_CUDA:=yes}
export BML_ELPA=${BML_ELPA:=yes}
export ELPA_DIR=${ELPA_DIR:=/ccs/proj/csc304/elpa}
export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-lgfortran"}

#use jsrun to run tests on a compute node
export BML_NONMPI_PRECOMMAND=${BML_NONMPI_PRECOMMAND:=${JSRUN}}
export BML_NONMPI_PRECOMMAND_ARGS=${BML_NONMPI_PRECOMMAND_ARGS:="-n1;-a1;-g1;-c7;--smpiargs=off"}

export BML_MPIEXEC_EXECUTABLE=${BML_MPIEXEC_EXECUTABLE:=${JSRUN}}
export BML_MPIEXEC_NUMPROCS_FLAG=${BML_MPIEXEC_NUMPROCS_FLAG:="-n"}
export BML_MPIEXEC_PREFLAGS=${BML_MPIEXEC_PREFLAGS:="-a1;-c4;-bpacked:2;-g1"}

./build.sh install
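As a usage sketch (module versions and project paths as of this commit): run the script from the top of the bml source tree on a Summit login node; it removes any previous build/ and install/ directories, configures through build.sh with MAGMA, ScaLAPACK, CUDA, and ELPA enabled, installs into ./install, and runs the test suite on a compute node through jsrun.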
12 changes: 12 additions & 0 deletions src/C-interface/distributed2d/bml_diagonalize_distributed2d.h
@@ -28,4 +28,16 @@ void bml_diagonalize_distributed2d_double_complex(
void *eigenvalues,
bml_matrix_distributed2d_t * eigenvectors);

#ifdef BML_USE_ELPA
void bml_diagonalize_distributed2d_elpa_single_real(
bml_matrix_distributed2d_t * A,
void *eigenvalues,
bml_matrix_distributed2d_t * eigenvectors);

void bml_diagonalize_distributed2d_elpa_double_real(
bml_matrix_distributed2d_t * A,
void *eigenvalues,
bml_matrix_distributed2d_t * eigenvectors);
#endif

#endif
204 changes: 199 additions & 5 deletions src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c
@@ -14,6 +14,16 @@
#include "../bml_transpose.h"
#include "../bml_copy.h"

#ifdef BML_USE_ELPA
#include <elpa/elpa.h>
#include <elpa/elpa_generic.h>
#include "../dense/bml_allocate_dense.h"
#ifdef BML_USE_MAGMA
#include "../../typed.h"
#include "magma_v2.h"
#endif
#endif

#include <mpi.h>

#include <complex.h>
@@ -139,13 +149,13 @@ void PZHEEVD(
* \param eigenvalues Eigenvalues of A
* \param eigenvectors Eigenvectors of A
*/
#ifdef BML_USE_SCALAPACK
void TYPED_FUNC(
bml_diagonalize_distributed2d) (
bml_diagonalize_distributed2d_scalapack) (
bml_matrix_distributed2d_t * A,
void *eigenvalues,
bml_matrix_distributed2d_t * eigenvectors)
{
#ifdef BML_USE_SCALAPACK
REAL_T *typed_eigenvalues = (REAL_T *) eigenvalues;
// distributed2d format uses a row block distribution
char order = 'R';
@@ -288,11 +298,195 @@ void TYPED_FUNC(
A->M / A->npcols, sequential);
bml_deallocate(&zmat);
}
// transpose eigenvectors to have them stored row-major
bml_transpose(eigenvectors->matrix);
return;
}
#endif

#ifdef BML_USE_ELPA
// Yu, V.; Moussa, J.; Kus, P.; Marek, A.; Messmer, P.; Yoon, M.; Lederer, H.; Blum, V.
// "GPU-Acceleration of the ELPA2 Distributed Eigensolver for Dense Symmetric and Hermitian Eigenproblems",
// Computer Physics Communications, 262, 2021
void TYPED_FUNC(
bml_diagonalize_distributed2d_elpa) (
bml_matrix_distributed2d_t * A,
void *eigenvalues,
bml_matrix_distributed2d_t * eigenvectors)
{
char order = 'R';
int np_rows = A->nprows;
int np_cols = A->npcols;
int my_prow = A->myprow;
int my_pcol = A->mypcol;
int my_blacs_ctxt = Csys2blacs_handle(A->comm);
Cblacs_gridinit(&my_blacs_ctxt, &order, np_rows, np_cols);
Cblacs_gridinfo(my_blacs_ctxt, &np_rows, &np_cols, &my_prow, &my_pcol);

int na = A->N;
int na_rows = na / np_rows;
int na_cols = na / np_cols;
if (na_rows * np_rows != na)
{
LOG_ERROR("Number of MPI process rows must divide the matrix size\n");
}
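// worked example: a 64x64 matrix on a 2x2 process grid gives
// na_rows = na_cols = 32 and the check passes; na = 65 on the same grid
// gives na_rows = 32 and 32 * 2 != 65, which triggers the error above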
//printf("Matrix size: %d\n", na);
//printf("Number of MPI process rows: %d\n", np_rows);
//printf("Number of MPI process cols: %d\n", np_cols);

if (elpa_init(ELPA_API_VERSION) != ELPA_OK)
{
LOG_ERROR("Error: ELPA API version not supported");
}

int error_elpa;
elpa_t handle = elpa_allocate(&error_elpa);
/* Set parameters */
elpa_set(handle, "na", (int) na, &error_elpa);
assert(error_elpa == ELPA_OK);

elpa_set(handle, "nev", (int) na, &error_elpa);
assert(error_elpa == ELPA_OK);

elpa_set(handle, "local_nrows", (int) na_rows, &error_elpa);
assert(error_elpa == ELPA_OK);

elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa);
assert(error_elpa == ELPA_OK);

// use one block per MPI task, so set the block size to the number of local rows
elpa_set(handle, "nblk", (int) na_rows, &error_elpa);
assert(error_elpa == ELPA_OK);

elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(A->comm)),
&error_elpa);
assert(error_elpa == ELPA_OK);

elpa_set(handle, "process_row", (int) my_prow, &error_elpa);
assert(error_elpa == ELPA_OK);

elpa_set(handle, "process_col", (int) my_pcol, &error_elpa);
assert(error_elpa == ELPA_OK);

MPI_Barrier(MPI_COMM_WORLD);

int success = elpa_setup(handle);
assert(success == ELPA_OK);

elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error_elpa);
assert(error_elpa == ELPA_OK);

elpa_set(handle, "gpu", 1, &error_elpa);
assert(error_elpa == ELPA_OK);
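// note: "gpu" is the generic GPU switch of older ELPA releases; newer
// releases also expose backend-specific keys such as "nvidia-gpu", so
// the accepted spelling depends on the installed ELPA version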

bml_matrix_t *Alocal = A->matrix;

bml_matrix_t *zmat = NULL;
bml_matrix_t *amat = NULL;
if (bml_get_type(Alocal) == dense)
{
amat = bml_copy_new(Alocal);
zmat = eigenvectors->matrix;
}
else
{
LOG_INFO("WARNING: converting local matrices to dense...\n");
// convert local matrix to dense
amat = bml_convert(Alocal, dense, A->matrix_precision,
-1, sequential);
zmat = bml_convert(eigenvectors->matrix, dense, A->matrix_precision,
-1, sequential);
}

// transpose the local block to satisfy ELPA's column-major convention:
// the global matrix is assumed symmetric, so the fix-up is purely local
// (no MPI communication needed), and diagonal blocks, being symmetric
// themselves, need no transpose at all
if (A->myprow != A->mypcol)
bml_transpose(amat);

REAL_T *z = bml_get_data_ptr(zmat);
assert(z != NULL);
REAL_T *a = bml_get_data_ptr(amat);
assert(a != NULL);

/* Solve EV problem */
// interface: see elpa_generic.h
// handle   handle of the ELPA object, which defines the problem
// a        device pointer to matrix a in GPU memory
// ev       on return: pointer to the eigenvalues in GPU memory
// q        on return: pointer to the eigenvectors in GPU memory
// error    on return: error code, which can be queried with elpa_strerr()
LOG_DEBUG("Call ELPA eigensolver");
#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
float *ev;
magma_int_t ret = magma_smalloc(&ev, na);
#else
double *ev;
magma_int_t ret = magma_dmalloc(&ev, na);
#endif
assert(ret == MAGMA_SUCCESS);
#if defined(SINGLE_REAL)
elpa_eigenvectors_float(handle, a, ev, z, &error_elpa);
#endif
#if defined(DOUBLE_REAL)
elpa_eigenvectors_double(handle, a, ev, z, &error_elpa);
#endif
#if defined(SINGLE_COMPLEX)
elpa_eigenvectors_float_complex(handle, a, ev, z, &error_elpa);
#endif
#if defined(DOUBLE_COMPLEX)
elpa_eigenvectors_double_complex(handle, a, ev, z, &error_elpa);
#endif

assert(error_elpa == ELPA_OK);
// copy eigenvalues to CPU
LOG_DEBUG("copy eigenvalues to CPU");
#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
float *tmp = malloc(na * sizeof(float));
magma_sgetvector(na, ev, 1, tmp, 1, bml_queue());
#endif
#if defined(DOUBLE_REAL) || defined(DOUBLE_COMPLEX)
double *tmp = malloc(na * sizeof(double));
magma_dgetvector(na, ev, 1, tmp, 1, bml_queue());
#endif
magma_queue_sync(bml_queue());

REAL_T *ev_ptr = eigenvalues;
for (int i = 0; i < A->N; i++)
ev_ptr[i] = (REAL_T) tmp[i];
free(tmp);

magma_free(ev);

bml_deallocate(&amat);
if (bml_get_type(Alocal) != dense)
{
bml_deallocate(&(eigenvectors->matrix));
eigenvectors->matrix =
bml_convert(zmat, bml_get_type(Alocal), A->matrix_precision,
A->M / A->npcols, sequential);
bml_deallocate(&zmat);
}

elpa_deallocate(handle, &error_elpa);
}
#endif

void TYPED_FUNC(
bml_diagonalize_distributed2d) (
bml_matrix_distributed2d_t * A,
void *eigenvalues,
bml_matrix_distributed2d_t * eigenvectors)
{
#ifdef BML_USE_ELPA
TYPED_FUNC(bml_diagonalize_distributed2d_elpa) (A, eigenvalues,
eigenvectors);
#else
#ifdef BML_USE_SCALAPACK
TYPED_FUNC(bml_diagonalize_distributed2d_scalapack) (A, eigenvalues,
eigenvectors);
#else
LOG_ERROR
("Build with ELPA or ScaLAPACK required for distributed2d diagonalization\n");
#endif
return;
#endif
// transpose eigenvectors to have them stored row-major
bml_transpose(eigenvectors->matrix);
}
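To make the calling convention concrete, here is a minimal sketch, not part of the commit, of how an application might drive the new dispatch for double-precision real matrices. The include path and the helper function name are illustrative assumptions; only the bml_diagonalize_distributed2d_double_real entry point is declared by the header above (its double_complex sibling is visible there; the other precisions follow the same pattern).

#include <stdlib.h>

#include "bml_diagonalize_distributed2d.h"      // include path: assumption

// Sketch only: A and eigenvectors are N x N matrices in distributed2d
// format, created beforehand by the application; MPI and bml are assumed
// to be initialized already.
void
diagonalize_sketch(
    bml_matrix_distributed2d_t * A,
    bml_matrix_distributed2d_t * eigenvectors,
    int N)
{
    double *eigenvalues = malloc(N * sizeof(double));

    // dispatches to ELPA when built with BML_USE_ELPA, otherwise to
    // ScaLAPACK when built with BML_USE_SCALAPACK
    bml_diagonalize_distributed2d_double_real(A, eigenvalues, eigenvectors);

    // all N eigenvalues are now in eigenvalues[]; the eigenvectors
    // matrix is stored row-major after the final transpose
    free(eigenvalues);
}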
2 changes: 1 addition & 1 deletion tests/C-tests/bml_test.c
@@ -187,7 +187,7 @@ main(
MPI_Init(&argc, &argv);
bml_init(MPI_COMM_WORLD);
printf("with MPI\n");
int N = 14;
int N = 64;
#else
bml_init();
int N = 13;