From 69794af912ee563aa11ed1f5e49ff9bdd5962559 Mon Sep 17 00:00:00 2001
From: Jean-Luc Fattebert
Date: Fri, 9 Sep 2022 15:19:43 -0400
Subject: [PATCH] Couple with ELPA

---
 CMakeLists.txt                                     |  19 ++
 build.sh                                           |   6 +
 cmake/FindELPA.cmake                               |  15 ++
 scripts/build_olcf_summit_gnu_elpa_essl.sh         |  51 +++++
 .../bml_diagonalize_distributed2d.h                |  12 ++
 .../bml_diagonalize_distributed2d_typed.c          | 204 +++++++++++++++++-
 tests/C-tests/bml_test.c                           |   2 +-
 7 files changed, 303 insertions(+), 6 deletions(-)
 create mode 100644 cmake/FindELPA.cmake
 create mode 100755 scripts/build_olcf_summit_gnu_elpa_essl.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3c1fee275..a2f2492fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -382,6 +382,25 @@ if(BML_MAGMA)
   endif()
 endif()
 
+set(BML_ELPA FALSE CACHE BOOL "Whether to use ELPA library")
+if(BML_ELPA)
+  message(STATUS "Search for ELPA in directory ${ELPA_DIR}\n")
+  find_package(ELPA REQUIRED)
+
+  if(${ELPA_FOUND})
+    message(STATUS "ELPA was found:\n"
+      "   ELPA_INCLUDE_DIRS: ${ELPA_INCLUDE_DIRS}\n"
+      "   ELPA_LIBRARY_DIRS: ${ELPA_LIBRARY_DIRS}\n"
+      "   ELPA_LIBRARIES: ${ELPA_LIBRARIES}"
+    )
+    add_definitions(-DBML_USE_ELPA)
+    include_directories(${ELPA_INCLUDE_DIRS})
+    link_directories(${CUDAToolkit_LIBRARY_DIR})
+    link_directories(${ELPA_LIBRARY_DIRS})
+    list(APPEND LINK_LIBRARIES ${ELPA_LIBRARIES})
+  endif()
+endif()
+
 set(BML_SCALAPACK FALSE CACHE BOOL "Whether to use ScaLAPACK library")
 if(BML_SCALAPACK)
   add_definitions(-DBML_USE_SCALAPACK)
diff --git a/build.sh b/build.sh
index ac01ab6e7..b3b336cc8 100755
--- a/build.sh
+++ b/build.sh
@@ -94,6 +94,8 @@ EOF
     echo "BML_XSMM            Build with XSMM (default is ${BML_XSMM})"
     echo "BML_SCALAPACK       Build with SCALAPACK (default is ${BML_SCALAPACK})"
     echo "SCALAPACK_LIBRARIES ScaLapack libraries (default is '${SCALAPACK_LIBRARIES}')"
+    echo "BML_ELPA            Build with ELPA (default is ${BML_ELPA})"
+    echo "ELPA_DIR            ELPA directory (default is ${ELPA_DIR})"
     echo "BML_ELLBLOCK_MEMPOOL Use ellblock memory pool (default is ${BML_ELLBLOCK_MEMPOOL}"
     echo "CUDA_TOOLKIT_ROOT_DIR Path to CUDA dir (default is '${CUDA_TOOLKIT_ROOT_DIR}')"
     echo "INTEL_OPT           {yes, no} (default is ${INTEL_OPT})"
@@ -125,6 +127,8 @@ set_defaults() {
     : ${BLAS_LIBRARIES:=}
     : ${LAPACK_LIBRARIES:=}
     : ${SCALAPACK_LIBRARIES:=}
+    : ${BML_ELPA:=no}
+    : ${ELPA_DIR:=}
     : ${BML_TESTING:=yes}
     : ${BML_VALGRIND:=no}
     : ${BML_COVERAGE:=no}
@@ -214,6 +218,7 @@ configure() {
         -DCMAKE_C_COMPILER="${CC}" \
         -DCMAKE_CXX_COMPILER="${CXX}" \
         -DCMAKE_Fortran_COMPILER="${FC}" \
+        -DCMAKE_PREFIX_PATH="${ELPA_DIR}" \
         ${CMAKE_C_FLAGS:+-DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}"} \
         ${CMAKE_CXX_FLAGS:+-DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}"} \
         ${CMAKE_Fortran_FLAGS:+-DCMAKE_Fortran_FLAGS="${CMAKE_Fortran_FLAGS}"} \
@@ -221,6 +226,7 @@ configure() {
         -DBLAS_LIBRARIES="${BLAS_LIBRARIES}" \
         -DLAPACK_LIBRARIES="${LAPACK_LIBRARIES}" \
         -DSCALAPACK_LIBRARIES="${SCALAPACK_LIBRARIES}" \
+        -DBML_ELPA="${BML_ELPA}" \
         -DBML_OPENMP="${BML_OPENMP}" \
         -DMKL_GPU="${MKL_GPU}" \
         -DBML_MPI="${BML_MPI}" \
diff --git a/cmake/FindELPA.cmake b/cmake/FindELPA.cmake
new file mode 100644
index 000000000..1ec2bace2
--- /dev/null
+++ b/cmake/FindELPA.cmake
@@ -0,0 +1,15 @@
+# - Find the ELPA library
+#
+# Usage:
+#   find_package(ELPA [REQUIRED] [QUIET])
+#
+# It sets the following variables:
+#   ELPA_FOUND ... true if elpa is found on the system
+#   ELPA_LIBRARY_DIRS ... full path to elpa library
+#   ELPA_INCLUDE_DIRS ... elpa include directory
+#   ELPA_LIBRARIES ... elpa libraries
+
+
+find_package(PkgConfig REQUIRED)
+pkg_check_modules(ELPA REQUIRED elpa IMPORTED_TARGET)
+
diff --git a/scripts/build_olcf_summit_gnu_elpa_essl.sh b/scripts/build_olcf_summit_gnu_elpa_essl.sh
new file mode 100755
index 000000000..182fade14
--- /dev/null
+++ b/scripts/build_olcf_summit_gnu_elpa_essl.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+module load cmake
+module load cuda
+module load gcc/10.2.0
+module load essl
+module load magma
+module load netlib-scalapack
+module load netlib-lapack
+
+rm -r build
+rm -r install
+
+MY_PATH=$(pwd)
+
+# get jsrun with full path
+JSRUN=$(which jsrun)
+echo ${JSRUN}
+
+export MAGMA_ROOT=${MAGMA_ROOT:="${OLCF_MAGMA_ROOT}"}
+export CC=${CC:=mpicc}
+export FC=${FC:=mpif90}
+export CXX=${CXX:=mpiCC}
+export BML_OPENMP=${BML_OPENMP:=yes}
+export BML_MPI=${BML_MPI:=yes}
+export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=no}
+export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
+export BML_TESTING=${BML_TESTING:=yes}
+export BML_MAGMA=${BML_MAGMA:=yes}
+export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Debug}
+
+# set BLAS explicitly, otherwise cmake will pick the serial version of essl
+export BLAS_LIBRARIES=${BLAS_LIBRARIES:="$OLCF_ESSL_ROOT/lib64/libesslsmp.so"}
+# since essl does not contain all the lapack functions needed, we still need lapack
+export LAPACK_LIBRARIES=${LAPACK_LIBRARIES:="$OLCF_NETLIB_LAPACK_ROOT/lib64/liblapack.so"}
+export BML_SCALAPACK=${BML_SCALAPACK:=yes}
+export SCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES:="-L$OLCF_NETLIB_SCALAPACK_ROOT/lib -lscalapack"}
+
+export BML_CUDA=${BML_CUDA:=yes}
+export BML_ELPA=${BML_ELPA:=yes}
+export ELPA_DIR=${ELPA_DIR:=/ccs/proj/csc304/elpa}
+export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-lgfortran"}
+
+# use jsrun to run tests on a compute node
+export BML_NONMPI_PRECOMMAND=${BML_NONMPI_PRECOMMAND:=${JSRUN}}
+export BML_NONMPI_PRECOMMAND_ARGS=${BML_NONMPI_PRECOMMAND_ARGS:="-n1;-a1;-g1;-c7;--smpiargs=off"}
+
+export BML_MPIEXEC_EXECUTABLE=${BML_MPIEXEC_EXECUTABLE:=${JSRUN}}
+export BML_MPIEXEC_NUMPROCS_FLAG=${BML_MPIEXEC_NUMPROCS_FLAG:="-n"}
+export BML_MPIEXEC_PREFLAGS=${BML_MPIEXEC_PREFLAGS:="-a1;-c4;-bpacked:2;-g1"}
+
+./build.sh install
diff --git a/src/C-interface/distributed2d/bml_diagonalize_distributed2d.h b/src/C-interface/distributed2d/bml_diagonalize_distributed2d.h
index e3f14e41f..a3055e2cd 100644
--- a/src/C-interface/distributed2d/bml_diagonalize_distributed2d.h
+++ b/src/C-interface/distributed2d/bml_diagonalize_distributed2d.h
@@ -28,4 +28,16 @@ void bml_diagonalize_distributed2d_double_complex(
     void *eigenvalues,
     bml_matrix_distributed2d_t * eigenvectors);
 
+#ifdef BML_USE_ELPA
+void bml_diagonalize_distributed2d_elpa_single_real(
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors);
+
+void bml_diagonalize_distributed2d_elpa_double_real(
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors);
+#endif
+
 #endif
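For orientation, here is a minimal sketch, not part of the patch, of how the double_real entry point declared above could be driven directly. The wrapper name diagonalize_with_elpa and the caller-side setup are hypothetical; in practice the generic bml_diagonalize_distributed2d dispatcher added further below is the intended interface.

#ifdef BML_USE_ELPA
#include <stdlib.h>
#include "bml_diagonalize_distributed2d.h"

/* Hypothetical driver: A and eigenvectors are N x N double_real
 * distributed2d matrices created elsewhere by the caller. */
void
diagonalize_with_elpa(
    bml_matrix_distributed2d_t * A,
    bml_matrix_distributed2d_t * eigenvectors,
    int N)
{
    /* eigenvalues come back in a caller-owned host array */
    double *eigenvalues = calloc(N, sizeof(double));
    bml_diagonalize_distributed2d_elpa_double_real(A, eigenvalues,
                                                   eigenvectors);
    /* ... use eigenvalues and eigenvectors ... */
    free(eigenvalues);
}
#endif

Note that the eigenvalues buffer is a plain host array even in GPU builds: as the typed implementation below shows, the spectrum is copied back from device memory before being returned.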
diff --git a/src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c b/src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c
index 209765e54..6d008c8d9 100644
--- a/src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c
+++ b/src/C-interface/distributed2d/bml_diagonalize_distributed2d_typed.c
@@ -14,6 +14,16 @@
 #include "../bml_transpose.h"
 #include "../bml_copy.h"
 
+#ifdef BML_USE_ELPA
+#include <elpa/elpa.h>
+#include <assert.h>
+#include "../dense/bml_allocate_dense.h"
+#ifdef BML_USE_MAGMA
+#include "../../typed.h"
+#include "magma_v2.h"
+#endif
+#endif
+
 #include <complex.h>
 #include <mpi.h>
@@ -139,13 +149,13 @@ void PZHEEVD(
  * \param eigenvalues Eigenvalues of A
  * \param eigenvectors Eigenvectors of A
  */
+#ifdef BML_USE_SCALAPACK
 void TYPED_FUNC(
-    bml_diagonalize_distributed2d) (
+    bml_diagonalize_distributed2d_scalapack) (
     bml_matrix_distributed2d_t * A,
     void *eigenvalues,
     bml_matrix_distributed2d_t * eigenvectors)
 {
-#ifdef BML_USE_SCALAPACK
     REAL_T *typed_eigenvalues = (REAL_T *) eigenvalues;
     // distributed2d format uses a row block distribution
     char order = 'R';
@@ -288,11 +298,195 @@
                         A->M / A->npcols, sequential);
         bml_deallocate(&zmat);
     }
-    // transpose eigenvectors to have them stored row-major
-    bml_transpose(eigenvectors->matrix);
+    return;
+}
+#endif
+
+#ifdef BML_USE_ELPA
+// Yu, V.; Moussa, J.; Kus, P.; Marek, A.; Messmer, P.; Yoon, M.; Lederer, H.; Blum, V.,
+// "GPU-Acceleration of the ELPA2 Distributed Eigensolver for Dense Symmetric
+// and Hermitian Eigenproblems", Computer Physics Communications, 262, 2021
+void TYPED_FUNC(
+    bml_diagonalize_distributed2d_elpa) (
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors)
+{
+    char order = 'R';
+    int np_rows = A->nprows;
+    int np_cols = A->npcols;
+    int my_prow = A->myprow;
+    int my_pcol = A->mypcol;
+    int my_blacs_ctxt = Csys2blacs_handle(A->comm);
+    Cblacs_gridinit(&my_blacs_ctxt, &order, np_rows, np_cols);
+    Cblacs_gridinfo(my_blacs_ctxt, &np_rows, &np_cols, &my_prow, &my_pcol);
+
+    int na = A->N;
+    int na_rows = na / np_rows;
+    int na_cols = na / np_cols;
+    if (na_rows * np_rows != na)
+    {
+        LOG_ERROR("Number of MPI tasks/row should divide matrix size\n");
+    }
+    //printf("Matrix size: %d\n", na);
+    //printf("Number of MPI process rows: %d\n", np_rows);
+    //printf("Number of MPI process cols: %d\n", np_cols);
+
+    if (elpa_init(ELPA_API_VERSION) != ELPA_OK)
+    {
+        LOG_ERROR("Error: ELPA API version not supported");
+    }
+
+    int error_elpa;
+    elpa_t handle = elpa_allocate(&error_elpa);
+    /* Set parameters */
+    elpa_set(handle, "na", (int) na, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "nev", (int) na, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "local_nrows", (int) na_rows, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "local_ncols", (int) na_cols, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    // use one block/MPI task, so set the block size to the number of local rows
+    elpa_set(handle, "nblk", (int) na_rows, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "mpi_comm_parent", (int) (MPI_Comm_c2f(A->comm)),
+             &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "process_row", (int) my_prow, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "process_col", (int) my_pcol, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    int success = elpa_setup(handle);
+    assert(success == ELPA_OK);
+
+    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    elpa_set(handle, "gpu", 1, &error_elpa);
+    assert(error_elpa == ELPA_OK);
+
+    bml_matrix_t *Alocal = A->matrix;
+
+    bml_matrix_t *zmat = NULL;
+    bml_matrix_t *amat = NULL;
+    if (bml_get_type(Alocal) == dense)
+    {
+        amat = bml_copy_new(Alocal);
+        zmat = eigenvectors->matrix;
+    }
+    else
+    {
+        LOG_INFO("WARNING: convert local matrices to dense...\n");
+        // convert local matrix to dense
+        amat = bml_convert(Alocal, dense, A->matrix_precision,
+                           -1, sequential);
+        zmat = bml_convert(eigenvectors->matrix, dense, A->matrix_precision,
+                           -1, sequential);
+    }
+
+    // transpose to satisfy the column-major ELPA convention
+    // (the global matrix is assumed symmetric, so no communication is needed)
+    if (A->myprow != A->mypcol)
+        bml_transpose(amat);
+
+    REAL_T *z = bml_get_data_ptr(zmat);
+    assert(z != NULL);
+    REAL_T *a = bml_get_data_ptr(amat);
+    assert(a != NULL);
+
+    /* Solve the eigenvalue problem */
+    // interface: see elpa_generic.h
+    // handle  handle of the ELPA object, which defines the problem
+    // a       device pointer to matrix a in GPU memory
+    // ev      on return: pointer to eigenvalues in GPU memory
+    // q       on return: pointer to eigenvectors in GPU memory
+    // error   on return: the error code, which can be queried with elpa_strerr()
+    LOG_DEBUG("Call ELPA eigensolver");
+#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
+    float *ev;
+    magma_int_t ret = magma_smalloc(&ev, na);
+#else
+    double *ev;
+    magma_int_t ret = magma_dmalloc(&ev, na);
+#endif
+    assert(ret == MAGMA_SUCCESS);
+#if defined(SINGLE_REAL)
+    elpa_eigenvectors_float(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(DOUBLE_REAL)
+    elpa_eigenvectors_double(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(SINGLE_COMPLEX)
+    elpa_eigenvectors_float_complex(handle, a, ev, z, &error_elpa);
+#endif
+#if defined(DOUBLE_COMPLEX)
+    elpa_eigenvectors_double_complex(handle, a, ev, z, &error_elpa);
+#endif
+
+    assert(error_elpa == ELPA_OK);
+    // copy eigenvalues to CPU
+    LOG_DEBUG("copy eigenvalues to CPU");
+#if defined(SINGLE_REAL) || defined(SINGLE_COMPLEX)
+    float *tmp = malloc(na * sizeof(float));
+    magma_sgetvector(na, ev, 1, tmp, 1, bml_queue());
+#endif
+#if defined(DOUBLE_REAL) || defined(DOUBLE_COMPLEX)
+    double *tmp = malloc(na * sizeof(double));
+    magma_dgetvector(na, ev, 1, tmp, 1, bml_queue());
+#endif
+    magma_queue_sync(bml_queue());
+
+    REAL_T *ev_ptr = eigenvalues;
+    for (int i = 0; i < A->N; i++)
+        ev_ptr[i] = (REAL_T) tmp[i];
+    free(tmp);
+
+    magma_free(ev);
+
+    bml_deallocate(&amat);
+    if (bml_get_type(Alocal) != dense)
+    {
+        bml_deallocate(&(eigenvectors->matrix));
+        eigenvectors->matrix =
+            bml_convert(zmat, bml_get_type(Alocal), A->matrix_precision,
+                        A->M / A->npcols, sequential);
+        bml_deallocate(&zmat);
+    }
+
+    elpa_deallocate(handle, &error_elpa);
+}
+#endif
+
+void TYPED_FUNC(
+    bml_diagonalize_distributed2d) (
+    bml_matrix_distributed2d_t * A,
+    void *eigenvalues,
+    bml_matrix_distributed2d_t * eigenvectors)
+{
+#ifdef BML_USE_ELPA
+    TYPED_FUNC(bml_diagonalize_distributed2d_elpa) (A, eigenvalues,
+                                                    eigenvectors);
+#else
+#ifdef BML_USE_SCALAPACK
+    TYPED_FUNC(bml_diagonalize_distributed2d_scalapack) (A, eigenvalues,
+                                                         eigenvectors);
 #else
     LOG_ERROR
         ("Build with ScaLAPACK required for distributed2d diagonalization\n");
 #endif
-    return;
+#endif
+    // transpose eigenvectors to have them stored row-major
+    bml_transpose(eigenvectors->matrix);
 }
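Read on its own, the ELPA sequence above amounts to: init, allocate a handle, describe the matrix layout, elpa_setup, pick a solver, solve, deallocate. Below is a minimal host-memory sketch of that lifecycle, assuming a CPU build of ELPA (the patch's GPU path hands MAGMA device pointers to the same calls) and the same one-block-per-task layout; the grid variables and the column-major local arrays a and q are assumed prepared by the caller as in the patch.

#include <elpa/elpa.h>
#include <mpi.h>
#include <assert.h>

/* Solve a dense symmetric eigenproblem with ELPA, host pointers only.
 * a:  local na_rows x na_cols block of A (column-major)
 * ev: all na eigenvalues (replicated on every task)
 * q:  local block of the eigenvectors */
static void
solve_with_elpa(
    double *a, double *ev, double *q,
    int na, int na_rows, int na_cols,
    MPI_Comm comm, int my_prow, int my_pcol)
{
    int err;
    if (elpa_init(ELPA_API_VERSION) != ELPA_OK)
        return;
    elpa_t handle = elpa_allocate(&err);
    elpa_set(handle, "na", na, &err);
    elpa_set(handle, "nev", na, &err);
    elpa_set(handle, "local_nrows", na_rows, &err);
    elpa_set(handle, "local_ncols", na_cols, &err);
    elpa_set(handle, "nblk", na_rows, &err);    /* one block per task */
    elpa_set(handle, "mpi_comm_parent", (int) MPI_Comm_c2f(comm), &err);
    elpa_set(handle, "process_row", my_prow, &err);
    elpa_set(handle, "process_col", my_pcol, &err);
    int ok = elpa_setup(handle);
    assert(ok == ELPA_OK);
    elpa_set(handle, "solver", ELPA_SOLVER_2STAGE, &err);
    elpa_eigenvectors_double(handle, a, ev, q, &err);
    assert(err == ELPA_OK);
    elpa_deallocate(handle, &err);
    elpa_uninit(&err);
}

The patch itself never calls elpa_uninit (the library stays initialized across diagonalizations); it appears here only to make the standalone sketch self-contained.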
diff --git a/tests/C-tests/bml_test.c b/tests/C-tests/bml_test.c
index 98b917627..999bf5648 100644
--- a/tests/C-tests/bml_test.c
+++ b/tests/C-tests/bml_test.c
@@ -187,7 +187,7 @@ main(
     MPI_Init(&argc, &argv);
     bml_init(MPI_COMM_WORLD);
     printf("with MPI\n");
-    int N = 14;
+    int N = 64;
 #else
     bml_init();
     int N = 13;
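One note on this last hunk: raising the MPI test size from 14 to 64 presumably keeps N evenly divisible by the process grids exercised in testing, which the ELPA path above requires (it errors out when np_rows does not divide N). A small self-contained illustration of that constraint; the 4x4 grid is only an example case.

#include <stdio.h>

/* Mirror of the check in bml_diagonalize_distributed2d_elpa: with one
 * ELPA block per MPI task, np_rows must divide the global size na
 * (the na / np_cols local columns are implicitly assumed even, too). */
static int
layout_ok(int na, int np_rows)
{
    int na_rows = na / np_rows;
    return na_rows * np_rows == na;
}

int
main(void)
{
    printf("N=14 on a 4x4 grid: %s\n", layout_ok(14, 4) ? "ok" : "invalid");
    printf("N=64 on a 4x4 grid: %s\n", layout_ok(64, 4) ? "ok" : "invalid");
    return 0;
}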