Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Offload mods for crusher development #633

Merged
merged 1 commit into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 86 additions & 78 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.10)
cmake_minimum_required(VERSION 3.17)
nicolasbock marked this conversation as resolved.
Show resolved Hide resolved

message(STATUS "CMake version ${CMAKE_VERSION}")

Expand Down Expand Up @@ -80,6 +80,11 @@ set(Clang_C_FLAGS_DEBUG -O0 -g -save-temps -std=c99)
set(Clang_C_FLAGS_RELEASE -O2 -g -std=c99 -DNDEBUG)
set(Clang_C_FLAGS_RELWITHDEBINFO -O2 -g -std=c99 -DNDEBUG)

if(DEFINED ENV{CRAYPE_VERSION})
jeanlucf22 marked this conversation as resolved.
Show resolved Hide resolved
set(EXTRA_CFLAGS "-DCRAY_SDK ${EXTRA_CFLAGS}")
set(EXTRA_FFLAGS "-ef -DCRAY_SDK ${EXTRA_FFLAGS}")
endif()

set(MALLOC_ALIGNMENT 64 CACHE INT "Alignment boundary for memory allocations")
add_definitions(-DMALLOC_ALIGNMENT=${MALLOC_ALIGNMENT})

Expand All @@ -92,6 +97,7 @@ if(CMAKE_BUILD_TYPE_UPPER AND NOT DONT_TOUCH_MY_FLAGS)
if(CMAKE_C_COMPILER_ID STREQUAL "GNU"
OR CMAKE_C_COMPILER_ID STREQUAL "Intel"
OR CMAKE_C_COMPILER_ID STREQUAL "Clang"
OR CMAKE_C_COMPILER_ID STREQUAL "Cray"
OR CMAKE_C_COMPILER_ID STREQUAL "XL")
set(CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}
${${CMAKE_C_COMPILER_ID}_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}})
Expand All @@ -116,6 +122,8 @@ if(CMAKE_BUILD_TYPE_UPPER AND NOT DONT_TOUCH_MY_FLAGS)
if(CMAKE_Fortran_FLAGS STREQUAL "")
if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU"
OR CMAKE_Fortran_COMPILER_ID STREQUAL "Intel"
OR CMAKE_Fortran_COMPILER_ID STREQUAL "Clang"
OR CMAKE_Fortran_COMPILER_ID STREQUAL "Cray"
OR CMAKE_Fortran_COMPILER_ID STREQUAL "XL")
set(CMAKE_Fortran_FLAGS_${CMAKE_BUILD_TYPE_UPPER}
${${CMAKE_Fortran_COMPILER_ID}_Fortran_FLAGS_${CMAKE_BUILD_TYPE_UPPER}})
Expand Down Expand Up @@ -205,20 +213,50 @@ if(BML_MPI)
add_definitions(-DDO_MPI)
endif()

option(BML_OMP_OFFLOAD "Compile with OpenMP GPU Offload support" FALSE)
include(FindCUDAToolkit)
if(CUDAToolkit_FOUND)
message("Found CUDAToolkit: ${CUDAToolkit_TARGET_DIR}")
endif()

find_package(hip QUIET)

set(BML_OMP_OFFLOAD OFF CACHE BOOL "Compile with OpenMP GPU Offload support")
set(BML_OFFLOAD_ARCH "NVIDIA" CACHE STRING "Offload architecture")
set_property(CACHE BML_OFFLOAD_ARCH PROPERTY STRINGS "NVIDIA" "AMD")
if(BML_OFFLOAD_ARCH STREQUAL "NVIDIA" AND NOT CUDAToolkit_FOUND)
message("Offload to NVIDIA selected but CUDA not found. Disabling offload.")
set(BML_OMP_OFFLOAD OFF CACHE BOOL "Compile with OpenMP GPU Offload support" FORCE)
endif()
if(BML_OFFLOAD_ARCH STREQUAL "AMD" AND NOT hip_FOUND)
message("Offload to AMD selected but HIP not found. Disabling offload.")
set(BML_OMP_OFFLOAD OFF CACHE BOOL "Compile with OpenMP GPU Offload support" FORCE)
endif()

if(BML_OMP_OFFLOAD)
message(STATUS "Will build with OpenMP GPU Offload support")

add_definitions(-DUSE_OMP_OFFLOAD)

find_package(OpenMP 4.5 REQUIRED)

include(CheckCXXCompilerFlag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(OpenMP_CXX_OFFLOAD_FLAG "-foffload=-misa=sm_35 -foffload=nvptx-none -foffload='-lm'")
if(BML_OFFLOAD_ARCH STREQUAL "NVIDIA")
set(OpenMP_CXX_OFFLOAD_FLAG "-foffload=-misa=sm_35 -foffload=nvptx-none -foffload='-lm'")
endif()
elseif(MAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(OpenMP_CXX_OFFLOAD_FLAG "-foffload='-lm'")
elseif(MAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(OpenMP_CXX_OFFLOAD_FLAG "-fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
if(BML_OFFLOAD_ARCH STREQUAL "NVIDIA")
set(OpenMP_CXX_OFFLOAD_FLAG "-O2 -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDAToolkit_TARGET_DIR}")
elseif(BML_OFFLOAD_ARCH STREQUAL "AMD")
if(DEFINED ENV{CRAYPE_VERSION})
set(OpenMP_CXX_OFFLOAD_FLAG "")
else()
message("BML_OFFLOAD_ARCH is AMD but Cray compiler not detected")
message(" ==> No offload flags added")
endif()
endif()
elseif(MAKE_CXX_COMPILER_ID STREQUAL "XL")
set(OpenMP_CXX_OFFLOAD_FLAG "-qsmp=omp -qoffload")
endif()
Expand All @@ -232,60 +270,50 @@ if(BML_OMP_OFFLOAD)

# list(APPEND LANGUAGES CUDA)

set(CUDA_FIND_REQUIRED TRUE)
include(FindCUDA)
if(BML_OFFLOAD_ARCH EQUAL "NVIDIA")

set(GPU_ARCH "sm_60" CACHE STRING "LAMMPS GPU CUDA SM architecture")
set_property(CACHE GPU_ARCH PROPERTY STRINGS sm_50 sm_60 sm_70)
set(GPU_ARCH "sm_60" CACHE STRING "LAMMPS GPU CUDA SM architecture")
set_property(CACHE GPU_ARCH PROPERTY STRINGS sm_50 sm_60 sm_70)

message("CUDA libraries: ${CUDA_LIBRARIES}")
message("CUDA CUBLAS libraries: ${CUDA_CUBLAS_LIBRARIES}")
endif()
endif()

set(BML_CUSPARSE FALSE CACHE BOOL "Whether to compile with CuSPARSE support")
set(BML_CUDA FALSE CACHE BOOL "Whether to compile with CUDA support")
if(BML_CUDA OR BML_CUSOLVER OR BML_CUSPARSE)
include(FindCUDA)
endif()
if (CUDA_FOUND)
message("CUDA CUBLAS libraries: ${CUDA_CUBLAS_LIBRARIES}")
message("CUDA libraries: ${CUDA_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${CUDA_LIBRARIES})
include_directories(${CUDA_INCLUDE_DIRS})
endif()

if(BML_CUSPARSE)
message(STATUS "Will use cuSPARSE")
add_definitions(-DBML_USE_CUSPARSE)
message("cuSPARSE library: ${CUDA_cusparse_LIBRARY}")
list(APPEND LINK_LIBRARIES ${CUDA_cusparse_LIBRARY})
endif()

if(BML_MAGMA)
find_package(hip)
endif()

if(BML_ROCSOLVER)
find_package(rocblas REQUIRED)
find_package(rocsolver REQUIRED)
endif()

if (hip_FOUND)
message("HIP libraries: ${hip_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${hip_LIBRARIES})
include_directories(${hip_INCLUDE_DIRS})
endif()

if (rocblas_FOUND)
message("ROCBLAS libraries: ${rocblas_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${rocblas_LIBRARIES})
include_directories(${rocblas_INCLUDE_DIRS})
endif()

if (rocsolver_FOUND)
message("ROCSOLVER libraries: ${rocsolver_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${rocsolver_LIBRARIES})
include_directories(${rocsolver_INCLUDE_DIRS})
set(BML_MAGMA FALSE CACHE BOOL "Whether to use MAGMA library")
set(BML_USE_DEVICE (BML_CUDA OR BML_MAGMA OR BML_CUSOLVER OR BML_ROCSOLVER OR BML_CUSPARSE OR BML_ROCSPARSE))
if(BML_USE_DEVICE)
if (CUDAToolkit_FOUND)
message("CUDA CUBLAS libraries: ${CUDA_cublas_LIBRARY}")
list(APPEND LINK_LIBRARIES ${CUDA_cublas_LIBRARY})
list(APPEND LINK_LIBRARIES ${CUDA_cudart_LIBRARY})
include_directories(${CUDAToolkit_INCLUDE_DIR})
if(BML_CUSOLVER)
message(STATUS "Will use cuSOLVER")
message("CUDA cuSOLVER library: ${CUDA_cusolver_LIBRARY}")
endif()
if(BML_CUSPARSE)
message(STATUS "Will use cuSPARSE")
add_definitions(-DBML_USE_CUSPARSE)
message("CUDA cuSPARSE library: ${CUDA_cusparse_LIBRARY}")
list(APPEND LINK_LIBRARIES ${CUDA_cusparse_LIBRARY})
endif()
elseif(hip_FOUND)
message("HIP libraries: ${hip_LIBRARIES}")
include_directories(${hip_INCLUDE_DIRS})
if(BML_ROCSOLVER)
message(STATUS "Will use rocSOLVER")
find_package(rocblas REQUIRED)
message("HIP rocBLAS libraries: ${rocblas_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${rocblas_LIBRARIES})
include_directories(${rocblas_INCLUDE_DIRS})

find_package(rocsolver REQUIRED)
message("HIP rocSOLVER libraries: ${rocsolver_LIBRARIES}")
include_directories(${rocsolver_INCLUDE_DIRS})
endif()
list(APPEND LINK_LIBRARIES ${hip_LIBRARIES})
endif()
endif()

set(BML_OPENCL FALSE CACHE BOOL "Whether to compiler with OpenCL support")
Expand All @@ -295,7 +323,6 @@ endif()

SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

set(BML_MAGMA FALSE CACHE BOOL "Whether to use MAGMA library")

if(BML_MAGMA)
FIND_PACKAGE(MAGMA)
Expand All @@ -310,29 +337,16 @@ if (MAGMA_FOUND)
" MAGMA_LIBRARIES: ${MAGMA_LIBRARIES}"
)
include_directories(${MAGMA_INCLUDE_DIRS})
else()
message(STATUS
"MAGMA not found"
)
endif()

if (MAGMA_FOUND)
if(BML_CUSOLVER)
message(STATUS "Will use cuSOLVER")
jeanlucf22 marked this conversation as resolved.
Show resolved Hide resolved
add_definitions(-DBML_USE_CUSOLVER)
endif()
if(BML_ROCSOLVER)
message(STATUS "Will use rocSOLVER")
jeanlucf22 marked this conversation as resolved.
Show resolved Hide resolved
add_definitions(-DBML_USE_ROCSOLVER)
endif()
endif()

if (CUBLAS_FOUND)
list(APPEND LINK_LIBRARIES ${CUBLAS_LIBRARIES})
endif()

if (CUDA_FOUND)
list(APPEND LINK_LIBRARIES ${CUDA_LIBRARIES})
else()
message(STATUS
"MAGMA not found"
)
endif()

set(BML_SCALAPACK FALSE CACHE BOOL "Whether to use ScaLAPACK library")
Expand Down Expand Up @@ -381,14 +395,6 @@ if(NOT (LAPACK_FOUND OR NOBLAS OR MAGMA_FOUND))
endif()
endif()
message(STATUS "LAPACK_LIBRARIES=${LAPACK_LIBRARIES}")
if(MAGMA_FOUND)
if(BML_CUSOLVER)
message(STATUS "cuSOLVER: ${CUDA_cusolver_LIBRARY}")
endif()
if(BML_ROCSOLVER)
message(STATUS "rocSOLVER: ${ROCM_rocsolver_LIBRARIES}")
endif()
endif()

#check existence of required math and linear algebra functions
#in third party libraries
Expand Down Expand Up @@ -497,6 +503,8 @@ if(MAGMA_FOUND)
list(APPEND LINK_LIBRARIES ${MAGMA_LIBRARIES})
if(BML_CUSOLVER)
list(APPEND LINK_LIBRARIES ${CUDA_cusolver_LIBRARY})
elseif(BML_ROCSOLVER)
list(APPEND LINK_LIBRARIES ${rocsolver_LIBRARIES})
endif()
endif()
if(BML_SCALAPACK)
Expand Down
3 changes: 3 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ EOF
echo "EXTRA_FFLAGS Extra fortran flags (default is '${EXTRA_FFLAGS}')"
echo "EXTRA_LINK_FLAGS Add extra link flags (default is '${EXTRA_LINK_FLAGS}')"
echo "BML_OMP_OFFLOAD {yes,no} (default is ${BML_OMP_OFFLOAD})"
echo "BML_OFFLOAD_ARCH {NVIDIA, AMD} (default is ${BML_OFFLOAD_ARCH})"
echo "GPU_ARCH GPU architecture (default is ${GPU_ARCH})"
echo "BML_CUDA Build with CUDA (default is ${BML_CUDA})"
echo "BML_MAGMA Build with MAGMA (default is ${BML_MAGMA})"
Expand Down Expand Up @@ -127,6 +128,7 @@ set_defaults() {
: ${FORTRAN_FLAGS:=}
: ${EXTRA_LINK_FLAGS:=}
: ${BML_OMP_OFFLOAD:=no}
: ${BML_OFFLOAD_ARCH:=NVIDIA}
: ${GPU_ARCH:=}
: ${BML_CUDA:=no}
: ${BML_MAGMA:=no}
Expand Down Expand Up @@ -234,6 +236,7 @@ configure() {
${EXTRA_LINK_FLAGS:+-DBML_LINK_FLAGS="${EXTRA_LINK_FLAGS}"} \
-DCMAKE_VERBOSE_MAKEFILE=${VERBOSE_MAKEFILE} \
-DBML_OMP_OFFLOAD="${BML_OMP_OFFLOAD}" \
-DBML_OFFLOAD_ARCH="${BML_OFFLOAD_ARCH}" \
-DGPU_ARCH="${GPU_ARCH}" \
-DBML_CUDA="${BML_CUDA}" \
-DBML_MAGMA="${BML_MAGMA}" \
Expand Down
4 changes: 2 additions & 2 deletions scripts/build_crusher_offload_cce.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ export FC=${FC:=ftn}
export CXX=${CXX:=CC}
export BLA_VENDOR=${BLA_VENDOR:=OpenBLAS}
export BML_OPENMP=${BML_OPENMP:=yes}
export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=yes}
export BML_OFFLOAD_ARCH=${BML_OFFLOAD_ARCH:=AMD}
export BUILD_DIR=${BUILD_DIR:="${MY_PATH}/build"}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_TESTING=${BML_TESTING:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}

./build.sh configure

Expand Down
8 changes: 4 additions & 4 deletions scripts/build_spock_offload_cce.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ MY_PATH=$(pwd)
export CC=${CC:=cc}
export FC=${FC:=ftn}
export CXX=${CXX:=CC}
export BLAS_VENDOR=${BLAS_VENDOR:=Auto}
export BLA_VENDOR=${BLA_VENDOR:=OpenBLAS}
export BML_OPENMP=${BML_OPENMP:=yes}
export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=yes}
export BML_OFFLOAD_ARCH=${BML_OFFLOAD_ARCH:=AMD}
export BML_COMPLEX=${BML_COMPLEX:=no}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_MAGMA=${BML_MAGMA:=no}
export MAGMA_ROOT=${MAGMA_HOME}
export BML_TESTING=${BML_TESTING:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${LIBSCI_BASE_DIR}/cray/9.0/x86_64/lib"}

./build.sh configure

Expand Down
4 changes: 2 additions & 2 deletions scripts/setenv_spock_offload.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

module load craype-accel-amd-gfx908
module load rocm/4.1.0
module load rocm
module load cmake
export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
module load openblas

22 changes: 11 additions & 11 deletions src/C-interface/ellpack/bml_add_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ void TYPED_FUNC(
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks];
int *all_ix, *all_jx;
REAL_T *all_x;

memset(all_ix, 0, N * num_chunks * sizeof(int));
memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
all_ix = calloc(N * num_chunks, sizeof(int));
all_jx = calloc(N * num_chunks, sizeof(int));
all_x = calloc(N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks])

Expand Down Expand Up @@ -248,13 +248,13 @@ double TYPED_FUNC(
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks], all_y[N * num_chunks];
int *all_ix, *all_jx;
REAL_T *all_x, *all_y;

memset(all_ix, 0, N * num_chunks * sizeof(int));
memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
memset(all_y, 0.0, N * num_chunks * sizeof(REAL_T));
all_ix = calloc(N * num_chunks, sizeof(int));
all_jx = calloc(N * num_chunks, sizeof(int));
all_x = calloc(N * num_chunks, sizeof(REAL_T));
all_y = calloc(N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) map(tofrom:trnorm)

Expand Down
20 changes: 10 additions & 10 deletions src/C-interface/ellpack/bml_multiply_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,12 @@ void *TYPED_FUNC(
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[X_N * num_chunks], all_jx[X_N * num_chunks];
REAL_T all_x[X_N * num_chunks];
int *all_ix, *all_jx;
REAL_T *all_x;

memset(all_ix, 0, X_N * num_chunks * sizeof(int));
memset(all_jx, 0, X_N * num_chunks * sizeof(int));
memset(all_x, 0.0, X_N * num_chunks * sizeof(REAL_T));
all_ix = calloc(X_N * num_chunks, sizeof(int));
all_jx = calloc(X_N * num_chunks, sizeof(int));
all_x = calloc(X_N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:X_N*num_chunks],all_jx[0:X_N*num_chunks],all_x[0:X_N*num_chunks])

Expand Down Expand Up @@ -374,12 +374,12 @@ void TYPED_FUNC(
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[C_N * num_chunks], all_jx[C_N * num_chunks];
REAL_T all_x[C_N * num_chunks];
int *all_ix, *all_jx;
REAL_T *all_x;

memset(all_ix, 0, C_N * num_chunks * sizeof(int));
memset(all_jx, 0, C_N * num_chunks * sizeof(int));
memset(all_x, 0.0, C_N * num_chunks * sizeof(REAL_T));
all_ix = calloc(C_N * num_chunks, sizeof(int));
all_jx = calloc(C_N * num_chunks, sizeof(int));
all_x = calloc(C_N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks])

Expand Down