Skip to content

Commit

Permalink
Offload mods for crusher development
Browse files Browse the repository at this point in the history
  o Modified CMake scripts:
    - Use BML_OMP_OFFLOAD for both NVIDIA and AMD,
      needed due to commit #8a7df493
    - Use FindCUDAToolkit module instead of depracated FindCUDA
    - Update to CMake 3.17 version, to support FindCUDAToolkit
    - Consolidated the logic for CUDA, HIP, and associated libraries
      for various types of device builds under control of BML_USE_DEVICE
    - Added BML_OFFLOAD_ARCH with options NVIDIA and AMD

  o Change crusher and spock build scripts accordingly

  o Modified offload regions to address bml_multiply_x2()
    fortran test failure (hang)
    - Move temporary working arrays all_ix, all_jx, and all_x from stack to heap
    - This eliminated the hang, although it's not really clear why
    - Similar changes made to other add, multiply offload regions
  • Loading branch information
mewall committed Jun 23, 2022
1 parent 1f5d51d commit dd9b44d
Show file tree
Hide file tree
Showing 7 changed files with 135 additions and 110 deletions.
184 changes: 103 additions & 81 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.10)
cmake_minimum_required(VERSION 3.17)

message(STATUS "CMake version ${CMAKE_VERSION}")

Expand Down Expand Up @@ -76,9 +76,28 @@ set(XL_Fortran_FLAGS_DEBUG -O0 -g)
set(XL_Fortran_FLAGS_RELEASE -O3 -g)
set(XL_Fortran_FLAGS_RELWITHDEBINFO -O3 -g -DNDEBUG)

set(Clang_C_FLAGS_DEBUG -O0 -g -save-temps -std=c99)
set(Clang_C_FLAGS_RELEASE -O2 -g -std=c99 -DNDEBUG)
set(Clang_C_FLAGS_RELWITHDEBINFO -O2 -g -std=c99 -DNDEBUG)
if(DEFINED ENV{CRAYPE_VERSION})
set(Clang_C_FLAGS_DEBUG -O0 -g -save-temps -std=c99 -DCRAY_SDK -DNDEBUG)
set(Clang_C_FLAGS_RELEASE -O2 -g -std=c99 -DCRAY_SDK)
set(Clang_C_FLAGS_RELWITHDEBINFO -O2 -g -std=c99 -DNDEBUG -DCRAY_SDK)
set(Clang_Fortran_FLAGS_DEBUG -ef -DNDEBUG -DCRAY_SDK)
set(Clang_Fortran_FLAGS_RELEASE -ef -DCRAY_SDK)
set(Clang_Fortran_FLAGS_RELWITHDEBINFO -DNDEBUG -ef -DCRAY_SDK)
else()
set(Clang_C_FLAGS_DEBUG -O0 -g -save-temps -std=c99)
set(Clang_C_FLAGS_RELEASE -O2 -g -std=c99 -DNDEBUG)
set(Clang_C_FLAGS_RELWITHDEBINFO -O2 -g -std=c99 -DNDEBUG)
set(Clang_Fortran_FLAGS_DEBUG -DNDEBUG)
set(Clang_Fortran_FLAGS_RELEASE "")
set(Clang_Fortran_FLAGS_RELWITHDEBINFO -DNDEBUG)
endif()

set(Cray_C_FLAGS_DEBUG -O0 -g -save-temps -std=c99 -DCRAY_SDK -DNDEBUG)
set(Cray_C_FLAGS_RELEASE -O2 -g -std=c99 -DCRAY_SDK)
set(Cray_C_FLAGS_RELWITHDEBINFO -O2 -g -std=c99 -DNDEBUG -DCRAY_SDK)
set(Cray_Fortran_FLAGS_DEBUG -ef -DNDEBUG -DCRAY_SDK)
set(Cray_Fortran_FLAGS_RELEASE -ef -DCRAY_SDK)
set(Cray_Fortran_FLAGS_RELWITHDEBINFO -DNDEBUG -ef -DCRAY_SDK)

set(MALLOC_ALIGNMENT 64 CACHE INT "Alignment boundary for memory allocations")
add_definitions(-DMALLOC_ALIGNMENT=${MALLOC_ALIGNMENT})
Expand All @@ -92,6 +111,7 @@ if(CMAKE_BUILD_TYPE_UPPER AND NOT DONT_TOUCH_MY_FLAGS)
if(CMAKE_C_COMPILER_ID STREQUAL "GNU"
OR CMAKE_C_COMPILER_ID STREQUAL "Intel"
OR CMAKE_C_COMPILER_ID STREQUAL "Clang"
OR CMAKE_C_COMPILER_ID STREQUAL "Cray"
OR CMAKE_C_COMPILER_ID STREQUAL "XL")
set(CMAKE_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}
${${CMAKE_C_COMPILER_ID}_C_FLAGS_${CMAKE_BUILD_TYPE_UPPER}})
Expand All @@ -116,6 +136,8 @@ if(CMAKE_BUILD_TYPE_UPPER AND NOT DONT_TOUCH_MY_FLAGS)
if(CMAKE_Fortran_FLAGS STREQUAL "")
if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU"
OR CMAKE_Fortran_COMPILER_ID STREQUAL "Intel"
OR CMAKE_Fortran_COMPILER_ID STREQUAL "Clang"
OR CMAKE_Fortran_COMPILER_ID STREQUAL "Cray"
OR CMAKE_Fortran_COMPILER_ID STREQUAL "XL")
set(CMAKE_Fortran_FLAGS_${CMAKE_BUILD_TYPE_UPPER}
${${CMAKE_Fortran_COMPILER_ID}_Fortran_FLAGS_${CMAKE_BUILD_TYPE_UPPER}})
Expand Down Expand Up @@ -205,20 +227,50 @@ if(BML_MPI)
add_definitions(-DDO_MPI)
endif()

option(BML_OMP_OFFLOAD "Compile with OpenMP GPU Offload support" FALSE)
include(FindCUDAToolkit)
if(CUDAToolkit_FOUND)
message("Found CUDAToolkit: ${CUDAToolkit_TARGET_DIR}")
endif()

find_package(hip QUIET)

set(BML_OMP_OFFLOAD OFF CACHE BOOL "Compile with OpenMP GPU Offload support")
set(BML_OFFLOAD_ARCH "NVIDIA" CACHE STRING "Offload architecture")
set_property(CACHE BML_OFFLOAD_ARCH PROPERTY STRINGS "NVIDIA" "AMD")
if(BML_OFFLOAD_ARCH STREQUAL "NVIDIA" AND NOT CUDAToolkit_FOUND)
message("Offload to NVIDIA selected but CUDA not found. Disabling offload.")
set(BML_OMP_OFFLOAD OFF CACHE BOOL "Compile with OpenMP GPU Offload support" FORCE)
endif()
if(BML_OFFLOAD_ARCH STREQUAL "AMD" AND NOT hip_FOUND)
message("Offload to AMD selected but HIP not found. Disabling offload.")
set(BML_OMP_OFFLOAD OFF CACHE BOOL "Compile with OpenMP GPU Offload support" FORCE)
endif()

if(BML_OMP_OFFLOAD)
message(STATUS "Will build with OpenMP GPU Offload support")

add_definitions(-DUSE_OMP_OFFLOAD)

find_package(OpenMP 4.5 REQUIRED)

include(CheckCXXCompilerFlag)
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(OpenMP_CXX_OFFLOAD_FLAG "-foffload=-misa=sm_35 -foffload=nvptx-none -foffload='-lm'")
if(BML_OFFLOAD_ARCH STREQUAL "NVIDIA")
set(OpenMP_CXX_OFFLOAD_FLAG "-foffload=-misa=sm_35 -foffload=nvptx-none -foffload='-lm'")
endif()
elseif(MAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(OpenMP_CXX_OFFLOAD_FLAG "-foffload='-lm'")
elseif(MAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(OpenMP_CXX_OFFLOAD_FLAG "-fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
if(BML_OFFLOAD_ARCH STREQUAL "NVIDIA")
set(OpenMP_CXX_OFFLOAD_FLAG "-O2 -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=${CUDAToolkit_TARGET_DIR}")
elseif(BML_OFFLOAD_ARCH STREQUAL "AMD")
if(DEFINED ENV{CRAYPE_VERSION})
set(OpenMP_CXX_OFFLOAD_FLAG "")
else()
message("BML_OFFLOAD_ARCH is AMD but Cray compiler not detected")
message(" ==> No offload flags added")
endif()
endif()
elseif(MAKE_CXX_COMPILER_ID STREQUAL "XL")
set(OpenMP_CXX_OFFLOAD_FLAG "-qsmp=omp -qoffload")
endif()
Expand All @@ -232,60 +284,50 @@ if(BML_OMP_OFFLOAD)

# list(APPEND LANGUAGES CUDA)

set(CUDA_FIND_REQUIRED TRUE)
include(FindCUDA)
if(BML_OFFLOAD_ARCH EQUAL "NVIDIA")

set(GPU_ARCH "sm_60" CACHE STRING "LAMMPS GPU CUDA SM architecture")
set_property(CACHE GPU_ARCH PROPERTY STRINGS sm_50 sm_60 sm_70)
set(GPU_ARCH "sm_60" CACHE STRING "LAMMPS GPU CUDA SM architecture")
set_property(CACHE GPU_ARCH PROPERTY STRINGS sm_50 sm_60 sm_70)

message("CUDA libraries: ${CUDA_LIBRARIES}")
message("CUDA CUBLAS libraries: ${CUDA_CUBLAS_LIBRARIES}")
endif()
endif()

set(BML_CUSPARSE FALSE CACHE BOOL "Whether to compile with CuSPARSE support")
set(BML_CUDA FALSE CACHE BOOL "Whether to compile with CUDA support")
if(BML_CUDA OR BML_CUSOLVER OR BML_CUSPARSE)
include(FindCUDA)
endif()
if (CUDA_FOUND)
message("CUDA CUBLAS libraries: ${CUDA_CUBLAS_LIBRARIES}")
message("CUDA libraries: ${CUDA_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${CUDA_LIBRARIES})
include_directories(${CUDA_INCLUDE_DIRS})
endif()

if(BML_CUSPARSE)
message(STATUS "Will use cuSPARSE")
add_definitions(-DBML_USE_CUSPARSE)
message("cuSPARSE library: ${CUDA_cusparse_LIBRARY}")
list(APPEND LINK_LIBRARIES ${CUDA_cusparse_LIBRARY})
endif()

if(BML_MAGMA)
find_package(hip)
endif()

if(BML_ROCSOLVER)
find_package(rocblas REQUIRED)
find_package(rocsolver REQUIRED)
endif()

if (hip_FOUND)
message("HIP libraries: ${hip_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${hip_LIBRARIES})
include_directories(${hip_INCLUDE_DIRS})
endif()

if (rocblas_FOUND)
message("ROCBLAS libraries: ${rocblas_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${rocblas_LIBRARIES})
include_directories(${rocblas_INCLUDE_DIRS})
endif()

if (rocsolver_FOUND)
message("ROCSOLVER libraries: ${rocsolver_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${rocsolver_LIBRARIES})
include_directories(${rocsolver_INCLUDE_DIRS})
set(BML_MAGMA FALSE CACHE BOOL "Whether to use MAGMA library")
set(BML_USE_DEVICE (BML_CUDA OR BML_MAGMA OR BML_CUSOLVER OR BML_ROCSOLVER OR BML_CUSPARSE OR BML_ROCSPARSE))
if(BML_USE_DEVICE)
if (CUDAToolkit_FOUND)
message("CUDA CUBLAS libraries: ${CUDA_cublas_LIBRARY}")
list(APPEND LINK_LIBRARIES ${CUDA_cublas_LIBRARY})
list(APPEND LINK_LIBRARIES ${CUDA_cudart_LIBRARY})
include_directories(${CUDAToolkit_INCLUDE_DIR})
if(BML_CUSOLVER)
message(STATUS "Will use cuSOLVER")
message("CUDA cuSOLVER library: ${CUDA_cusolver_LIBRARY}")
endif()
if(BML_CUSPARSE)
message(STATUS "Will use cuSPARSE")
add_definitions(-DBML_USE_CUSPARSE)
message("CUDA cuSPARSE library: ${CUDA_cusparse_LIBRARY}")
list(APPEND LINK_LIBRARIES ${CUDA_cusparse_LIBRARY})
endif()
elseif(hip_FOUND)
message("HIP libraries: ${hip_LIBRARIES}")
include_directories(${hip_INCLUDE_DIRS})
if(BML_ROCSOLVER)
message(STATUS "Will use rocSOLVER")
find_package(rocblas REQUIRED)
message("HIP rocBLAS libraries: ${rocblas_LIBRARIES}")
list(APPEND LINK_LIBRARIES ${rocblas_LIBRARIES})
include_directories(${rocblas_INCLUDE_DIRS})

find_package(rocsolver REQUIRED)
message("HIP rocSOLVER libraries: ${rocsolver_LIBRARIES}")
include_directories(${rocsolver_INCLUDE_DIRS})
endif()
list(APPEND LINK_LIBRARIES ${hip_LIBRARIES})
endif()
endif()

set(BML_OPENCL FALSE CACHE BOOL "Whether to compiler with OpenCL support")
Expand All @@ -295,7 +337,6 @@ endif()

SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

set(BML_MAGMA FALSE CACHE BOOL "Whether to use MAGMA library")

if(BML_MAGMA)
FIND_PACKAGE(MAGMA)
Expand All @@ -310,29 +351,16 @@ if (MAGMA_FOUND)
" MAGMA_LIBRARIES: ${MAGMA_LIBRARIES}"
)
include_directories(${MAGMA_INCLUDE_DIRS})
else()
message(STATUS
"MAGMA not found"
)
endif()

if (MAGMA_FOUND)
if(BML_CUSOLVER)
message(STATUS "Will use cuSOLVER")
add_definitions(-DBML_USE_CUSOLVER)
endif()
if(BML_ROCSOLVER)
message(STATUS "Will use rocSOLVER")
add_definitions(-DBML_USE_ROCSOLVER)
endif()
endif()

if (CUBLAS_FOUND)
list(APPEND LINK_LIBRARIES ${CUBLAS_LIBRARIES})
endif()

if (CUDA_FOUND)
list(APPEND LINK_LIBRARIES ${CUDA_LIBRARIES})
else()
message(STATUS
"MAGMA not found"
)
endif()

set(BML_SCALAPACK FALSE CACHE BOOL "Whether to use ScaLAPACK library")
Expand Down Expand Up @@ -381,14 +409,6 @@ if(NOT (LAPACK_FOUND OR NOBLAS OR MAGMA_FOUND))
endif()
endif()
message(STATUS "LAPACK_LIBRARIES=${LAPACK_LIBRARIES}")
if(MAGMA_FOUND)
if(BML_CUSOLVER)
message(STATUS "cuSOLVER: ${CUDA_cusolver_LIBRARY}")
endif()
if(BML_ROCSOLVER)
message(STATUS "rocSOLVER: ${ROCM_rocsolver_LIBRARIES}")
endif()
endif()

#check existence of required math and linear algebra functions
#in third party libraries
Expand Down Expand Up @@ -497,6 +517,8 @@ if(MAGMA_FOUND)
list(APPEND LINK_LIBRARIES ${MAGMA_LIBRARIES})
if(BML_CUSOLVER)
list(APPEND LINK_LIBRARIES ${CUDA_cusolver_LIBRARY})
elseif(BML_ROCSOLVER)
list(APPEND LINK_LIBRARIES ${rocsolver_LIBRARIES})
endif()
endif()
if(BML_SCALAPACK)
Expand Down
3 changes: 3 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ EOF
echo "EXTRA_FFLAGS Extra fortran flags (default is '${EXTRA_FFLAGS}')"
echo "EXTRA_LINK_FLAGS Add extra link flags (default is '${EXTRA_LINK_FLAGS}')"
echo "BML_OMP_OFFLOAD {yes,no} (default is ${BML_OMP_OFFLOAD})"
echo "BML_OFFLOAD_ARCH {NVIDIA, AMD} (default is ${BML_OFFLOAD_ARCH})"
echo "GPU_ARCH GPU architecture (default is ${GPU_ARCH})"
echo "BML_CUDA Build with CUDA (default is ${BML_CUDA})"
echo "BML_MAGMA Build with MAGMA (default is ${BML_MAGMA})"
Expand Down Expand Up @@ -127,6 +128,7 @@ set_defaults() {
: ${FORTRAN_FLAGS:=}
: ${EXTRA_LINK_FLAGS:=}
: ${BML_OMP_OFFLOAD:=no}
: ${BML_OFFLOAD_ARCH:=NVIDIA}
: ${GPU_ARCH:=}
: ${BML_CUDA:=no}
: ${BML_MAGMA:=no}
Expand Down Expand Up @@ -234,6 +236,7 @@ configure() {
${EXTRA_LINK_FLAGS:+-DBML_LINK_FLAGS="${EXTRA_LINK_FLAGS}"} \
-DCMAKE_VERBOSE_MAKEFILE=${VERBOSE_MAKEFILE} \
-DBML_OMP_OFFLOAD="${BML_OMP_OFFLOAD}" \
-DBML_OFFLOAD_ARCH="${BML_OFFLOAD_ARCH}" \
-DGPU_ARCH="${GPU_ARCH}" \
-DBML_CUDA="${BML_CUDA}" \
-DBML_MAGMA="${BML_MAGMA}" \
Expand Down
4 changes: 2 additions & 2 deletions scripts/build_crusher_offload_cce.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ export FC=${FC:=ftn}
export CXX=${CXX:=CC}
export BLA_VENDOR=${BLA_VENDOR:=OpenBLAS}
export BML_OPENMP=${BML_OPENMP:=yes}
export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=yes}
export BML_OFFLOAD_ARCH=${BML_OFFLOAD_ARCH:=AMD}
export BUILD_DIR=${BUILD_DIR:="${MY_PATH}/build"}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_TESTING=${BML_TESTING:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}

./build.sh configure

Expand Down
8 changes: 4 additions & 4 deletions scripts/build_spock_offload_cce.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ MY_PATH=$(pwd)
export CC=${CC:=cc}
export FC=${FC:=ftn}
export CXX=${CXX:=CC}
export BLAS_VENDOR=${BLAS_VENDOR:=Auto}
export BLA_VENDOR=${BLA_VENDOR:=OpenBLAS}
export BML_OPENMP=${BML_OPENMP:=yes}
export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=yes}
export BML_OFFLOAD_ARCH=${BML_OFFLOAD_ARCH:=AMD}
export BML_COMPLEX=${BML_COMPLEX:=no}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_MAGMA=${BML_MAGMA:=no}
export MAGMA_ROOT=${MAGMA_HOME}
export BML_TESTING=${BML_TESTING:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${LIBSCI_BASE_DIR}/cray/9.0/x86_64/lib"}

./build.sh configure

Expand Down
4 changes: 2 additions & 2 deletions scripts/setenv_spock_offload.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

module load craype-accel-amd-gfx908
module load rocm/4.1.0
module load rocm
module load cmake
export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
module load openblas

22 changes: 11 additions & 11 deletions src/C-interface/ellpack/bml_add_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ void TYPED_FUNC(
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks];
int *all_ix, *all_jx;
REAL_T *all_x;

memset(all_ix, 0, N * num_chunks * sizeof(int));
memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
all_ix = calloc(N * num_chunks, sizeof(int));
all_jx = calloc(N * num_chunks, sizeof(int));
all_x = calloc(N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks])

Expand Down Expand Up @@ -248,13 +248,13 @@ double TYPED_FUNC(
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks], all_y[N * num_chunks];
int *all_ix, *all_jx;
REAL_T *all_x, *all_y;

memset(all_ix, 0, N * num_chunks * sizeof(int));
memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
memset(all_y, 0.0, N * num_chunks * sizeof(REAL_T));
all_ix = calloc(N * num_chunks, sizeof(int));
all_jx = calloc(N * num_chunks, sizeof(int));
all_x = calloc(N * num_chunks, sizeof(REAL_T));
all_y = calloc(N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) map(tofrom:trnorm)

Expand Down
Loading

0 comments on commit dd9b44d

Please sign in to comment.