diff --git a/scripts/build_lassen_xl_offload.sh b/scripts/build_lassen_xl_offload.sh new file mode 100755 index 000000000..78e3b9660 --- /dev/null +++ b/scripts/build_lassen_xl_offload.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Make sure all the paths are correct + +source setenv_lassen_offload.sh + +rm -r build +rm -r install + +MY_PATH=$(pwd) + +export CC=${CC:=xlc-gpu} +export FC=${FC:=xlf2003-gpu} +export CXX=${CXX:=xlc++-gpu} +export BLAS_VENDOR=${BLAS_VENDOR:=Auto} +export BML_OPENMP=${BML_OPENMP:=yes} +export BML_OMP_OFFLOAD=${BML_OMP_OFFLOAD:=yes} +export BML_CUSPARSE=${BML_CUSPARSE:=yes} +export BML_COMPLEX=${BML_COMPLEX:=no} +export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"} +export BML_TESTING=${BML_TESTING:=yes} +export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release} +export EXTRA_CFLAGS=${EXTRA_CFLAGS:=""} +export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:=""} +#export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-lm -L/usr/tce/packages/xl/xl-2021.03.11/xlC/16.1.1/lib -libmc++"} +export BLAS_LIBRARIES=${BLAS_LIBRARIES:="-L${LAPACK_DIR} -llapack -lblas"} + +./build.sh configure + +pushd build +make -j +make install +popd + + + + diff --git a/scripts/setenv_lassen_offload.sh b/scripts/setenv_lassen_offload.sh new file mode 100644 index 000000000..012370189 --- /dev/null +++ b/scripts/setenv_lassen_offload.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +#module purge +module load cmake +module load xl/2021.03.11-cuda-11.2.0 +module load cuda/11.2.0 +module load lapack/3.9.0-xl-2020.11.12 +#module load essl +export CUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR="/usr/tce/packages/cuda/cuda-11.2.0"} + diff --git a/src/C-interface/ellblock/bml_add_ellblock_typed.c b/src/C-interface/ellblock/bml_add_ellblock_typed.c index d6014886e..64f0a2be4 100644 --- a/src/C-interface/ellblock/bml_add_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_add_ellblock_typed.c @@ -45,7 +45,6 @@ void TYPED_FUNC( int NB = A->NB; int MB = A->MB; - int ix[NB], jx[NB]; int *A_nnzb = A->nnzb; int *A_indexb = A->indexb; @@ -55,12 +54,16 @@ void TYPED_FUNC( int *bsize = A->bsize; - REAL_T *x_ptr[NB]; REAL_T **A_ptr_value = (REAL_T **) A->ptr_value; REAL_T **B_ptr_value = (REAL_T **) B->ptr_value; +#if !(defined(__IBMC__) || defined(__ibmxl__)) + int ix[NB], jx[NB]; + REAL_T *x_ptr[NB]; + memset(ix, 0, NB * sizeof(int)); memset(jx, 0, NB * sizeof(int)); +#endif int maxbsize = 0; for (int ib = 0; ib < NB; ib++) @@ -74,13 +77,29 @@ void TYPED_FUNC( REAL_T *x_ptr_storage = calloc(maxbsize2 * NB * nthreads, sizeof(REAL_T)); char xptrset = 0; +#if defined(__IBMC__) || defined(__ibmxl__) +#pragma omp parallel for \ + shared(A_indexb, A_ptr_value, A_nnzb) \ + shared(B_indexb, B_ptr_value, B_nnzb) \ + shared(x_ptr_storage) \ + firstprivate(xptrset) +#else #pragma omp parallel for \ shared(A_indexb, A_ptr_value, A_nnzb) \ shared(B_indexb, B_ptr_value, B_nnzb) \ shared(x_ptr_storage) \ firstprivate(ix, jx, x_ptr, xptrset) +#endif for (int ib = 0; ib < NB; ib++) { + +#if defined(__IBMC__) || defined(__ibmxl__) + int ix[NB], jx[NB]; + REAL_T *x_ptr[NB]; + + memset(ix, 0, NB * sizeof(int)); +#endif + if (!xptrset) { #ifdef _OPENMP diff --git a/src/C-interface/ellblock/bml_multiply_ellblock_typed.c b/src/C-interface/ellblock/bml_multiply_ellblock_typed.c index 140eb0e39..aa682f540 100644 --- a/src/C-interface/ellblock/bml_multiply_ellblock_typed.c +++ b/src/C-interface/ellblock/bml_multiply_ellblock_typed.c @@ -255,9 +255,6 @@ void *TYPED_FUNC( int *X2_indexb = X2->indexb; int *X2_nnzb = X2->nnzb; - int ix[NB], jx[NB]; - REAL_T *x_ptr[NB]; - REAL_T traceX = 0.0; REAL_T traceX2 = 0.0; REAL_T **X_ptr_value = (REAL_T **) X->ptr_value; @@ -265,8 +262,13 @@ void *TYPED_FUNC( double *trace = bml_allocate_memory(sizeof(double) * 2); +#if !(defined(__IBMC__) || defined(__ibmxl__)) + int ix[NB], jx[NB]; + REAL_T *x_ptr[NB]; + memset(ix, 0, NB * sizeof(int)); memset(jx, 0, NB * sizeof(int)); +#endif int maxbsize = 0; for (int ib = 0; ib < NB; ib++) @@ -297,14 +299,26 @@ void *TYPED_FUNC( TYPED_FUNC(bml_multiply_block4), TYPED_FUNC(bml_multiply_block5), TYPED_FUNC(bml_multiply_block6)}; - +#if defined(__IBMC__) || defined(__ibmxl__) +#pragma omp parallel for \ + firstprivate(xptrset) \ + reduction(+: traceX, traceX2) +#else #pragma omp parallel for \ firstprivate(ix,jx, x_ptr, xptrset) \ reduction(+: traceX, traceX2) - +#endif //loop over row blocks for (int ib = 0; ib < NB; ib++) { + +#if defined(__IBMC__) || defined(__ibmxl__) + int ix[NB], jx[NB]; + REAL_T *x_ptr[NB]; + + memset(ix, 0, NB * sizeof(int)); +#endif + int lb = 0; if (!xptrset) { @@ -456,15 +470,17 @@ void TYPED_FUNC( int *C_nnzb = C->nnzb; int *C_indexb = C->indexb; - int ix[NB], jx[NB]; - REAL_T *x_ptr[NB]; - REAL_T **A_ptr_value = (REAL_T **) A->ptr_value; REAL_T **B_ptr_value = (REAL_T **) B->ptr_value; REAL_T **C_ptr_value = (REAL_T **) C->ptr_value; +#if !(defined(__IBMC__) || defined(__ibmxl__)) + int ix[NB], jx[NB]; + REAL_T *x_ptr[NB]; + memset(ix, 0, NB * sizeof(int)); memset(jx, 0, NB * sizeof(int)); +#endif int maxbsize = 0; for (int ib = 0; ib < NB; ib++) @@ -495,11 +511,24 @@ void TYPED_FUNC( TYPED_FUNC(bml_multiply_block5), TYPED_FUNC(bml_multiply_block6)}; //loop over row blocks +#if defined(__IBMC__) || defined(__ibmxl__) +#pragma omp parallel for \ + firstprivate( xptrset) +#else #pragma omp parallel for \ firstprivate(ix, jx, x_ptr, xptrset) +#endif for (int ib = 0; ib < NB; ib++) { + +#if defined(__IBMC__) || defined(__ibmxl__) + int ix[NB], jx[NB]; + REAL_T *x_ptr[NB]; + + memset(ix, 0, NB * sizeof(int)); +#endif + int lb = 0; if (!xptrset) { diff --git a/src/C-interface/ellpack/bml_add_ellpack_typed.c b/src/C-interface/ellpack/bml_add_ellpack_typed.c index 41ecf5c4c..11614c564 100644 --- a/src/C-interface/ellpack/bml_add_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_add_ellpack_typed.c @@ -66,7 +66,7 @@ void TYPED_FUNC( memset(x, 0.0, N * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); int all_ix[N * num_chunks], all_jx[N * num_chunks]; @@ -81,7 +81,7 @@ void TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) || defined(CRAY_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__) #pragma omp teams distribute parallel for \ shared(rowMin, rowMax) \ shared(A_index, A_value, A_nnz) \ @@ -116,19 +116,20 @@ void TYPED_FUNC( firstprivate(ix, jx, x) #endif #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) + { #else for (int i = rowMin; i < rowMax; i++) -#endif { + #if defined(__IBMC__) || defined(__ibmxl__) int ix[N], jx[N]; REAL_T x[N]; memset(ix, 0, N * sizeof(int)); #endif - +#endif int l = 0; if (alpha > (double) 0.0 || alpha < (double) 0.0) for (int jp = 0; jp < A_nnz[i]; jp++) @@ -175,7 +176,7 @@ void TYPED_FUNC( } A_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) } #endif } @@ -234,7 +235,7 @@ double TYPED_FUNC( memset(y, 0.0, N * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); int all_ix[N * num_chunks], all_jx[N * num_chunks]; @@ -250,7 +251,7 @@ double TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) || defined(CRAY_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__) #pragma omp teams distribute parallel for \ shared(rowMin, rowMax) \ shared(A_index, A_value, A_nnz) \ @@ -291,11 +292,11 @@ double TYPED_FUNC( reduction(+:trnorm) #endif #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) + { #else for (int i = rowMin; i < rowMax; i++) -#endif { #if defined(__IBMC__) || defined(__ibmxl__) @@ -305,7 +306,7 @@ double TYPED_FUNC( memset(ix, 0, N * sizeof(int)); #endif - +#endif int l = 0; for (int jp = 0; jp < A_nnz[i]; jp++) { @@ -359,7 +360,7 @@ double TYPED_FUNC( } A_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) } #endif @@ -397,7 +398,7 @@ void TYPED_FUNC( memset(x, 0.0, A_M * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, N); int all_jx[N * num_chunks]; @@ -411,7 +412,7 @@ void TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) || defined(CRAY_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__) #pragma omp teams distribute parallel for \ shared(N, A_M) \ shared(A_index, A_value, A_nnz) @@ -441,18 +442,18 @@ void TYPED_FUNC( firstprivate(jx, x) #endif #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) for (int i = chunk; i < N; i = i + num_chunks) + { #else for (int i = 0; i < N; i++) -#endif { #if defined(__IBMC__) || defined(__ibmxl__) int jx[A_M]; REAL_T x[A_M]; #endif - +#endif int l = 0; int diag = -1; @@ -495,7 +496,7 @@ void TYPED_FUNC( } A_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) } #endif } diff --git a/src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c b/src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c index 6911e4a7f..61f3c6fdd 100644 --- a/src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c @@ -74,7 +74,7 @@ void TYPED_FUNC( memset(x, 0.0, C->N * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); int all_ix[C_N * num_chunks], all_jx[C_N * num_chunks]; @@ -89,7 +89,7 @@ void TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) || defined(CRAY_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__) #pragma omp teams distribute parallel for \ shared(A_N, A_M, A_nnz, A_index, A_value) \ shared(A_localRowMin, A_localRowMax) \ @@ -129,11 +129,11 @@ void TYPED_FUNC( #endif #endif //for (int i = 0; i < A_N; i++) -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) + { #else for (int i = rowMin; i < rowMax; i++) -#endif { #if defined(__IBMC__) || defined(__ibmxl__) int ix[C_N], jx[C_N]; @@ -141,7 +141,7 @@ void TYPED_FUNC( memset(ix, 0, C_N * sizeof(int)); #endif - +#endif int l = 0; for (int jp = 0; jp < A_nnz[i]; jp++) { @@ -198,7 +198,7 @@ void TYPED_FUNC( } C_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) } #endif } diff --git a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c index 9fc4873a0..c0d7f4178 100644 --- a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c @@ -137,7 +137,7 @@ void *TYPED_FUNC( memset(x, 0.0, X_N * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); int all_ix[X_N * num_chunks], all_jx[X_N * num_chunks]; @@ -152,7 +152,7 @@ void *TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) || defined(CRAY_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__) #pragma omp teams distribute parallel for \ shared(X_N, X_M, X_index, X_nnz, X_value) \ shared(X2_N, X2_M, X2_index, X2_nnz, X2_value) \ @@ -195,11 +195,11 @@ void *TYPED_FUNC( #endif #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) + { #else for (int i = rowMin; i < rowMax; i++) -#endif { #if defined(__IBMC__) || defined(__ibmxl__) @@ -208,7 +208,7 @@ void *TYPED_FUNC( memset(ix, 0, X_N * sizeof(int)); #endif - +#endif #ifdef INTEL_OPT __assume_aligned(X_nnz, MALLOC_ALIGNMENT); __assume_aligned(X_index, MALLOC_ALIGNMENT); @@ -277,7 +277,7 @@ void *TYPED_FUNC( X2_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) } #endif @@ -339,7 +339,7 @@ void TYPED_FUNC( memset(x, 0.0, C->N * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); int all_ix[C_N * num_chunks], all_jx[C_N * num_chunks]; @@ -354,7 +354,7 @@ void TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) || defined(CRAY_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__) #pragma omp teams distribute parallel for \ shared(A_N, A_M, A_nnz, A_index, A_value) \ shared(A_localRowMin, A_localRowMax) \ @@ -394,11 +394,11 @@ void TYPED_FUNC( #endif #endif //for (int i = 0; i < A_N; i++) -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) + { #else for (int i = rowMin; i < rowMax; i++) -#endif { #if defined(__IBMC__) || defined(__ibmxl__) int ix[C_N], jx[C_N]; @@ -406,7 +406,7 @@ void TYPED_FUNC( memset(ix, 0, C_N * sizeof(int)); #endif - +#endif int l = 0; for (int jp = 0; jp < A_nnz[i]; jp++) { @@ -460,7 +460,7 @@ void TYPED_FUNC( } C_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)) } #endif } diff --git a/src/C-interface/ellpack/bml_normalize_ellpack.c b/src/C-interface/ellpack/bml_normalize_ellpack.c index 1dfc516f1..3b2929f03 100644 --- a/src/C-interface/ellpack/bml_normalize_ellpack.c +++ b/src/C-interface/ellpack/bml_normalize_ellpack.c @@ -30,6 +30,7 @@ bml_accumulate_offdiag_ellpack( LOG_ERROR("unknown precision\n"); break; } + return NULL; } /** Normalize ellpack matrix given gershgorin bounds. diff --git a/tests/Fortran-tests/io_matrix_typed.F90 b/tests/Fortran-tests/io_matrix_typed.F90 index 1bfd6a1e5..56cb5ae2d 100644 --- a/tests/Fortran-tests/io_matrix_typed.F90 +++ b/tests/Fortran-tests/io_matrix_typed.F90 @@ -20,6 +20,10 @@ end function getpid end interface #endif +#ifdef __IBMC__ .OR. __ibmxl__ + integer, external :: getpid +#endif + contains function test_io_matrix_typed(matrix_type, element_kind, element_precision, n, m) &