Skip to content

Commit

Permalink
Enable use of IBM XL compilers with OpenMP offload support
Browse files: browse the repository at this point in the history
  • Loading branch information
oseikuffuor1 authored and nicolasbock committed Apr 7, 2022
1 parent 986ac31 commit 12d4b86
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 46 deletions.
23 changes: 21 additions & 2 deletions src/C-interface/ellblock/bml_add_ellblock_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ void TYPED_FUNC(

int NB = A->NB;
int MB = A->MB;
int ix[NB], jx[NB];

int *A_nnzb = A->nnzb;
int *A_indexb = A->indexb;
Expand All @@ -55,12 +54,16 @@ void TYPED_FUNC(

int *bsize = A->bsize;

REAL_T *x_ptr[NB];
REAL_T **A_ptr_value = (REAL_T **) A->ptr_value;
REAL_T **B_ptr_value = (REAL_T **) B->ptr_value;

#if !(defined(__IBMC__) || defined(__ibmxl__))
int ix[NB], jx[NB];
REAL_T *x_ptr[NB];

memset(ix, 0, NB * sizeof(int));
memset(jx, 0, NB * sizeof(int));
#endif

int maxbsize = 0;
for (int ib = 0; ib < NB; ib++)
Expand All @@ -74,13 +77,29 @@ void TYPED_FUNC(
REAL_T *x_ptr_storage = calloc(maxbsize2 * NB * nthreads, sizeof(REAL_T));

char xptrset = 0;
#if defined(__IBMC__) || defined(__ibmxl__)
#pragma omp parallel for \
shared(A_indexb, A_ptr_value, A_nnzb) \
shared(B_indexb, B_ptr_value, B_nnzb) \
shared(x_ptr_storage) \
firstprivate(xptrset)
#else
#pragma omp parallel for \
shared(A_indexb, A_ptr_value, A_nnzb) \
shared(B_indexb, B_ptr_value, B_nnzb) \
shared(x_ptr_storage) \
firstprivate(ix, jx, x_ptr, xptrset)
#endif
for (int ib = 0; ib < NB; ib++)
{

#if defined(__IBMC__) || defined(__ibmxl__)
int ix[NB], jx[NB];
REAL_T *x_ptr[NB];

memset(ix, 0, NB * sizeof(int));
#endif

if (!xptrset)
{
#ifdef _OPENMP
Expand Down
45 changes: 37 additions & 8 deletions src/C-interface/ellblock/bml_multiply_ellblock_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -255,18 +255,20 @@ void *TYPED_FUNC(
int *X2_indexb = X2->indexb;
int *X2_nnzb = X2->nnzb;

int ix[NB], jx[NB];
REAL_T *x_ptr[NB];

REAL_T traceX = 0.0;
REAL_T traceX2 = 0.0;
REAL_T **X_ptr_value = (REAL_T **) X->ptr_value;
REAL_T **X2_ptr_value = (REAL_T **) X2->ptr_value;

double *trace = bml_allocate_memory(sizeof(double) * 2);

#if !(defined(__IBMC__) || defined(__ibmxl__))
int ix[NB], jx[NB];
REAL_T *x_ptr[NB];

memset(ix, 0, NB * sizeof(int));
memset(jx, 0, NB * sizeof(int));
#endif

int maxbsize = 0;
for (int ib = 0; ib < NB; ib++)
Expand Down Expand Up @@ -297,14 +299,26 @@ void *TYPED_FUNC(
TYPED_FUNC(bml_multiply_block4),
TYPED_FUNC(bml_multiply_block5), TYPED_FUNC(bml_multiply_block6)};


#if defined(__IBMC__) || defined(__ibmxl__)
#pragma omp parallel for \
firstprivate(xptrset) \
reduction(+: traceX, traceX2)
#else
#pragma omp parallel for \
firstprivate(ix,jx, x_ptr, xptrset) \
reduction(+: traceX, traceX2)

#endif
//loop over row blocks
for (int ib = 0; ib < NB; ib++)
{

#if defined(__IBMC__) || defined(__ibmxl__)
int ix[NB], jx[NB];
REAL_T *x_ptr[NB];

memset(ix, 0, NB * sizeof(int));
#endif

int lb = 0;
if (!xptrset)
{
Expand Down Expand Up @@ -456,15 +470,17 @@ void TYPED_FUNC(
int *C_nnzb = C->nnzb;
int *C_indexb = C->indexb;

int ix[NB], jx[NB];
REAL_T *x_ptr[NB];

REAL_T **A_ptr_value = (REAL_T **) A->ptr_value;
REAL_T **B_ptr_value = (REAL_T **) B->ptr_value;
REAL_T **C_ptr_value = (REAL_T **) C->ptr_value;

#if !(defined(__IBMC__) || defined(__ibmxl__))
int ix[NB], jx[NB];
REAL_T *x_ptr[NB];

memset(ix, 0, NB * sizeof(int));
memset(jx, 0, NB * sizeof(int));
#endif

int maxbsize = 0;
for (int ib = 0; ib < NB; ib++)
Expand Down Expand Up @@ -495,11 +511,24 @@ void TYPED_FUNC(
TYPED_FUNC(bml_multiply_block5), TYPED_FUNC(bml_multiply_block6)};

//loop over row blocks
#if defined(__IBMC__) || defined(__ibmxl__)
#pragma omp parallel for \
firstprivate( xptrset)
#else
#pragma omp parallel for \
firstprivate(ix, jx, x_ptr, xptrset)
#endif

for (int ib = 0; ib < NB; ib++)
{

#if defined(__IBMC__) || defined(__ibmxl__)
int ix[NB], jx[NB];
REAL_T *x_ptr[NB];

memset(ix, 0, NB * sizeof(int));
#endif

int lb = 0;
if (!xptrset)
{
Expand Down
37 changes: 19 additions & 18 deletions src/C-interface/ellpack/bml_add_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ void TYPED_FUNC(
memset(x, 0.0, N * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
Expand All @@ -81,7 +81,7 @@ void TYPED_FUNC(
#endif

#if defined (USE_OMP_OFFLOAD)
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
Expand Down Expand Up @@ -116,19 +116,20 @@ void TYPED_FUNC(
firstprivate(ix, jx, x)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
{
#else
for (int i = rowMin; i < rowMax; i++)
#endif
{

#if defined(__IBMC__) || defined(__ibmxl__)
int ix[N], jx[N];
REAL_T x[N];

memset(ix, 0, N * sizeof(int));
#endif

#endif
int l = 0;
if (alpha > (double) 0.0 || alpha < (double) 0.0)
for (int jp = 0; jp < A_nnz[i]; jp++)
Expand Down Expand Up @@ -175,7 +176,7 @@ void TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
}
#endif
}
Expand Down Expand Up @@ -234,7 +235,7 @@ double TYPED_FUNC(
memset(y, 0.0, N * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
Expand All @@ -250,7 +251,7 @@ double TYPED_FUNC(
#endif

#if defined (USE_OMP_OFFLOAD)
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
Expand Down Expand Up @@ -291,11 +292,11 @@ double TYPED_FUNC(
reduction(+:trnorm)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
{
#else
for (int i = rowMin; i < rowMax; i++)
#endif
{

#if defined(__IBMC__) || defined(__ibmxl__)
Expand All @@ -305,7 +306,7 @@ double TYPED_FUNC(

memset(ix, 0, N * sizeof(int));
#endif

#endif
int l = 0;
for (int jp = 0; jp < A_nnz[i]; jp++)
{
Expand Down Expand Up @@ -359,7 +360,7 @@ double TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
}
#endif

Expand Down Expand Up @@ -397,7 +398,7 @@ void TYPED_FUNC(
memset(x, 0.0, A_M * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, N);

int all_jx[N * num_chunks];
Expand All @@ -411,7 +412,7 @@ void TYPED_FUNC(
#endif

#if defined (USE_OMP_OFFLOAD)
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)
#pragma omp teams distribute parallel for \
shared(N, A_M) \
shared(A_index, A_value, A_nnz)
Expand Down Expand Up @@ -441,18 +442,18 @@ void TYPED_FUNC(
firstprivate(jx, x)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
for (int i = chunk; i < N; i = i + num_chunks)
{
#else
for (int i = 0; i < N; i++)
#endif
{

#if defined(__IBMC__) || defined(__ibmxl__)
int jx[A_M];
REAL_T x[A_M];
#endif

#endif
int l = 0;
int diag = -1;

Expand Down Expand Up @@ -495,7 +496,7 @@ void TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
}
#endif
}
Expand Down
12 changes: 6 additions & 6 deletions src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void TYPED_FUNC(
memset(x, 0.0, C->N * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[C_N * num_chunks], all_jx[C_N * num_chunks];
Expand All @@ -89,7 +89,7 @@ void TYPED_FUNC(
#endif

#if defined (USE_OMP_OFFLOAD)
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#if defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__)
#pragma omp teams distribute parallel for \
shared(A_N, A_M, A_nnz, A_index, A_value) \
shared(A_localRowMin, A_localRowMax) \
Expand Down Expand Up @@ -129,19 +129,19 @@ void TYPED_FUNC(
#endif
#endif
//for (int i = 0; i < A_N; i++)
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
{
#else
for (int i = rowMin; i < rowMax; i++)
#endif
{
#if defined(__IBMC__) || defined(__ibmxl__)
int ix[C_N], jx[C_N];
REAL_T x[C_N];

memset(ix, 0, C_N * sizeof(int));
#endif

#endif
int l = 0;
for (int jp = 0; jp < A_nnz[i]; jp++)
{
Expand Down Expand Up @@ -198,7 +198,7 @@ void TYPED_FUNC(
}
C_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK) || defined(__IBMC__) || defined(__ibmxl__))
}
#endif
}
Loading

0 comments on commit 12d4b86

Please sign in to comment.