Skip to content

Commit

Permalink
Optimize OpenMP offload for rocSPARSE
Browse files Browse the repository at this point in the history
  o Address offload bottlenecks in the following functions (from the Crusher hackathon):
    - bml_set_diagonal_ellpack()
    - bml_threshold_ellpack()
    - bml_trace_ellpack()
  • Loading branch information
mewall authored and cnegre committed Aug 24, 2022
1 parent 3cc4a33 commit a2cbea3
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 26 deletions.
3 changes: 2 additions & 1 deletion src/C-interface/ellpack/bml_setters_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,12 @@ void TYPED_FUNC(
int *A_nnz = A->nnz;

#ifdef USE_OMP_OFFLOAD
#pragma omp target parallel for map(to:diagonal[:A_N])
#pragma omp target teams distribute map(to:diagonal[:A_N])
#endif
for (int i = 0; i < A_N; i++)
{
int ll = 0;
#pragma omp parallel for
for (int j = 0; j < A_nnz[i]; j++)
{
if (A_index[ROWMAJOR(i, j, A_N, A_M)] == i)
Expand Down
39 changes: 17 additions & 22 deletions src/C-interface/ellpack/bml_threshold_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,36 +103,31 @@ void TYPED_FUNC(
int rowMin = A_localRowMin[myRank];
int rowMax = A_localRowMax[myRank];

int rlen;

#ifdef USE_OMP_OFFLOAD
#pragma omp target
#endif
{
#pragma omp target teams distribute
#else
#pragma omp parallel for \
private(rlen) \
shared(A_value,A_index,A_nnz) \
shared(rowMin, rowMax)
//for (int i = 0; i < N; i++)
for (int i = rowMin; i < rowMax; i++)
#endif
//for (int i = 0; i < N; i++)
for (int i = rowMin; i < rowMax; i++)
{
int rlen = 0;
for (int j = 0; j < A_nnz[i]; j++)
{
rlen = 0;
for (int j = 0; j < A_nnz[i]; j++)
if (is_above_threshold(A_value[ROWMAJOR(i, j, N, M)], threshold))
{
if (is_above_threshold
(A_value[ROWMAJOR(i, j, N, M)], threshold))
if (rlen < j)
{
if (rlen < j)
{
A_value[ROWMAJOR(i, rlen, N, M)] =
A_value[ROWMAJOR(i, j, N, M)];
A_index[ROWMAJOR(i, rlen, N, M)] =
A_index[ROWMAJOR(i, j, N, M)];
}
rlen++;
A_value[ROWMAJOR(i, rlen, N, M)] =
A_value[ROWMAJOR(i, j, N, M)];
A_index[ROWMAJOR(i, rlen, N, M)] =
A_index[ROWMAJOR(i, j, N, M)];
}
rlen++;
}
A_nnz[i] = rlen;
}
} // end target region
A_nnz[i] = rlen;
}
}
29 changes: 26 additions & 3 deletions src/C-interface/ellpack/bml_trace_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,33 @@ double TYPED_FUNC(
int myRank = bml_getMyRank();
int rowMin = A_localRowMin[myRank];
int rowMax = A_localRowMax[myRank];
int numrows = rowMax - rowMin;

#ifdef USE_OMP_OFFLOAD
#pragma omp target map(tofrom:trace)
#endif

REAL_T *diag;
diag = (REAL_T *) calloc(numrows, sizeof(REAL_T));
#pragma omp target enter data map(to:diag[:numrows])
#pragma omp target teams distribute
//for (int i = 0; i < N; i++)
for (int i = rowMin; i < rowMax; i++)
{
REAL_T this_trace = 0.0;
#pragma omp parallel for shared(this_trace)
for (int j = 0; j < A_nnz[i]; j++)
{
if (i == A_index[ROWMAJOR(i, j, N, M)])
{
this_trace = A_value[ROWMAJOR(i, j, N, M)];
}
}
diag[i - rowMin] = this_trace;
}
#pragma omp target teams distribute parallel for reduction(+:trace)
for (int i = 0; i < numrows; i++)
trace += diag[i];
#pragma omp target exit data map(delete:diag[:numrows])
free(diag);
#else
#pragma omp parallel for \
shared(A_value, A_index, A_nnz) \
reduction(+:trace)
Expand All @@ -64,6 +86,7 @@ double TYPED_FUNC(
}
}
}
#endif

return (double) REAL_PART(trace);
}
Expand Down

0 comments on commit a2cbea3

Please sign in to comment.