Skip to content

Commit

Permalink
Fix OMP offload build memory leaks
Browse files Browse the repository at this point in the history
o Fix leaks seen in running progress benchmarks
  - bml_add_ellpack arrays allocated but not freed
  - bml_multiply_ellpack arrays allocated but not freed
o Fix similar leaks in other subroutines
o Also increase efficiency of a target region in bml_prune_rocsparse_ellpack
  • Loading branch information
mewall authored and jeanlucf22 committed Jul 3, 2023
1 parent 8e3f556 commit 5aa44ec
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 33 deletions.
37 changes: 26 additions & 11 deletions src/C-interface/ellpack/bml_add_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,12 @@ void TYPED_FUNC(
all_jx = calloc(N * num_chunks, sizeof(int));
all_x = calloc(N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks])

#pragma omp target enter data map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks])
#endif

#if defined (USE_OMP_OFFLOAD)
#pragma omp target
#if BML_OFFLOAD_CHUNKS
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
Expand All @@ -115,7 +116,7 @@ void TYPED_FUNC(
x = &all_x[chunk * N];

#else
#pragma omp target teams distribute parallel for \
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
shared(B_index, B_value, B_nnz) \
Expand Down Expand Up @@ -197,6 +198,10 @@ void TYPED_FUNC(
}
#if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS
}
#pragma omp target exit data map(delete:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks])
free(all_ix);
free(all_jx);
free(all_x);
#endif

#endif
Expand Down Expand Up @@ -269,11 +274,12 @@ double TYPED_FUNC(
all_x = calloc(N * num_chunks, sizeof(REAL_T));
all_y = calloc(N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) map(tofrom:trnorm)
#pragma omp target enter data map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks])

#endif

#if defined (USE_OMP_OFFLOAD)
#pragma omp target map(tofrom:trnorm)
#if BML_OFFLOAD_CHUNKS
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
Expand All @@ -292,7 +298,7 @@ double TYPED_FUNC(

#else

#pragma omp target teams distribute parallel for \
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
shared(B_index, B_value, B_nnz) \
Expand Down Expand Up @@ -385,6 +391,11 @@ double TYPED_FUNC(
}
#if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS
}
#pragma omp target exit data map(delete:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks])
free(all_ix);
free(all_jx);
free(all_x);
free(all_y);
#endif

return trnorm;
Expand Down Expand Up @@ -426,18 +437,19 @@ void TYPED_FUNC(
#if BML_OFFLOAD_CHUNKS
int num_chunks = MIN(BML_OFFLOAD_NUM_CHUNKS, N);

int all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks];
int *all_jx;
REAL_T *all_x;

memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
all_jx = calloc(N * num_chunks, sizeof(int));
all_x = calloc(N * num_chunks, sizeof(REAL_T));

#if defined (USE_OMP_OFFLOAD)
#pragma omp target map(to:all_jx[0:N*num_chunks],all_x[0:N*num_chunks])
#pragma omp target enter data map(to:all_jx[0:N*num_chunks],all_x[0:N*num_chunks])
#endif
#endif

#if defined (USE_OMP_OFFLOAD)
#pragma omp target
#if BML_OFFLOAD_CHUNKS
#pragma omp teams distribute parallel for \
shared(N, A_M) \
Expand All @@ -451,7 +463,7 @@ void TYPED_FUNC(
x = &all_x[chunk * N];

#else
#pragma omp target teams distribute parallel for \
#pragma omp teams distribute parallel for \
shared(N, A_M) \
shared(A_index, A_value, A_nnz) \
firstprivate(jx, x)
Expand Down Expand Up @@ -524,6 +536,9 @@ void TYPED_FUNC(
}
#if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS
}
#pragma omp target exit data map(delete:all_jx[0:N*num_chunks],all_x[0:N*num_chunks])
free(all_jx);
free(all_x);
#endif
}

Expand Down
2 changes: 1 addition & 1 deletion src/C-interface/ellpack/bml_allocate_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ void TYPED_FUNC(

REAL_T threshold = (REAL_T)threshold_in;

#pragma omp target map(from:nnz)
#pragma omp target map(to:N) map(from:nnz)
{
nnz = csrRowPtr[N];
}
Expand Down
19 changes: 12 additions & 7 deletions src/C-interface/ellpack/bml_element_multiply_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,18 +78,19 @@ void TYPED_FUNC(
#if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS
int num_chunks = MIN(BML_OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[C_N * num_chunks], all_jx[C_N * num_chunks];
REAL_T all_x[C_N * num_chunks];
int *all_ix, *all_jx;
REAL_T *all_x;

memset(all_ix, 0, C_N * num_chunks * sizeof(int));
memset(all_jx, 0, C_N * num_chunks * sizeof(int));
memset(all_x, 0.0, C_N * num_chunks * sizeof(REAL_T));
all_ix = calloc(C_N * num_chunks, sizeof(int));
all_jx = calloc(C_N * num_chunks, sizeof(int));
all_x = calloc(C_N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks])
#pragma omp target enter data map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks])

#endif

#if defined (USE_OMP_OFFLOAD)
#pragma omp target
#if BML_OFFLOAD_CHUNKS
#pragma omp teams distribute parallel for \
shared(A_N, A_M, A_nnz, A_index, A_value) \
Expand All @@ -106,7 +107,7 @@ void TYPED_FUNC(
x = &all_x[chunk * C_N];

#else
#pragma omp target teams distribute parallel for \
#pragma omp teams distribute parallel for \
shared(A_N, A_M, A_nnz, A_index, A_value) \
shared(A_localRowMin, A_localRowMax) \
shared(B_N, B_M, B_nnz, B_index, B_value) \
Expand Down Expand Up @@ -201,5 +202,9 @@ void TYPED_FUNC(
}
#if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS
}
#pragma omp target exit data map(delete:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks])
free(all_ix);
free(all_jx);
free(all_x);
#endif
}
29 changes: 22 additions & 7 deletions src/C-interface/ellpack/bml_multiply_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,11 +187,12 @@ void *TYPED_FUNC(
all_jx = calloc(X_N * num_chunks, sizeof(int));
all_x = calloc(X_N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:X_N*num_chunks],all_jx[0:X_N*num_chunks],all_x[0:X_N*num_chunks])
#pragma omp target enter data map(to:all_ix[0:X_N*num_chunks],all_jx[0:X_N*num_chunks],all_x[0:X_N*num_chunks])

#endif

#if defined (USE_OMP_OFFLOAD)
#pragma omp target
#if BML_OFFLOAD_CHUNKS
#pragma omp teams distribute parallel for \
shared(X_N, X_M, X_index, X_nnz, X_value) \
Expand All @@ -208,7 +209,7 @@ void *TYPED_FUNC(
x = &all_x[chunk * X_N];

#else
#pragma omp target teams distribute parallel for \
#pragma omp teams distribute parallel for \
shared(X_N, X_M, X_index, X_nnz, X_value) \
shared(X2_N, X2_M, X2_index, X2_nnz, X2_value) \
shared(rowMin, rowMax) \
Expand Down Expand Up @@ -319,6 +320,10 @@ void *TYPED_FUNC(

#if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS
}
#pragma omp target exit data map(delete:all_ix[0:X_N*num_chunks],all_jx[0:X_N*num_chunks],all_x[0:X_N*num_chunks])
free(all_ix);
free(all_jx);
free(all_x);
#endif

#endif // endif cusparse or rocsparse
Expand Down Expand Up @@ -407,11 +412,12 @@ void TYPED_FUNC(
all_jx = calloc(C_N * num_chunks, sizeof(int));
all_x = calloc(C_N * num_chunks, sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks])
#pragma omp target enter data map(to:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks])

#endif

#if defined (USE_OMP_OFFLOAD)
#pragma omp target
#if BML_OFFLOAD_CHUNKS
#pragma omp teams distribute parallel for \
shared(A_N, A_M, A_nnz, A_index, A_value) \
Expand All @@ -428,7 +434,7 @@ void TYPED_FUNC(
x = &all_x[chunk * C_N];

#else
#pragma omp target teams distribute parallel for \
#pragma omp teams distribute parallel for \
shared(A_N, A_M, A_nnz, A_index, A_value) \
shared(A_localRowMin, A_localRowMax) \
shared(B_N, B_M, B_nnz, B_index, B_value) \
Expand Down Expand Up @@ -520,6 +526,10 @@ void TYPED_FUNC(
}
#if defined(USE_OMP_OFFLOAD) && BML_OFFLOAD_CHUNKS
}
#pragma omp target exit data map(delete:all_ix[0:C_N*num_chunks],all_jx[0:C_N*num_chunks],all_x[0:C_N*num_chunks])
free(all_ix);
free(all_jx);
free(all_x);
#endif

#endif // endif cusparse or rocsparse
Expand Down Expand Up @@ -1157,7 +1167,8 @@ void TYPED_FUNC(
rocsparse_spgemm_alg_default,
rocsparse_spgemm_stage_buffer_size,
&bufferSize1, NULL));
// hipDeviceSynchronize();
// hipDeviceSynchronize(); // Ensure that the previous call is finished

// Allocate the spgemm working buffer
dBuffer1 = (char *) malloc(sizeof(char) * bufferSize1);
// Allocate the same array on the device
Expand All @@ -1174,7 +1185,8 @@ void TYPED_FUNC(
rocsparse_spgemm_stage_nnz,
&bufferSize1, dBuffer1));
}
// hipDeviceSynchronize();
// hipDeviceSynchronize(); // Ensure that the previous call is finished

// Get nnz value returned by spgemm()
int64_t C_num_rows, C_num_cols, C_nnz_tmp;
BML_CHECK_ROCSPARSE(rocsparse_spmat_get_size
Expand Down Expand Up @@ -1217,6 +1229,9 @@ void TYPED_FUNC(
rocsparse_spgemm_stage_compute,
&bufferSize1, dBuffer1));
}
// hipDeviceSynchronize(); // Ensure that the previous call is finished

// Delete the temporary work array
#pragma omp target exit data map(delete:dBuffer1[:bufferSize1])
// Place the resulting matrix in C
#pragma omp target teams distribute parallel for
Expand All @@ -1237,7 +1252,7 @@ void TYPED_FUNC(
TYPED_FUNC(bml_prune_rocsparse_ellpack) (handle,C,threshold);

// Free the temporary arrays used on the device and host
#pragma omp target exit data map(delete:csrRowPtrC_tmp[:C_num_rows+1],csrColIndC_tmp[:C_nnz_tmp],csrValC_tmp[:C_nnz_tmp],dBuffer1[:bufferSize1])
#pragma omp target exit data map(delete:csrRowPtrC_tmp[:C_num_rows+1],csrColIndC_tmp[:C_nnz_tmp],csrValC_tmp[:C_nnz_tmp])

free(csrRowPtrC_tmp);
free(csrColIndC_tmp);
Expand Down
11 changes: 4 additions & 7 deletions src/C-interface/ellpack/bml_scale_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,11 @@ void TYPED_FUNC(
int *A_nnz = A->nnz;
int *A_index = A->index;
REAL_T scale = *scale_factor;
#pragma omp target teams distribute parallel for collapse(2)
for (int i = 0; i < N; i++)
size_t MbyN = N * M;
#pragma omp target teams distribute parallel for map(to:MbyN,scale)
for (size_t i = 0; i < MbyN; i++)
{
for (int j = 0; j < M; j++)
{
A_value[ROWMAJOR(i, j, M, N)] =
scale * A_value[ROWMAJOR(i, j, M, N)];
}
A_value[i] = scale * A_value[i];
}
#else // offload conditional

Expand Down

0 comments on commit 5aa44ec

Please sign in to comment.