Skip to content

Commit

Permalink
pass reference not pointer of UA, overlap shared
Browse files Browse the repository at this point in the history
Co-authored-by: Tobias Ribizel <ribizel@kit.edu>
  • Loading branch information
yhmtsai and upsj committed Aug 4, 2021
1 parent d7f61b4 commit e89a897
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 91 deletions.
7 changes: 4 additions & 3 deletions common/solver/idr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ __global__
const auto tidx = thread::get_thread_id_flat();

__shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
ValueType *__restrict__ reduction_helper = reduction_helper_array;

__shared__ remove_complex<ValueType> reduction_helper_real[block_size];
// they are not be used in the same time.
ValueType *reduction_helper = reduction_helper_array;
auto reduction_helper_real =
reinterpret_cast<remove_complex<ValueType> *>(reduction_helper);

for (size_type row = 0; row < num_rows; row++) {
for (size_type i = 0; i < row; i++) {
Expand Down
20 changes: 10 additions & 10 deletions dpcpp/components/prefix_sum.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,13 @@ template <std::uint32_t block_size, typename ValueType>
void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
ValueType *__restrict__ block_sum,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, block_size> *prefix_helper)
UninitializedArray<ValueType, block_size> &prefix_helper)
{
const auto tidx = thread::get_thread_id_flat(item_ct1);
const auto element_id = item_ct1.get_local_id(2);

// do not need to access the last element when exclusive prefix sum
(*prefix_helper)[element_id] =
prefix_helper[element_id] =
(tidx + 1 < num_elements) ? elements[tidx] : zero<ValueType>();
auto this_block = group::this_thread_block(item_ct1);
this_block.sync();
Expand All @@ -146,17 +146,17 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
const auto ai = i * (2 * element_id + 1) - 1;
const auto bi = i * (2 * element_id + 2) - 1;
if (bi < block_size) {
(*prefix_helper)[bi] += (*prefix_helper)[ai];
prefix_helper[bi] += prefix_helper[ai];
}
this_block.sync();
}

if (element_id == 0) {
// Store the total sum except the last block
if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) {
block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1];
block_sum[item_ct1.get_group(2)] = prefix_helper[block_size - 1];
}
(*prefix_helper)[block_size - 1] = zero<ValueType>();
prefix_helper[block_size - 1] = zero<ValueType>();
}

this_block.sync();
Expand All @@ -167,14 +167,14 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
const auto ai = i * (2 * element_id + 1) - 1;
const auto bi = i * (2 * element_id + 2) - 1;
if (bi < block_size) {
auto tmp = (*prefix_helper)[ai];
(*prefix_helper)[ai] = (*prefix_helper)[bi];
(*prefix_helper)[bi] += tmp;
auto tmp = prefix_helper[ai];
prefix_helper[ai] = prefix_helper[bi];
prefix_helper[bi] += tmp;
}
this_block.sync();
}
if (tidx < num_elements) {
elements[tidx] = (*prefix_helper)[element_id];
elements[tidx] = prefix_helper[element_id];
}
}

Expand All @@ -193,7 +193,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory,
[=](sycl::nd_item<3> item_ct1) {
start_prefix_sum<block_size>(
num_elements, elements, block_sum, item_ct1,
prefix_helper_acc_ct1.get_pointer().get());
*prefix_helper_acc_ct1.get_pointer());
});
});
}
Expand Down
8 changes: 4 additions & 4 deletions dpcpp/components/reduction.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,14 +205,14 @@ template <std::uint32_t cfg, typename ValueType>
void reduce_add_array(
size_type size, const ValueType *__restrict__ source,
ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *block_sum)
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &block_sum)
{
reduce_array<KCFG_1D::decode<1>(cfg)>(
size, source, static_cast<ValueType *>((*block_sum)), item_ct1,
size, source, static_cast<ValueType *>(block_sum), item_ct1,
[](const ValueType &x, const ValueType &y) { return x + y; });

if (item_ct1.get_local_id(2) == 0) {
result[item_ct1.get_group(2)] = (*block_sum)[0];
result[item_ct1.get_group(2)] = block_sum[0];
}
}

Expand All @@ -230,7 +230,7 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
reduce_add_array<cfg>(size, source, result, item_ct1,
block_sum_acc_ct1.get_pointer().get());
*block_sum_acc_ct1.get_pointer());
});
});
}
Expand Down
56 changes: 28 additions & 28 deletions dpcpp/matrix/dense_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ template <std::uint32_t cfg = KCFG_1D::encode(256, 16), typename OutType,
void compute_partial_reduce(
size_type num_rows, OutType *__restrict__ work, CallableGetValue get_value,
CallableReduce reduce_op, sycl::nd_item<3> item_ct1,
UninitializedArray<OutType, KCFG_1D::decode<0>(cfg)> *tmp_work)
UninitializedArray<OutType, KCFG_1D::decode<0>(cfg)> &tmp_work)
{
constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
Expand All @@ -101,7 +101,7 @@ void compute_partial_reduce(
const auto global_id =
thread::get_thread_id<sg_size, warps_per_block>(item_ct1);

OutType *tmp_work_array = *tmp_work;
OutType *tmp_work_array = tmp_work;
auto tmp = zero<OutType>();
for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) {
tmp = reduce_op(tmp, get_value(i));
Expand All @@ -124,7 +124,7 @@ void finalize_reduce_computation(
size_type size, const ValueType *work, ValueType *result,
CallableReduce reduce_op, CallableFinalize finalize_op,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
{
constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
Expand All @@ -135,7 +135,7 @@ void finalize_reduce_computation(
for (auto i = local_id; i < size; i += wg_size) {
tmp = reduce_op(tmp, work[i]);
}
ValueType *tmp_work_array = *tmp_work;
ValueType *tmp_work_array = tmp_work;
tmp_work_array[local_id] = tmp;

::gko::kernels::dpcpp::reduce<sg_size>(group::this_thread_block(item_ct1),
Expand All @@ -152,7 +152,7 @@ void compute_partial_dot(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
const ValueType *__restrict__ y, size_type stride_y,
ValueType *__restrict__ work, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
{
compute_partial_reduce<cfg>(
num_rows, work,
Expand Down Expand Up @@ -181,7 +181,7 @@ void compute_partial_dot(dim3 grid, dim3 block, size_type dynamic_shared_memory,
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
compute_partial_dot<cfg>(num_rows, x, stride_x, y, stride_y,
work, item_ct1,
tmp_work_acc_ct1.get_pointer().get());
*tmp_work_acc_ct1.get_pointer());
});
});
}
Expand All @@ -197,7 +197,7 @@ void compute_partial_conj_dot(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
const ValueType *__restrict__ y, size_type stride_y,
ValueType *__restrict__ work, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
{
compute_partial_reduce<cfg>(
num_rows, work,
Expand Down Expand Up @@ -225,9 +225,9 @@ void compute_partial_conj_dot(dim3 grid, dim3 block,

cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
compute_partial_conj_dot<cfg>(
num_rows, x, stride_x, y, stride_y, work, item_ct1,
tmp_work_acc_ct1.get_pointer().get());
compute_partial_conj_dot<cfg>(num_rows, x, stride_x, y,
stride_y, work, item_ct1,
*tmp_work_acc_ct1.get_pointer());
});
});
}
Expand All @@ -242,7 +242,7 @@ template <std::uint32_t cfg = KCFG_1D::encode(256, 16), typename ValueType>
void finalize_sum_reduce_computation(
size_type size, const ValueType *work, ValueType *result,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
{
finalize_reduce_computation<cfg>(
size, work, result,
Expand All @@ -267,7 +267,7 @@ void finalize_sum_reduce_computation(dim3 grid, dim3 block,
[=](sycl::nd_item<3> item_ct1) {
finalize_sum_reduce_computation<cfg>(
size, work, result, item_ct1,
tmp_work_acc_ct1.get_pointer().get());
*tmp_work_acc_ct1.get_pointer());
});
});
}
Expand All @@ -283,7 +283,7 @@ void compute_partial_norm2(
size_type num_rows, const ValueType *__restrict__ x, size_type stride_x,
remove_complex<ValueType> *__restrict__ work, sycl::nd_item<3> item_ct1,
UninitializedArray<remove_complex<ValueType>, KCFG_1D::decode<0>(cfg)>
*tmp_work)
&tmp_work)
{
using norm_type = remove_complex<ValueType>;
compute_partial_reduce<cfg>(
Expand All @@ -306,12 +306,12 @@ void compute_partial_norm2(dim3 grid, dim3 block,
sycl::access::target::local>
tmp_work_acc_ct1(cgh);

cgh.parallel_for(sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1) {
compute_partial_norm2<cfg>(
num_rows, x, stride_x, work, item_ct1,
tmp_work_acc_ct1.get_pointer().get());
});
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
compute_partial_norm2<cfg>(num_rows, x, stride_x, work,
item_ct1,
*tmp_work_acc_ct1.get_pointer());
});
});
}

Expand All @@ -325,7 +325,7 @@ template <std::uint32_t cfg = KCFG_1D::encode(256, 16), typename ValueType>
void finalize_sqrt_reduce_computation(
size_type size, const ValueType *work, ValueType *result,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *tmp_work)
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &tmp_work)
{
finalize_reduce_computation<cfg>(
size, work, result,
Expand All @@ -351,7 +351,7 @@ void finalize_sqrt_reduce_computation(dim3 grid, dim3 block,
[=](sycl::nd_item<3> item_ct1) {
finalize_sqrt_reduce_computation<cfg>(
size, work, result, item_ct1,
tmp_work_acc_ct1.get_pointer().get());
*tmp_work_acc_ct1.get_pointer());
});
});
}
Expand Down Expand Up @@ -677,21 +677,21 @@ void transpose(const size_type nrows, const size_type ncols,
const ValueType *__restrict__ in, const size_type in_stride,
ValueType *__restrict__ out, const size_type out_stride,
Closure op, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
UninitializedArray<ValueType, sg_size *(sg_size + 1)> &space)
{
auto local_x = item_ct1.get_local_id(2);
auto local_y = item_ct1.get_local_id(1);
auto x = item_ct1.get_group(2) * sg_size + local_x;
auto y = item_ct1.get_group(1) * sg_size + local_y;
if (y < nrows && x < ncols) {
(*space)[local_y * (sg_size + 1) + local_x] = op(in[y * in_stride + x]);
space[local_y * (sg_size + 1) + local_x] = op(in[y * in_stride + x]);
}

item_ct1.barrier(sycl::access::fence_space::local_space);
x = item_ct1.get_group(1) * sg_size + local_x;
y = item_ct1.get_group(2) * sg_size + local_y;
if (y < ncols && x < nrows) {
out[y * out_stride + x] = (*space)[local_x * (sg_size + 1) + local_y];
out[y * out_stride + x] = space[local_x * (sg_size + 1) + local_y];
}
}

Expand All @@ -701,7 +701,7 @@ void transpose(const size_type nrows, const size_type ncols,
const ValueType *__restrict__ in, const size_type in_stride,
ValueType *__restrict__ out, const size_type out_stride,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
UninitializedArray<ValueType, sg_size *(sg_size + 1)> &space)
{
transpose<sg_size>(
nrows, ncols, in, in_stride, out, out_stride,
Expand All @@ -723,7 +723,7 @@ void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
transpose<sg_size>(nrows, ncols, in, in_stride, out, out_stride,
item_ct1, space_acc_ct1.get_pointer().get());
item_ct1, *space_acc_ct1.get_pointer());
});
});
}
Expand All @@ -739,7 +739,7 @@ void conj_transpose(
const ValueType *__restrict__ in, const size_type in_stride,
ValueType *__restrict__ out, const size_type out_stride,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, sg_size *(sg_size + 1)> *space)
UninitializedArray<ValueType, sg_size *(sg_size + 1)> &space)
{
transpose<sg_size>(
nrows, ncols, in, in_stride, out, out_stride,
Expand All @@ -763,7 +763,7 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
conj_transpose<sg_size>(nrows, ncols, in, in_stride, out,
out_stride, item_ct1,
space_acc_ct1.get_pointer().get());
*space_acc_ct1.get_pointer());
});
});
}
Expand Down
15 changes: 6 additions & 9 deletions dpcpp/solver/cb_gmres_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ void multidot_kernel(
size_type stride_next_krylov, const Accessor3d krylov_bases,
ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, dot_dim * dot_dim> *reduction_helper_array)
UninitializedArray<ValueType, dot_dim * dot_dim> &reduction_helper_array)
{
/*
* In general in this kernel:
Expand All @@ -497,8 +497,7 @@ void multidot_kernel(
const size_type k = item_ct1.get_group(0);
// Used that way to get around dynamic initialization warning and
// template error when using `reduction_helper_array` directly in `reduce`

ValueType *__restrict__ reduction_helper = (*reduction_helper_array);
ValueType *__restrict__ reduction_helper = reduction_helper_array;

ValueType local_res = zero<ValueType>();
if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) {
Expand Down Expand Up @@ -549,8 +548,7 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory,
num_rows, num_cols, next_krylov_basis, stride_next_krylov,
krylov_bases, hessenberg_iter, stride_hessenberg,
stop_status, item_ct1,
(UninitializedArray<ValueType, dot_dim * dot_dim> *)
reduction_helper_array_acc_ct1.get_pointer());
*reduction_helper_array_acc_ct1.get_pointer());
});
});
}
Expand All @@ -562,7 +560,7 @@ void singledot_kernel(
size_type stride_next_krylov, const Accessor3d krylov_bases,
ValueType *__restrict__ hessenberg_iter, size_type stride_hessenberg,
const stopping_status *__restrict__ stop_status, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, block_size> *reduction_helper_array)
UninitializedArray<ValueType, block_size> &reduction_helper_array)
{
/*
* In general in this kernel:
Expand All @@ -585,7 +583,7 @@ void singledot_kernel(
// Used that way to get around dynamic initialization warning and
// template error when using `reduction_helper_array` directly in `reduce`

ValueType *__restrict__ reduction_helper = (*reduction_helper_array);
ValueType *__restrict__ reduction_helper = reduction_helper_array;

ValueType local_res = zero<ValueType>();
if (!stop_status[col_idx].has_stopped()) {
Expand Down Expand Up @@ -630,8 +628,7 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory,
num_rows, next_krylov_basis, stride_next_krylov,
krylov_bases, hessenberg_iter, stride_hessenberg,
stop_status, item_ct1,
(UninitializedArray<ValueType, block_size> *)
reduction_helper_array_acc_ct1.get_pointer());
*reduction_helper_array_acc_ct1.get_pointer());
});
});
}
Expand Down
Loading

0 comments on commit e89a897

Please sign in to comment.