Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dpcpp ports IDR #849

Merged
merged 8 commits into from
Aug 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions common/solver/idr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ __global__
const auto tidx = thread::get_thread_id_flat();

__shared__ UninitializedArray<ValueType, block_size> reduction_helper_array;
ValueType *__restrict__ reduction_helper = reduction_helper_array;

__shared__ remove_complex<ValueType> reduction_helper_real[block_size];
// they are not be used in the same time.
ValueType *reduction_helper = reduction_helper_array;
auto reduction_helper_real =
reinterpret_cast<remove_complex<ValueType> *>(reduction_helper);

for (size_type row = 0; row < num_rows; row++) {
for (size_type i = 0; i < row; i++) {
Expand All @@ -70,9 +71,9 @@ __global__
dot += values[row * stride + j] * conj(values[i * stride + j]);
}

reduction_helper[tidx] = dot;

// Ensure already finish reading this shared memory
__syncthreads();
reduction_helper[tidx] = dot;
reduce(
group::this_thread_block(), reduction_helper,
[](const ValueType &a, const ValueType &b) { return a + b; });
Expand All @@ -88,10 +89,9 @@ __global__
for (size_type j = tidx; j < num_cols; j += block_size) {
norm += squared_norm(values[row * stride + j]);
}

reduction_helper_real[tidx] = norm;

// Ensure already finish reading this shared memory
__syncthreads();
reduction_helper_real[tidx] = norm;
reduce(group::this_thread_block(), reduction_helper_real,
[](const remove_complex<ValueType> &a,
const remove_complex<ValueType> &b) { return a + b; });
Expand Down
47 changes: 9 additions & 38 deletions cuda/test/solver/idr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,6 @@ class Idr : public ::testing::Test {
ref = gko::ReferenceExecutor::create();
cuda = gko::CudaExecutor::create(0, ref);

mtx = gen_mtx(123, 123);
d_mtx = Mtx::create(cuda);
d_mtx->copy_from(mtx.get());
cuda_idr_factory =
Solver::build()
.with_deterministic(true)
Expand Down Expand Up @@ -100,11 +97,11 @@ class Idr : public ::testing::Test {
std::normal_distribution<>(0.0, 1.0), rand_engine, ref);
}

void initialize_data()
void initialize_data(int size = 597, int input_nrhs = 17)
{
int size = 597;
nrhs = 17;
nrhs = input_nrhs;
int s = 4;
mtx = gen_mtx(size, size);
x = gen_mtx(size, nrhs);
b = gen_mtx(size, nrhs);
r = gen_mtx(size, nrhs);
Expand All @@ -125,6 +122,7 @@ class Idr : public ::testing::Test {
stop_status->get_data()[i].reset();
}

d_mtx = Mtx::create(cuda);
d_x = Mtx::create(cuda);
d_b = Mtx::create(cuda);
d_r = Mtx::create(cuda);
Expand All @@ -142,6 +140,7 @@ class Idr : public ::testing::Test {
d_stop_status = std::unique_ptr<gko::Array<gko::stopping_status>>(
new gko::Array<gko::stopping_status>(cuda));

d_mtx->copy_from(mtx.get());
d_x->copy_from(x.get());
d_b->copy_from(b.get());
d_r->copy_from(r.get());
Expand Down Expand Up @@ -291,16 +290,9 @@ TEST_F(Idr, IdrComputeOmegaIsEquivalentToRef)

TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef)
{
int m = 123;
int n = 1;
initialize_data(123, 1);
auto ref_solver = ref_idr_factory->generate(mtx);
auto cuda_solver = cuda_idr_factory->generate(d_mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(cuda);
auto d_x = Mtx::create(cuda);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
cuda_solver->apply(d_b.get(), d_x.get());
Expand All @@ -312,8 +304,7 @@ TEST_F(Idr, IdrIterationOneRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
{
int m = 123;
int n = 1;
initialize_data(123, 1);
cuda_idr_factory =
Solver::build()
.with_deterministic(true)
Expand All @@ -330,12 +321,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)
.on(ref);
auto ref_solver = ref_idr_factory->generate(mtx);
auto cuda_solver = cuda_idr_factory->generate(d_mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(cuda);
auto d_x = Mtx::create(cuda);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
cuda_solver->apply(d_b.get(), d_x.get());
Expand All @@ -347,16 +332,9 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceOneRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef)
{
int m = 123;
int n = 16;
initialize_data(123, 16);
auto cuda_solver = cuda_idr_factory->generate(d_mtx);
auto ref_solver = ref_idr_factory->generate(mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(cuda);
auto d_x = Mtx::create(cuda);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
cuda_solver->apply(d_b.get(), d_x.get());
Expand All @@ -368,8 +346,7 @@ TEST_F(Idr, IdrIterationMultipleRHSIsEquivalentToRef)

TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
{
int m = 123;
int n = 16;
initialize_data(123, 16);
cuda_idr_factory =
Solver::build()
.with_deterministic(true)
Expand All @@ -386,12 +363,6 @@ TEST_F(Idr, IdrIterationWithComplexSubspaceMultipleRHSIsEquivalentToRef)
.on(ref);
auto cuda_solver = cuda_idr_factory->generate(d_mtx);
auto ref_solver = ref_idr_factory->generate(mtx);
auto b = gen_mtx(m, n);
auto x = gen_mtx(m, n);
auto d_b = Mtx::create(cuda);
auto d_x = Mtx::create(cuda);
d_b->copy_from(b.get());
d_x->copy_from(x.get());

ref_solver->apply(b.get(), x.get());
cuda_solver->apply(d_b.get(), d_x.get());
Expand Down
4 changes: 3 additions & 1 deletion dpcpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ set(GINKGO_DPCPP_VERSION ${GINKGO_DPCPP_VERSION} PARENT_SCOPE)

find_package(MKL CONFIG REQUIRED HINTS "$ENV{MKLROOT}")
set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE)
find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should it be like MKL?

Suggested change
find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}")
find_package(oneDPL CONFIG REQUIRED HINTS "$ENV{DPL_ROOT}")

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are not necessarily consistent inside Intel's documentation, and since there is no FindoneDPL.cmake module inside CMake (and I don't think there will be in the forseeable future), it doesn't really matter, except for slightly changing the error message.

set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE)

add_library(ginkgo_dpcpp $<TARGET_OBJECTS:ginkgo_dpcpp_device> "")
target_sources(ginkgo_dpcpp
Expand Down Expand Up @@ -75,7 +77,7 @@ else ()
target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel)
endif()
target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device)
target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP)
target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL)
if (GINKGO_DPCPP_SINGLE_MODE)
target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1)
endif()
Expand Down
20 changes: 10 additions & 10 deletions dpcpp/components/prefix_sum.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,13 @@ template <std::uint32_t block_size, typename ValueType>
void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
ValueType *__restrict__ block_sum,
sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, block_size> *prefix_helper)
UninitializedArray<ValueType, block_size> &prefix_helper)
{
const auto tidx = thread::get_thread_id_flat(item_ct1);
const auto element_id = item_ct1.get_local_id(2);

// do not need to access the last element when exclusive prefix sum
(*prefix_helper)[element_id] =
prefix_helper[element_id] =
(tidx + 1 < num_elements) ? elements[tidx] : zero<ValueType>();
auto this_block = group::this_thread_block(item_ct1);
this_block.sync();
Expand All @@ -146,17 +146,17 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
const auto ai = i * (2 * element_id + 1) - 1;
const auto bi = i * (2 * element_id + 2) - 1;
if (bi < block_size) {
(*prefix_helper)[bi] += (*prefix_helper)[ai];
prefix_helper[bi] += prefix_helper[ai];
}
this_block.sync();
}

if (element_id == 0) {
// Store the total sum except the last block
if (item_ct1.get_group(2) + 1 < item_ct1.get_group_range(2)) {
block_sum[item_ct1.get_group(2)] = (*prefix_helper)[block_size - 1];
block_sum[item_ct1.get_group(2)] = prefix_helper[block_size - 1];
}
(*prefix_helper)[block_size - 1] = zero<ValueType>();
prefix_helper[block_size - 1] = zero<ValueType>();
}

this_block.sync();
Expand All @@ -167,14 +167,14 @@ void start_prefix_sum(size_type num_elements, ValueType *__restrict__ elements,
const auto ai = i * (2 * element_id + 1) - 1;
const auto bi = i * (2 * element_id + 2) - 1;
if (bi < block_size) {
auto tmp = (*prefix_helper)[ai];
(*prefix_helper)[ai] = (*prefix_helper)[bi];
(*prefix_helper)[bi] += tmp;
auto tmp = prefix_helper[ai];
prefix_helper[ai] = prefix_helper[bi];
prefix_helper[bi] += tmp;
}
this_block.sync();
}
if (tidx < num_elements) {
elements[tidx] = (*prefix_helper)[element_id];
elements[tidx] = prefix_helper[element_id];
}
}

Expand All @@ -193,7 +193,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory,
[=](sycl::nd_item<3> item_ct1) {
start_prefix_sum<block_size>(
num_elements, elements, block_sum, item_ct1,
prefix_helper_acc_ct1.get_pointer().get());
*prefix_helper_acc_ct1.get_pointer());
});
});
}
Expand Down
8 changes: 4 additions & 4 deletions dpcpp/components/reduction.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,14 +205,14 @@ template <std::uint32_t cfg, typename ValueType>
void reduce_add_array(
size_type size, const ValueType *__restrict__ source,
ValueType *__restrict__ result, sycl::nd_item<3> item_ct1,
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> *block_sum)
UninitializedArray<ValueType, KCFG_1D::decode<0>(cfg)> &block_sum)
{
reduce_array<KCFG_1D::decode<1>(cfg)>(
size, source, static_cast<ValueType *>((*block_sum)), item_ct1,
size, source, static_cast<ValueType *>(block_sum), item_ct1,
[](const ValueType &x, const ValueType &y) { return x + y; });

if (item_ct1.get_local_id(2) == 0) {
result[item_ct1.get_group(2)] = (*block_sum)[0];
result[item_ct1.get_group(2)] = block_sum[0];
}
}

Expand All @@ -230,7 +230,7 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
reduce_add_array<cfg>(size, source, result, item_ct1,
block_sum_acc_ct1.get_pointer().get());
*block_sum_acc_ct1.get_pointer());
});
});
}
Expand Down
Loading