Skip to content

Commit

Permalink
add read_distributed device kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcelKoch committed Mar 9, 2022
1 parent 72944ef commit 03411ff
Show file tree
Hide file tree
Showing 10 changed files with 214 additions and 52 deletions.
94 changes: 94 additions & 0 deletions common/cuda_hip/distributed/vector_kernels.hpp.inc
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/


template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void build_local(
std::shared_ptr<const DefaultExecutor> exec,
const device_matrix_data<ValueType, GlobalIndexType>& input,
const distributed::Partition<LocalIndexType, GlobalIndexType>* partition,
comm_index_type local_part, matrix::Dense<ValueType>* local_mtx)
{
const auto* range_bounds = partition->get_range_bounds();
const auto* range_starting_indices =
partition->get_range_starting_indices();
const auto* part_ids = partition->get_part_ids();
const auto num_ranges = partition->get_num_ranges();

Array<size_type> range_id{exec, input.get_num_elems()};
thrust::upper_bound(thrust::device, range_bounds + 1,
range_bounds + num_ranges + 1,
input.get_const_row_idxs(),
input.get_const_row_idxs() + input.get_num_elems(),
range_id.get_data(), thrust::less<GlobalIndexType>());

// write values with local rows into the local matrix at the correct index
// this needs the following iterators:
// - local_row_it: (global_row, range_id) -> local row index
// - flat_idx_it: (local_row, col) -> flat index in local matrix values
// array
// the flat_idx_it is used by the scatter_if as an index map for the values
auto map_to_local_row =
[range_bounds, range_starting_indices] GKO_THRUST_LAMBDA(
const auto& idx_range_id) -> LocalIndexType {
const auto idx = thrust::get<0>(idx_range_id);
const auto rid = thrust::get<1>(idx_range_id);
return static_cast<LocalIndexType>(idx - range_bounds[rid]) +
range_starting_indices[rid];
};
auto local_row_it = thrust::make_transform_iterator(
thrust::make_zip_iterator(thrust::make_tuple(input.get_const_row_idxs(),
range_id.get_data())),
map_to_local_row);

auto stride = local_mtx->get_stride();
auto map_to_flat_idx =
[stride] GKO_THRUST_LAMBDA(const auto& row_col) -> size_type {
return thrust::get<0>(row_col) * stride + thrust::get<1>(row_col);
};
auto flat_idx_it = thrust::make_transform_iterator(
thrust::make_zip_iterator(
thrust::make_tuple(local_row_it, input.get_const_col_idxs())),
map_to_flat_idx);

auto is_local_row = [part_ids,
local_part] GKO_THRUST_LAMBDA(const auto rid) {
return part_ids[rid] == local_part;
};
thrust::scatter_if(thrust::device, input.get_const_values(),
input.get_const_values() + input.get_num_elems(),
flat_idx_it, range_id.get_data(),
local_mtx->get_values(), is_local_row);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
23 changes: 14 additions & 9 deletions cuda/distributed/vector_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/base/exception_helpers.hpp>


#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/scatter.h>
#include <thrust/tuple.h>


namespace gko {
namespace kernels {
namespace cuda {
namespace distributed_vector {


template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void build_local(
std::shared_ptr<const DefaultExecutor> exec,
const device_matrix_data<ValueType, GlobalIndexType>& input,
const distributed::Partition<LocalIndexType, GlobalIndexType>* partition,
comm_index_type local_part,
matrix::Dense<ValueType>* local_mtx) GKO_NOT_IMPLEMENTED;
#define GKO_THRUST_LAMBDA __device__


#include "common/cuda_hip/distributed/vector_kernels.hpp.inc"


GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
#undef GKO_THRUST_LAMBDA


} // namespace distributed_vector
Expand Down
55 changes: 53 additions & 2 deletions dpcpp/distributed/vector_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <ginkgo/core/base/exception_helpers.hpp>

#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/iterator>


namespace gko {
namespace kernels {
Expand All @@ -47,8 +50,56 @@ void build_local(
std::shared_ptr<const DefaultExecutor> exec,
const device_matrix_data<ValueType, GlobalIndexType>& input,
const distributed::Partition<LocalIndexType, GlobalIndexType>* partition,
comm_index_type local_part,
matrix::Dense<ValueType>* local_mtx) GKO_NOT_IMPLEMENTED;
comm_index_type local_part, matrix::Dense<ValueType>* local_mtx)
{
const auto* range_bounds = partition->get_range_bounds();
const auto* range_starting_indices =
partition->get_range_starting_indices();
const auto* part_ids = partition->get_part_ids();
const auto num_ranges = partition->get_num_ranges();
auto policy =
oneapi::dpl::execution::make_device_policy(*exec->get_queue());

Array<size_type> range_id{exec, input.get_num_elems()};
oneapi::dpl::upper_bound(policy, range_bounds + 1,
range_bounds + num_ranges + 1,
input.get_const_row_idxs(),
input.get_const_row_idxs() + input.get_num_elems(),
range_id.get_data());

// write values with local rows into the local matrix at the correct index
// this needs the following iterators:
// - local_row_it: (global_row, range_id) -> local row index
// - flat_idx_it: (index) -> flat index (row[index] * stride + col[index])
// in local matrix values array
// the flat_idx_it is used by the copy_if as a permutation index for the
// values
auto map_to_local_row = [range_bounds, range_starting_indices](
const auto& idx_range_id) -> LocalIndexType {
const auto [idx, rid] = idx_range_id;
return static_cast<LocalIndexType>(idx - range_bounds[rid]) +
range_starting_indices[rid];
};
auto local_row_it = oneapi::dpl::make_transform_iterator(
oneapi::dpl::make_zip_iterator(input.get_const_row_idxs(),
range_id.get_data()),
map_to_local_row);

auto flat_idx_it = oneapi::dpl::make_permutation_iterator(
local_mtx->get_values(),
[local_row_it, cols = input.get_const_col_idxs(),
stride = local_mtx->get_stride()](const auto i) {
return local_row_it[i] * stride + cols[i];
});

auto is_local_row = [range_id = range_id.get_data(), part_ids,
local_part](const auto i) {
return part_ids[range_id[i]] == local_part;
};
oneapi::dpl::copy_if(policy, input.get_const_values(),
input.get_const_values() + input.get_num_elems(),
flat_idx_it, is_local_row);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
Expand Down
26 changes: 17 additions & 9 deletions hip/distributed/vector_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <ginkgo/core/base/exception_helpers.hpp>


#include <functional>


#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/scatter.h>
#include <thrust/tuple.h>


namespace gko {
namespace kernels {
namespace hip {
namespace distributed_vector {


template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void build_local(
std::shared_ptr<const DefaultExecutor> exec,
const device_matrix_data<ValueType, GlobalIndexType>& input,
const distributed::Partition<LocalIndexType, GlobalIndexType>* partition,
comm_index_type local_part,
matrix::Dense<ValueType>* local_mtx) GKO_NOT_IMPLEMENTED;
#define GKO_THRUST_LAMBDA __device__ __host__


#include "common/cuda_hip/distributed/vector_kernels.hpp.inc"


GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL);
#undef GKO_THRUST_LAMBDA


} // namespace distributed_vector
Expand Down
1 change: 0 additions & 1 deletion omp/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ include(${PROJECT_SOURCE_DIR}/cmake/create_test.cmake)

add_subdirectory(base)
add_subdirectory(components)
add_subdirectory(distributed)
add_subdirectory(factorization)
add_subdirectory(matrix)
add_subdirectory(preconditioner)
Expand Down
1 change: 0 additions & 1 deletion omp/test/distributed/CMakeLists.txt

This file was deleted.

1 change: 1 addition & 0 deletions test/distributed/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
ginkgo_create_common_test(partition_kernels DISABLE_EXECUTORS dpcpp)
ginkgo_create_common_test(vector_kernels)
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/test/utils.hpp"


#include "test/utils/executor.hpp"


namespace {


Expand All @@ -56,23 +59,26 @@ using comm_index_type = gko::distributed::comm_index_type;
template <typename ValueLocalGlobalIndexType>
class Vector : public ::testing::Test {
protected:
using value_type =
typename std::tuple_element<0, decltype(
ValueLocalGlobalIndexType())>::type;
using local_index_type =
typename std::tuple_element<1, decltype(
ValueLocalGlobalIndexType())>::type;
using global_index_type =
typename std::tuple_element<2, decltype(
ValueLocalGlobalIndexType())>::type;
using value_type = typename std::tuple_element<
0, decltype(ValueLocalGlobalIndexType())>::type;
using local_index_type = typename std::tuple_element<
1, decltype(ValueLocalGlobalIndexType())>::type;
using global_index_type = typename std::tuple_element<
2, decltype(ValueLocalGlobalIndexType())>::type;
using global_entry = gko::matrix_data_entry<value_type, global_index_type>;
using mtx = gko::matrix::Dense<value_type>;

Vector()
: ref(gko::ReferenceExecutor::create()),
exec(gko::OmpExecutor::create()),
engine(42)
{}
Vector() : ref(gko::ReferenceExecutor::create()), engine(42) {}


void SetUp() { init_executor(ref, exec); }

void TearDown()
{
if (exec != nullptr) {
ASSERT_NO_THROW(exec->synchronize());
}
}

void validate(
const gko::distributed::Partition<local_index_type, global_index_type>*
Expand All @@ -94,20 +100,23 @@ class Vector : public ::testing::Test {

gko::kernels::reference::distributed_vector::build_local(
ref, input, partition, part, output.get());
gko::kernels::omp::distributed_vector::build_local(
gko::kernels::EXEC_NAMESPACE::distributed_vector::build_local(
exec, d_input, d_partition, part, d_output.get());

GKO_ASSERT_MTX_NEAR(output, d_output, 0);
}
}

std::shared_ptr<const gko::ReferenceExecutor> ref;
std::shared_ptr<const gko::OmpExecutor> exec;
std::shared_ptr<gko::ReferenceExecutor> ref;
std::shared_ptr<gko::EXEC_TYPE> exec;
std::default_random_engine engine;
};

TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypes);


template <typename ValueType, typename IndexType, typename NonzeroDistribution,
typename ValueDistribution, typename Engine>

gko::device_matrix_data<ValueType, IndexType> generate_random_matrix_data_array(
gko::size_type num_rows, gko::size_type num_cols,
NonzeroDistribution&& nonzero_dist, ValueDistribution&& value_dist,
Expand All @@ -122,8 +131,6 @@ gko::device_matrix_data<ValueType, IndexType> generate_random_matrix_data_array(
md);
}

TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypes);


TYPED_TEST(Vector, BuildsLocalEmptyIsEquivalentToRef)
{
Expand Down
16 changes: 7 additions & 9 deletions test/mpi/distributed/vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class VectorCreation : public ::testing::Test {
void SetUp() override
{
ASSERT_EQ(this->comm.size(), 3);
init_executor(gko::ReferenceExecutor::create(), exec);
init_executor(gko::ReferenceExecutor::create(), exec, comm);
}

void TearDown() override
Expand All @@ -143,7 +143,7 @@ class VectorCreation : public ::testing::Test {
TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypes);


#ifdef GKO_COMPILING_REFERENCE
//#ifdef GKO_COMPILING_REFERENCE


TYPED_TEST(VectorCreation, CanReadGlobalMatrixData)
Expand Down Expand Up @@ -317,7 +317,7 @@ TYPED_TEST(VectorCreation, CanReadLocalMatrixDataSomeEmpty)
}


#endif
//#endif


TYPED_TEST(VectorCreation, CanCreateFromLocalVectorAndSize)
Expand Down Expand Up @@ -370,10 +370,7 @@ class VectorReductions : public ::testing::Test {
size{53, 11},
engine(42)
{
init_executor(ref, exec, comm);

logger = gko::share(HostToDeviceLogger::create(exec));
exec->add_logger(logger);
logger = gko::share(HostToDeviceLogger::create(ref));

dense_x = dense_type::create(exec);
dense_y = dense_type::create(exec);
Expand Down Expand Up @@ -420,7 +417,8 @@ class VectorReductions : public ::testing::Test {
void SetUp() override
{
ASSERT_GT(comm.size(), 0);
init_executor(gko::ReferenceExecutor::create(), exec);
init_executor(gko::ReferenceExecutor::create(), exec, comm);
exec->add_logger(logger);
}

void TearDown() override
Expand Down Expand Up @@ -597,7 +595,7 @@ class VectorLocalOps : public ::testing::Test {
void SetUp() override
{
ASSERT_GT(comm.size(), 0);
init_executor(gko::ReferenceExecutor::create(), exec);
init_executor(gko::ReferenceExecutor::create(), exec, comm);
}

void TearDown() override
Expand Down
Loading

0 comments on commit 03411ff

Please sign in to comment.