From fc90d8c98793cc3deceec6e9aa5b244b0adbd118 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 9 Mar 2022 14:26:48 +0100 Subject: [PATCH 1/8] add read_distributed device kernels --- .../distributed/vector_kernels.hpp.inc | 94 +++++++++++++++++++ cuda/distributed/vector_kernels.cu | 23 +++-- dpcpp/distributed/vector_kernels.dp.cpp | 55 ++++++++++- hip/distributed/vector_kernels.hip.cpp | 26 +++-- omp/test/distributed/CMakeLists.txt | 1 - test/distributed/CMakeLists.txt | 1 + .../distributed/vector_kernels.cpp | 32 ++++--- test/mpi/distributed/vector.cpp | 16 ++-- third_party/gtest/CMakeLists.txt | 2 +- 9 files changed, 208 insertions(+), 42 deletions(-) create mode 100644 common/cuda_hip/distributed/vector_kernels.hpp.inc delete mode 100644 omp/test/distributed/CMakeLists.txt rename {omp/test => test}/distributed/vector_kernels.cpp (95%) diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.hpp.inc new file mode 100644 index 00000000000..28be6a512a7 --- /dev/null +++ b/common/cuda_hip/distributed/vector_kernels.hpp.inc @@ -0,0 +1,94 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +template +void build_local( + std::shared_ptr exec, + const device_matrix_data& input, + const distributed::Partition* partition, + comm_index_type local_part, matrix::Dense* local_mtx) +{ + const auto* range_bounds = partition->get_range_bounds(); + const auto* range_starting_indices = + partition->get_range_starting_indices(); + const auto* part_ids = partition->get_part_ids(); + const auto num_ranges = partition->get_num_ranges(); + + Array range_id{exec, input.get_num_elems()}; + thrust::upper_bound(thrust::device, range_bounds + 1, + range_bounds + num_ranges + 1, + input.get_const_row_idxs(), + input.get_const_row_idxs() + input.get_num_elems(), + range_id.get_data(), thrust::less()); + + // write values with local rows into the local matrix at the correct index + // this needs the following iterators: + // - local_row_it: (global_row, range_id) -> local row index + // - flat_idx_it: (local_row, col) -> flat index in local matrix values + // array + // the flat_idx_it is used by the scatter_if as an index map for the values + auto map_to_local_row = + [range_bounds, range_starting_indices] GKO_THRUST_LAMBDA( + const auto& idx_range_id) -> LocalIndexType { + const auto idx = thrust::get<0>(idx_range_id); + const auto rid = thrust::get<1>(idx_range_id); + return static_cast(idx - range_bounds[rid]) + + range_starting_indices[rid]; + }; + auto local_row_it = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(input.get_const_row_idxs(), + range_id.get_data())), + map_to_local_row); + + auto stride = local_mtx->get_stride(); + auto map_to_flat_idx = + [stride] GKO_THRUST_LAMBDA(const auto& row_col) -> size_type { + return thrust::get<0>(row_col) * stride + thrust::get<1>(row_col); + }; + auto flat_idx_it = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(local_row_it, input.get_const_col_idxs())), + map_to_flat_idx); + + auto is_local_row = [part_ids, + local_part] GKO_THRUST_LAMBDA(const auto rid) { + return part_ids[rid] == local_part; + }; + thrust::scatter_if(thrust::device, input.get_const_values(), + input.get_const_values() + input.get_num_elems(), + flat_idx_it, range_id.get_data(), + local_mtx->get_values(), is_local_row); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu index 46d834ee0ca..127bd2b06c3 100644 --- a/cuda/distributed/vector_kernels.cu +++ b/cuda/distributed/vector_kernels.cu @@ -36,22 +36,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include +#include +#include +#include +#include + + namespace gko { namespace kernels { namespace cuda { namespace distributed_vector { -template -void build_local( - std::shared_ptr exec, - const device_matrix_data& input, - const distributed::Partition* partition, - comm_index_type local_part, - matrix::Dense* local_mtx) GKO_NOT_IMPLEMENTED; +#define GKO_THRUST_LAMBDA __device__ + + +#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" + -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); +#undef GKO_THRUST_LAMBDA } // namespace distributed_vector diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp index a51c9e22669..6b4a1a079ea 100644 --- a/dpcpp/distributed/vector_kernels.dp.cpp +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -35,6 +35,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include +#include + namespace gko { namespace kernels { @@ -47,8 +50,56 @@ void build_local( std::shared_ptr exec, const device_matrix_data& input, const distributed::Partition* partition, - comm_index_type local_part, - matrix::Dense* local_mtx) GKO_NOT_IMPLEMENTED; + comm_index_type local_part, matrix::Dense* local_mtx) +{ + const auto* range_bounds = partition->get_range_bounds(); + const auto* range_starting_indices = + partition->get_range_starting_indices(); + const auto* part_ids = partition->get_part_ids(); + const auto num_ranges = partition->get_num_ranges(); + auto policy = + oneapi::dpl::execution::make_device_policy(*exec->get_queue()); + + Array range_id{exec, input.get_num_elems()}; + oneapi::dpl::upper_bound(policy, range_bounds + 1, + range_bounds + num_ranges + 1, + input.get_const_row_idxs(), + input.get_const_row_idxs() + input.get_num_elems(), + range_id.get_data()); + + // write values with local rows into the local matrix at the correct index + // this needs the following iterators: + // - local_row_it: (global_row, range_id) -> local row index + // - flat_idx_it: (index) -> flat index (row[index] * stride + col[index]) + // in local matrix values array + // the flat_idx_it is used by the copy_if as a permutation index for the + // values + auto map_to_local_row = [range_bounds, range_starting_indices]( + const auto& idx_range_id) -> LocalIndexType { + const auto [idx, rid] = idx_range_id; + return static_cast(idx - range_bounds[rid]) + + range_starting_indices[rid]; + }; + auto local_row_it = oneapi::dpl::make_transform_iterator( + oneapi::dpl::make_zip_iterator(input.get_const_row_idxs(), + range_id.get_data()), + map_to_local_row); + + auto flat_idx_it = oneapi::dpl::make_permutation_iterator( + local_mtx->get_values(), + [local_row_it, cols = input.get_const_col_idxs(), + stride = local_mtx->get_stride()](const auto i) { + return local_row_it[i] * stride + cols[i]; + }); + + auto is_local_row = [range_id = range_id.get_data(), part_ids, + local_part](const auto i) { + return part_ids[range_id[i]] == local_part; + }; + oneapi::dpl::copy_if(policy, input.get_const_values(), + input.get_const_values() + input.get_num_elems(), + flat_idx_it, is_local_row); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp index 1133317e4e4..58bac94d28f 100644 --- a/hip/distributed/vector_kernels.hip.cpp +++ b/hip/distributed/vector_kernels.hip.cpp @@ -36,22 +36,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + +#include +#include +#include +#include +#include +#include + + namespace gko { namespace kernels { namespace hip { namespace distributed_vector { -template -void build_local( - std::shared_ptr exec, - const device_matrix_data& input, - const distributed::Partition* partition, - comm_index_type local_part, - matrix::Dense* local_mtx) GKO_NOT_IMPLEMENTED; +#define GKO_THRUST_LAMBDA __device__ __host__ + + +#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" + -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); +#undef GKO_THRUST_LAMBDA } // namespace distributed_vector diff --git a/omp/test/distributed/CMakeLists.txt b/omp/test/distributed/CMakeLists.txt deleted file mode 100644 index 61e5d60cb39..00000000000 --- a/omp/test/distributed/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -ginkgo_create_test(vector_kernels) diff --git a/test/distributed/CMakeLists.txt b/test/distributed/CMakeLists.txt index b4e2fbff054..4efad911912 100644 --- a/test/distributed/CMakeLists.txt +++ b/test/distributed/CMakeLists.txt @@ -1 +1,2 @@ ginkgo_create_common_test(partition_kernels DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_test(vector_kernels) diff --git a/omp/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp similarity index 95% rename from omp/test/distributed/vector_kernels.cpp rename to test/distributed/vector_kernels.cpp index 5ee65cfb24c..a5bd8a4dfaa 100644 --- a/omp/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -47,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + + namespace { @@ -68,11 +71,17 @@ class Vector : public ::testing::Test { using global_entry = gko::matrix_data_entry; using mtx = gko::matrix::Dense; - Vector() - : ref(gko::ReferenceExecutor::create()), - exec(gko::OmpExecutor::create()), - engine(42) - {} + Vector() : ref(gko::ReferenceExecutor::create()), engine(42) {} + + + void SetUp() { init_executor(ref, exec); } + + void TearDown() + { + if (exec != nullptr) { + ASSERT_NO_THROW(exec->synchronize()); + } + } void validate( const gko::distributed::Partition* @@ -94,20 +103,23 @@ class Vector : public ::testing::Test { gko::kernels::reference::distributed_vector::build_local( ref, input, partition, part, output.get()); - gko::kernels::omp::distributed_vector::build_local( + gko::kernels::EXEC_NAMESPACE::distributed_vector::build_local( exec, d_input, d_partition, part, d_output.get()); GKO_ASSERT_MTX_NEAR(output, d_output, 0); } } - std::shared_ptr ref; - std::shared_ptr exec; + std::shared_ptr ref; + std::shared_ptr exec; std::default_random_engine engine; }; + +TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypes); + + template - gko::device_matrix_data generate_random_matrix_data_array( gko::size_type num_rows, gko::size_type num_cols, NonzeroDistribution&& nonzero_dist, ValueDistribution&& value_dist, @@ -122,8 +134,6 @@ gko::device_matrix_data generate_random_matrix_data_array( md); } -TYPED_TEST_SUITE(Vector, gko::test::ValueLocalGlobalIndexTypes); - TYPED_TEST(Vector, BuildsLocalEmptyIsEquivalentToRef) { diff --git a/test/mpi/distributed/vector.cpp b/test/mpi/distributed/vector.cpp index 3f8abcc8422..3aeaa62985c 100644 --- a/test/mpi/distributed/vector.cpp +++ b/test/mpi/distributed/vector.cpp @@ -119,7 +119,7 @@ class VectorCreation : public ::testing::Test { void SetUp() override { ASSERT_EQ(this->comm.size(), 3); - init_executor(gko::ReferenceExecutor::create(), exec); + init_executor(gko::ReferenceExecutor::create(), exec, comm); } void TearDown() override @@ -146,7 +146,7 @@ class VectorCreation : public ::testing::Test { TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypes); -#ifdef GKO_COMPILING_REFERENCE +//#ifdef GKO_COMPILING_REFERENCE TYPED_TEST(VectorCreation, CanReadGlobalMatrixData) @@ -320,7 +320,7 @@ TYPED_TEST(VectorCreation, CanReadLocalMatrixDataSomeEmpty) } -#endif +//#endif TYPED_TEST(VectorCreation, CanCreateFromLocalVectorAndSize) @@ -373,10 +373,7 @@ class VectorReductions : public ::testing::Test { size{53, 11}, engine(42) { - init_executor(ref, exec, comm); - - logger = gko::share(HostToDeviceLogger::create(exec)); - exec->add_logger(logger); + logger = gko::share(HostToDeviceLogger::create(ref)); dense_x = dense_type::create(exec); dense_y = dense_type::create(exec); @@ -423,7 +420,8 @@ class VectorReductions : public ::testing::Test { void SetUp() override { ASSERT_GT(comm.size(), 0); - init_executor(gko::ReferenceExecutor::create(), exec); + init_executor(gko::ReferenceExecutor::create(), exec, comm); + exec->add_logger(logger); } void TearDown() override @@ -600,7 +598,7 @@ class VectorLocalOps : public ::testing::Test { void SetUp() override { ASSERT_GT(comm.size(), 0); - init_executor(gko::ReferenceExecutor::create(), exec); + init_executor(gko::ReferenceExecutor::create(), exec, comm); } void TearDown() override diff --git a/third_party/gtest/CMakeLists.txt b/third_party/gtest/CMakeLists.txt index 77698fd7a9f..66b15b975ef 100644 --- a/third_party/gtest/CMakeLists.txt +++ b/third_party/gtest/CMakeLists.txt @@ -3,7 +3,7 @@ include(FetchContent) FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.11.0 + GIT_TAG main ) # need to set the variables in CACHE due to CMP0077 set(gtest_disable_pthreads ON CACHE INTERNAL "") From e15b1e4e6c45ddd11921e4c9c86f8c84cdeff908 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 9 Mar 2022 15:17:07 +0100 Subject: [PATCH 2/8] fix formatting and test --- common/cuda_hip/distributed/vector_kernels.hpp.inc | 9 ++++++--- cuda/distributed/vector_kernels.cu | 6 +++--- dpcpp/distributed/vector_kernels.dp.cpp | 5 +++-- hip/distributed/vector_kernels.hip.cpp | 6 +++--- test/mpi/distributed/vector.cpp | 11 ++++++----- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.hpp.inc index 28be6a512a7..f21bbb2d706 100644 --- a/common/cuda_hip/distributed/vector_kernels.hpp.inc +++ b/common/cuda_hip/distributed/vector_kernels.hpp.inc @@ -59,7 +59,8 @@ void build_local( // the flat_idx_it is used by the scatter_if as an index map for the values auto map_to_local_row = [range_bounds, range_starting_indices] GKO_THRUST_LAMBDA( - const auto& idx_range_id) -> LocalIndexType { + const thrust::tuple& idx_range_id) + -> LocalIndexType { const auto idx = thrust::get<0>(idx_range_id); const auto rid = thrust::get<1>(idx_range_id); return static_cast(idx - range_bounds[rid]) + @@ -72,7 +73,9 @@ void build_local( auto stride = local_mtx->get_stride(); auto map_to_flat_idx = - [stride] GKO_THRUST_LAMBDA(const auto& row_col) -> size_type { + [stride] GKO_THRUST_LAMBDA( + const thrust::tuple& row_col) + -> size_type { return thrust::get<0>(row_col) * stride + thrust::get<1>(row_col); }; auto flat_idx_it = thrust::make_transform_iterator( @@ -81,7 +84,7 @@ void build_local( map_to_flat_idx); auto is_local_row = [part_ids, - local_part] GKO_THRUST_LAMBDA(const auto rid) { + local_part] GKO_THRUST_LAMBDA(const size_type rid) { return part_ids[rid] == local_part; }; thrust::scatter_if(thrust::device, input.get_const_values(), diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu index 127bd2b06c3..168bc4eabdc 100644 --- a/cuda/distributed/vector_kernels.cu +++ b/cuda/distributed/vector_kernels.cu @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/distributed/vector_kernels.hpp" -#include - - #include #include #include @@ -44,6 +41,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + namespace gko { namespace kernels { namespace cuda { diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp index 6b4a1a079ea..45ffed4c1ed 100644 --- a/dpcpp/distributed/vector_kernels.dp.cpp +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -33,12 +33,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/distributed/vector_kernels.hpp" -#include - #include #include +#include + + namespace gko { namespace kernels { namespace dpcpp { diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp index 58bac94d28f..bbc7ee1eb1b 100644 --- a/hip/distributed/vector_kernels.hip.cpp +++ b/hip/distributed/vector_kernels.hip.cpp @@ -33,9 +33,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/distributed/vector_kernels.hpp" -#include - - #include @@ -47,6 +44,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + namespace gko { namespace kernels { namespace hip { diff --git a/test/mpi/distributed/vector.cpp b/test/mpi/distributed/vector.cpp index 3aeaa62985c..23dd87ba062 100644 --- a/test/mpi/distributed/vector.cpp +++ b/test/mpi/distributed/vector.cpp @@ -114,12 +114,13 @@ class VectorCreation : public ::testing::Test { size{local_size[1] * comm.size(), 11}, md{{0, 1}, {2, 3}, {4, 5}, {6, 7}, {8, 9}, {10, 11}}, md_localized{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}, {{8, 9}, {10, 11}}} - {} + { + init_executor(gko::ReferenceExecutor::create(), exec, comm); + } void SetUp() override { ASSERT_EQ(this->comm.size(), 3); - init_executor(gko::ReferenceExecutor::create(), exec, comm); } void TearDown() override @@ -373,7 +374,10 @@ class VectorReductions : public ::testing::Test { size{53, 11}, engine(42) { + init_executor(gko::ReferenceExecutor::create(), exec, comm); + logger = gko::share(HostToDeviceLogger::create(ref)); + exec->add_logger(logger); dense_x = dense_type::create(exec); dense_y = dense_type::create(exec); @@ -420,8 +424,6 @@ class VectorReductions : public ::testing::Test { void SetUp() override { ASSERT_GT(comm.size(), 0); - init_executor(gko::ReferenceExecutor::create(), exec, comm); - exec->add_logger(logger); } void TearDown() override @@ -598,7 +600,6 @@ class VectorLocalOps : public ::testing::Test { void SetUp() override { ASSERT_GT(comm.size(), 0); - init_executor(gko::ReferenceExecutor::create(), exec, comm); } void TearDown() override From 9bd29195c49c1301f29907d1a189857e1f76b81d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 23 May 2022 10:42:09 +0200 Subject: [PATCH 3/8] always mark lambdas as host device --- common/cuda_hip/distributed/vector_kernels.hpp.inc | 12 ++++++------ cuda/distributed/vector_kernels.cu | 6 ------ hip/distributed/vector_kernels.hip.cpp | 6 ------ 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.hpp.inc index f21bbb2d706..504f7153062 100644 --- a/common/cuda_hip/distributed/vector_kernels.hpp.inc +++ b/common/cuda_hip/distributed/vector_kernels.hpp.inc @@ -58,7 +58,7 @@ void build_local( // array // the flat_idx_it is used by the scatter_if as an index map for the values auto map_to_local_row = - [range_bounds, range_starting_indices] GKO_THRUST_LAMBDA( + [range_bounds, range_starting_indices] __host__ __device__( const thrust::tuple& idx_range_id) -> LocalIndexType { const auto idx = thrust::get<0>(idx_range_id); @@ -73,7 +73,7 @@ void build_local( auto stride = local_mtx->get_stride(); auto map_to_flat_idx = - [stride] GKO_THRUST_LAMBDA( + [stride] __host__ __device__( const thrust::tuple& row_col) -> size_type { return thrust::get<0>(row_col) * stride + thrust::get<1>(row_col); @@ -83,10 +83,10 @@ void build_local( thrust::make_tuple(local_row_it, input.get_const_col_idxs())), map_to_flat_idx); - auto is_local_row = [part_ids, - local_part] GKO_THRUST_LAMBDA(const size_type rid) { - return part_ids[rid] == local_part; - }; + auto is_local_row = + [part_ids, local_part] __host__ __device__(const size_type rid) { + return part_ids[rid] == local_part; + }; thrust::scatter_if(thrust::device, input.get_const_values(), input.get_const_values() + input.get_num_elems(), flat_idx_it, range_id.get_data(), diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu index 168bc4eabdc..def3fc8ec87 100644 --- a/cuda/distributed/vector_kernels.cu +++ b/cuda/distributed/vector_kernels.cu @@ -50,15 +50,9 @@ namespace cuda { namespace distributed_vector { -#define GKO_THRUST_LAMBDA __device__ - - #include "common/cuda_hip/distributed/vector_kernels.hpp.inc" -#undef GKO_THRUST_LAMBDA - - } // namespace distributed_vector } // namespace cuda } // namespace kernels diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp index bbc7ee1eb1b..6cbfa1224e9 100644 --- a/hip/distributed/vector_kernels.hip.cpp +++ b/hip/distributed/vector_kernels.hip.cpp @@ -53,15 +53,9 @@ namespace hip { namespace distributed_vector { -#define GKO_THRUST_LAMBDA __device__ __host__ - - #include "common/cuda_hip/distributed/vector_kernels.hpp.inc" -#undef GKO_THRUST_LAMBDA - - } // namespace distributed_vector } // namespace hip } // namespace kernels From fb270381f52192a4c4225731a7fe81baed7557e6 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 23 May 2022 10:43:33 +0200 Subject: [PATCH 4/8] wip: update dpcpp kernels --- dpcpp/distributed/vector_kernels.dp.cpp | 47 +++++++++++++++---------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp index 45ffed4c1ed..6534418b454 100644 --- a/dpcpp/distributed/vector_kernels.dp.cpp +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -30,11 +30,16 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include "core/distributed/vector_kernels.hpp" - +// force-top: on +// oneDPL needs to be first to avoid issues with libstdc++ TBB impl #include +#include #include +// force-top: off + + +#include "core/distributed/vector_kernels.hpp" #include @@ -81,25 +86,31 @@ void build_local( return static_cast(idx - range_bounds[rid]) + range_starting_indices[rid]; }; - auto local_row_it = oneapi::dpl::make_transform_iterator( - oneapi::dpl::make_zip_iterator(input.get_const_row_idxs(), - range_id.get_data()), - map_to_local_row); - - auto flat_idx_it = oneapi::dpl::make_permutation_iterator( - local_mtx->get_values(), - [local_row_it, cols = input.get_const_col_idxs(), - stride = local_mtx->get_stride()](const auto i) { - return local_row_it[i] * stride + cols[i]; + Array flat_idx_map{exec, input.get_num_elems()}; + auto zip_it = oneapi::dpl::make_zip_iterator(input.get_const_row_idxs(), + input.get_const_col_idxs(), + range_id.get_const_data()); + oneapi::dpl::transform( + policy, zip_it, zip_it + input.get_num_elems(), flat_idx_map.get_data(), + [cols = input.get_const_col_idxs(), stride = local_mtx->get_stride(), + map_to_local_row](const auto t) { + auto [row, col, rid] = t; + auto local_row = map_to_local_row(std::make_tuple(row, rid)); + return local_row * stride + col; }); + auto flat_idx_it = oneapi::dpl::make_permutation_iterator( + local_mtx->get_values(), flat_idx_map.get_data()); - auto is_local_row = [range_id = range_id.get_data(), part_ids, - local_part](const auto i) { - return part_ids[range_id[i]] == local_part; + auto is_local_row = [part_ids, local_part](const auto t) { + return part_ids[std::get<1>(t)] == local_part; }; - oneapi::dpl::copy_if(policy, input.get_const_values(), - input.get_const_values() + input.get_num_elems(), - flat_idx_it, is_local_row); + auto value_rid_it = oneapi::dpl::make_zip_iterator( + input.get_const_values(), range_id.get_const_data()); + oneapi::dpl::copy_if(policy, value_rid_it, + value_rid_it + input.get_num_elems(), + oneapi::dpl::make_zip_iterator( + flat_idx_it, oneapi::dpl::discard_iterator()), + is_local_row); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( From f441fdf2cf8616b4d837efcd853186087b6ae83a Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 23 May 2022 10:45:48 +0200 Subject: [PATCH 5/8] disable dpcpp kernels The current implementation is not working, so it's disabled to push the cuda & hip implementation --- dpcpp/distributed/vector_kernels.dp.cpp | 66 +------------------------ test/distributed/CMakeLists.txt | 2 +- test/mpi/distributed/vector.cpp | 4 +- 3 files changed, 5 insertions(+), 67 deletions(-) diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp index 6534418b454..f1a7e49d876 100644 --- a/dpcpp/distributed/vector_kernels.dp.cpp +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -31,14 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -// force-top: on -// oneDPL needs to be first to avoid issues with libstdc++ TBB impl -#include -#include -#include -// force-top: off - - #include "core/distributed/vector_kernels.hpp" @@ -56,62 +48,8 @@ void build_local( std::shared_ptr exec, const device_matrix_data& input, const distributed::Partition* partition, - comm_index_type local_part, matrix::Dense* local_mtx) -{ - const auto* range_bounds = partition->get_range_bounds(); - const auto* range_starting_indices = - partition->get_range_starting_indices(); - const auto* part_ids = partition->get_part_ids(); - const auto num_ranges = partition->get_num_ranges(); - auto policy = - oneapi::dpl::execution::make_device_policy(*exec->get_queue()); - - Array range_id{exec, input.get_num_elems()}; - oneapi::dpl::upper_bound(policy, range_bounds + 1, - range_bounds + num_ranges + 1, - input.get_const_row_idxs(), - input.get_const_row_idxs() + input.get_num_elems(), - range_id.get_data()); - - // write values with local rows into the local matrix at the correct index - // this needs the following iterators: - // - local_row_it: (global_row, range_id) -> local row index - // - flat_idx_it: (index) -> flat index (row[index] * stride + col[index]) - // in local matrix values array - // the flat_idx_it is used by the copy_if as a permutation index for the - // values - auto map_to_local_row = [range_bounds, range_starting_indices]( - const auto& idx_range_id) -> LocalIndexType { - const auto [idx, rid] = idx_range_id; - return static_cast(idx - range_bounds[rid]) + - range_starting_indices[rid]; - }; - Array flat_idx_map{exec, input.get_num_elems()}; - auto zip_it = oneapi::dpl::make_zip_iterator(input.get_const_row_idxs(), - input.get_const_col_idxs(), - range_id.get_const_data()); - oneapi::dpl::transform( - policy, zip_it, zip_it + input.get_num_elems(), flat_idx_map.get_data(), - [cols = input.get_const_col_idxs(), stride = local_mtx->get_stride(), - map_to_local_row](const auto t) { - auto [row, col, rid] = t; - auto local_row = map_to_local_row(std::make_tuple(row, rid)); - return local_row * stride + col; - }); - auto flat_idx_it = oneapi::dpl::make_permutation_iterator( - local_mtx->get_values(), flat_idx_map.get_data()); - - auto is_local_row = [part_ids, local_part](const auto t) { - return part_ids[std::get<1>(t)] == local_part; - }; - auto value_rid_it = oneapi::dpl::make_zip_iterator( - input.get_const_values(), range_id.get_const_data()); - oneapi::dpl::copy_if(policy, value_rid_it, - value_rid_it + input.get_num_elems(), - oneapi::dpl::make_zip_iterator( - flat_idx_it, oneapi::dpl::discard_iterator()), - is_local_row); -} + comm_index_type local_part, + matrix::Dense* local_mtx) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/test/distributed/CMakeLists.txt b/test/distributed/CMakeLists.txt index 4efad911912..7affdfc066d 100644 --- a/test/distributed/CMakeLists.txt +++ b/test/distributed/CMakeLists.txt @@ -1,2 +1,2 @@ ginkgo_create_common_test(partition_kernels DISABLE_EXECUTORS dpcpp) -ginkgo_create_common_test(vector_kernels) +ginkgo_create_common_test(vector_kernels DISABLE_EXECUTORS dpcpp) diff --git a/test/mpi/distributed/vector.cpp b/test/mpi/distributed/vector.cpp index 23dd87ba062..9d6a8deb443 100644 --- a/test/mpi/distributed/vector.cpp +++ b/test/mpi/distributed/vector.cpp @@ -147,7 +147,7 @@ class VectorCreation : public ::testing::Test { TYPED_TEST_SUITE(VectorCreation, gko::test::ValueLocalGlobalIndexTypes); -//#ifdef GKO_COMPILING_REFERENCE +#ifndef GKO_COMPILING_DPCPP TYPED_TEST(VectorCreation, CanReadGlobalMatrixData) @@ -321,7 +321,7 @@ TYPED_TEST(VectorCreation, CanReadLocalMatrixDataSomeEmpty) } -//#endif +#endif TYPED_TEST(VectorCreation, CanCreateFromLocalVectorAndSize) From 76317cb035ffa479aab975d48a7cc270db04ba45 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Thu, 2 Jun 2022 12:22:24 +0000 Subject: [PATCH 6/8] Format files Co-authored-by: Marcel Koch --- dpcpp/distributed/vector_kernels.dp.cpp | 1 - test/distributed/vector_kernels.cpp | 6 +++--- test/mpi/distributed/vector.cpp | 15 +++------------ 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp index f1a7e49d876..a51c9e22669 100644 --- a/dpcpp/distributed/vector_kernels.dp.cpp +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include "core/distributed/vector_kernels.hpp" diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp index a5bd8a4dfaa..fa32756f560 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -30,6 +30,9 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include "core/distributed/vector_kernels.hpp" + + #include #include #include @@ -43,10 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/distributed/vector_kernels.hpp" #include "core/test/utils.hpp" - - #include "test/utils/executor.hpp" diff --git a/test/mpi/distributed/vector.cpp b/test/mpi/distributed/vector.cpp index 9d6a8deb443..c7eb1305e19 100644 --- a/test/mpi/distributed/vector.cpp +++ b/test/mpi/distributed/vector.cpp @@ -118,10 +118,7 @@ class VectorCreation : public ::testing::Test { init_executor(gko::ReferenceExecutor::create(), exec, comm); } - void SetUp() override - { - ASSERT_EQ(this->comm.size(), 3); - } + void SetUp() override { ASSERT_EQ(this->comm.size(), 3); } void TearDown() override { @@ -421,10 +418,7 @@ class VectorReductions : public ::testing::Test { y = gko::clone(exec, tmp_y); } - void SetUp() override - { - ASSERT_GT(comm.size(), 0); - } + void SetUp() override { ASSERT_GT(comm.size(), 0); } void TearDown() override { @@ -597,10 +591,7 @@ class VectorLocalOps : public ::testing::Test { complex = complex_dist_vec_type::create(exec, comm); } - void SetUp() override - { - ASSERT_GT(comm.size(), 0); - } + void SetUp() override { ASSERT_GT(comm.size(), 0); } void TearDown() override { From ebdd6d5e0b16035651c2c43bde2d448e6556bc86 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Mon, 20 Jun 2022 15:15:12 +0200 Subject: [PATCH 7/8] review updates: - formatting Co-authored-by: Tobias Ribizel --- .../distributed/vector_kernels.hpp.inc | 20 +++++++++---------- test/distributed/vector_kernels.cpp | 7 +++---- third_party/gtest/CMakeLists.txt | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.hpp.inc index 504f7153062..f99fc1fd28c 100644 --- a/common/cuda_hip/distributed/vector_kernels.hpp.inc +++ b/common/cuda_hip/distributed/vector_kernels.hpp.inc @@ -59,13 +59,12 @@ void build_local( // the flat_idx_it is used by the scatter_if as an index map for the values auto map_to_local_row = [range_bounds, range_starting_indices] __host__ __device__( - const thrust::tuple& idx_range_id) - -> LocalIndexType { - const auto idx = thrust::get<0>(idx_range_id); - const auto rid = thrust::get<1>(idx_range_id); - return static_cast(idx - range_bounds[rid]) + - range_starting_indices[rid]; - }; + const thrust::tuple& idx_range_id) { + const auto idx = thrust::get<0>(idx_range_id); + const auto rid = thrust::get<1>(idx_range_id); + return static_cast(idx - range_bounds[rid]) + + range_starting_indices[rid]; + }; auto local_row_it = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(input.get_const_row_idxs(), range_id.get_data())), @@ -74,10 +73,9 @@ void build_local( auto stride = local_mtx->get_stride(); auto map_to_flat_idx = [stride] __host__ __device__( - const thrust::tuple& row_col) - -> size_type { - return thrust::get<0>(row_col) * stride + thrust::get<1>(row_col); - }; + const thrust::tuple& row_col) { + return thrust::get<0>(row_col) * stride + thrust::get<1>(row_col); + }; auto flat_idx_it = thrust::make_transform_iterator( thrust::make_zip_iterator( thrust::make_tuple(local_row_it, input.get_const_col_idxs())), diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp index fa32756f560..2254867c957 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -71,10 +71,9 @@ class Vector : public ::testing::Test { using global_entry = gko::matrix_data_entry; using mtx = gko::matrix::Dense; - Vector() : ref(gko::ReferenceExecutor::create()), engine(42) {} - - - void SetUp() { init_executor(ref, exec); } + Vector() : ref(gko::ReferenceExecutor::create()), engine(42) { + init_executor(ref, exec); + } void TearDown() { diff --git a/third_party/gtest/CMakeLists.txt b/third_party/gtest/CMakeLists.txt index 66b15b975ef..77698fd7a9f 100644 --- a/third_party/gtest/CMakeLists.txt +++ b/third_party/gtest/CMakeLists.txt @@ -3,7 +3,7 @@ include(FetchContent) FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG main + GIT_TAG release-1.11.0 ) # need to set the variables in CACHE due to CMP0077 set(gtest_disable_pthreads ON CACHE INTERNAL "") From 5daaddcbf98f82bfcf1a16b9f4fb4c764c39cdc9 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Mon, 20 Jun 2022 13:43:56 +0000 Subject: [PATCH 8/8] Format files Co-authored-by: Marcel Koch --- test/distributed/vector_kernels.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/distributed/vector_kernels.cpp b/test/distributed/vector_kernels.cpp index 2254867c957..93f7f8e10e5 100644 --- a/test/distributed/vector_kernels.cpp +++ b/test/distributed/vector_kernels.cpp @@ -71,7 +71,8 @@ class Vector : public ::testing::Test { using global_entry = gko::matrix_data_entry; using mtx = gko::matrix::Dense; - Vector() : ref(gko::ReferenceExecutor::create()), engine(42) { + Vector() : ref(gko::ReferenceExecutor::create()), engine(42) + { init_executor(ref, exec); }