From 5e7a858a0e1573e16b3b8fecf4fa54339b4718a5 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai"
Date: Wed, 24 Nov 2021 15:33:50 +0800
Subject: [PATCH 1/6] dpcpp port isai (row_limit=32)

---
 cuda/preconditioner/isai_kernels.cu        |   3 +-
 dev_tools/oneapi/convert_source.sh         |   3 +-
 dpcpp/base/math.hpp                        |  61 ++
 dpcpp/components/intrinsics.dp.hpp         |  88 +++
 dpcpp/components/merging.dp.hpp            | 337 ++++++++++
 dpcpp/components/reduction.dp.hpp          |   4 +-
 dpcpp/components/searching.dp.hpp          | 260 ++++++++
 dpcpp/components/warp_blas.dp.hpp          | 474 ++++++++++++++
 dpcpp/preconditioner/isai_kernels.dp.cpp   | 670 +++++++++++++++++++-
 dpcpp/test/CMakeLists.txt                  |   1 +
 dpcpp/test/preconditioner/CMakeLists.txt   |   1 +
 dpcpp/test/preconditioner/isai_kernels.cpp | 683 +++++++++++++++++++++
 hip/preconditioner/isai_kernels.hip.cpp    |   4 +-
 13 files changed, 2550 insertions(+), 39 deletions(-)
 create mode 100644 dpcpp/base/math.hpp
 create mode 100644 dpcpp/components/intrinsics.dp.hpp
 create mode 100644 dpcpp/components/merging.dp.hpp
 create mode 100644 dpcpp/components/searching.dp.hpp
 create mode 100644 dpcpp/components/warp_blas.dp.hpp
 create mode 100644 dpcpp/test/preconditioner/CMakeLists.txt
 create mode 100644 dpcpp/test/preconditioner/isai_kernels.cpp

diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu
index 3cc0bebf123..96808c8bf03 100644
--- a/cuda/preconditioner/isai_kernels.cu
+++ b/cuda/preconditioner/isai_kernels.cu
@@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
+#include "cuda/components/warp_blas.cuh"
 
 
 namespace gko {
@@ -66,8 +67,6 @@ constexpr int subwarps_per_block{2};
 constexpr int default_block_size{subwarps_per_block * subwarp_size};
 
 
-#include "common/cuda_hip/components/atomic.hpp.inc"
-#include "common/cuda_hip/components/warp_blas.hpp.inc"
 #include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc"
 
diff --git a/dev_tools/oneapi/convert_source.sh b/dev_tools/oneapi/convert_source.sh
index b0ed62252f7..209b9d7e8e6 100755
--- a/dev_tools/oneapi/convert_source.sh
+++ b/dev_tools/oneapi/convert_source.sh
@@ -16,8 +16,9 @@
 # If GTEST_HEADER_DIR is available elsewhere GINKGO_BUILD_TESTS is not required.
 # CMake's step is not required if copying the ginkgo config.hpp from another ginkgo build into "${ROOT_DIR}/include/ginkgo/".
 # ROOT_BUILD_DIR: the complete path for build folder. The default is "${ROOT_DIR}/${BUILD_DIR}"
-# GTEST_HEADER_DIR: the gtest header folder. The default is "${ROOT_BUILD_DIR}/third_party_gtest/src/googletest/include"
+# GTEST_HEADER_DIR: the gtest header folder. The default is "${ROOT_BUILD_DIR}/_deps/googletest-src/googletest/include"
 # CLANG_FORMAT: the clang-format exec. The default is "clang-format"
+# VERBOSE: if it is set to 1, the script will output the path information
 CURRENT_DIR="$( pwd )"
 cd "$( dirname "${BASH_SOURCE[0]}" )"
 SCRIPT_DIR="$( pwd )"
diff --git a/dpcpp/base/math.hpp b/dpcpp/base/math.hpp
new file mode 100644
index 00000000000..a9fa1a38a39
--- /dev/null
+++ b/dpcpp/base/math.hpp
@@ -0,0 +1,61 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_BASE_MATH_HPP_
#define GKO_DPCPP_BASE_MATH_HPP_


#include <limits>


#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>


namespace gko {
namespace kernels {
namespace dpcpp {


template <typename T>
struct device_numeric_limits {
    static constexpr auto inf = std::numeric_limits<T>::infinity();
    static constexpr auto max = std::numeric_limits<T>::max();
    static constexpr auto min = std::numeric_limits<T>::min();
};


}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko

#endif  // GKO_DPCPP_BASE_MATH_HPP_
diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp
new file mode 100644
index 00000000000..c0e5a445120
--- /dev/null
+++ b/dpcpp/components/intrinsics.dp.hpp
@@ -0,0 +1,88 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_COMPONENTS_INTRINSICS_DP_HPP_
#define GKO_DPCPP_COMPONENTS_INTRINSICS_DP_HPP_


#include <CL/sycl.hpp>


#include <ginkgo/core/base/types.hpp>


#include "dpcpp/base/dpct.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {


/**
 * @internal
 * Returns the number of set bits in the given mask.
 */
__dpct_inline__ int popcnt(uint32 mask) { return sycl::popcount(mask); }

/** @copydoc popcnt */
__dpct_inline__ int popcnt(uint64 mask) { return sycl::popcount(mask); }


/**
 * @internal
 * Returns the (1-based!) index of the first set bit in the given mask,
 * starting from the least significant bit.
 */
__dpct_inline__ int ffs(uint32 mask)
{
    // isolate the lowest set bit before locating it with clz, so the result
    // refers to the least significant set bit; sycl::clz(0) == 32, so an
    // empty mask correctly yields 0
    return 32 - sycl::clz(mask & (~mask + 1));
}

/** @copydoc ffs */
__dpct_inline__ int ffs(uint64 mask)
{
    // same bit isolation as above; sycl::clz(0) == 64 for 64-bit masks
    return 64 - sycl::clz(mask & (~mask + 1));
}


/**
 * @internal
 * Returns the number of zero bits before the first set bit in the given mask,
 * starting from the most significant bit.
 */
__dpct_inline__ int clz(uint32 mask) { return sycl::clz(mask); }

/** @copydoc clz */
__dpct_inline__ int clz(uint64 mask) { return sycl::clz(mask); }


}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko


#endif  // GKO_DPCPP_COMPONENTS_INTRINSICS_DP_HPP_
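To make the bit conventions above concrete: for mask = 0b101000u (bits 3 and
5 set), popcnt returns 2, ffs returns 4 (the 1-based position of bit 3), and
clz returns 26 (the number of leading zeros above bit 5). A hypothetical
host-side check, assuming the SYCL built-ins are callable on the host as in
DPC++ (not part of the patch):

    #include <cassert>
    #include <cstdint>
    #include <CL/sycl.hpp>

    int main()
    {
        const std::uint32_t mask = 0b101000u;  // bits 3 and 5 set
        assert(sycl::popcount(mask) == 2);     // two set bits
        // lowest set bit is bit 3 -> 1-based index 4
        assert(32 - sycl::clz(mask & (~mask + 1u)) == 4);
        assert(sycl::clz(mask) == 26);         // 26 zeros above bit 5
    }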
diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp
new file mode 100644
index 00000000000..67ee87f1b60
--- /dev/null
+++ b/dpcpp/components/merging.dp.hpp
@@ -0,0 +1,337 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_COMPONENTS_MERGING_DP_HPP_
#define GKO_DPCPP_COMPONENTS_MERGING_DP_HPP_


#include <CL/sycl.hpp>


#include <ginkgo/core/base/math.hpp>


#include "core/base/utils.hpp"
#include "dpcpp/base/dpct.hpp"
#include "dpcpp/base/math.hpp"
#include "dpcpp/components/intrinsics.dp.hpp"
#include "dpcpp/components/searching.dp.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {
namespace detail {


/**
 * @internal
 * The result from the @ref group_merge_step function.
 */
template <typename ValueType>
struct merge_result {
    /** The element of a being merged in the current thread. */
    ValueType a_val;
    /** The element of b being merged in the current thread. */
    ValueType b_val;
    /** The index from a that is being merged in the current thread. */
    int a_idx;
    /** The index from b that is being merged in the current thread. */
    int b_idx;
    /** The number of elements from a that have been merged in total. */
    int a_advance;
    /** The number of elements from b that have been merged in total. */
    int b_advance;
};

}  // namespace detail


/**
 * @internal
 * Warp-parallel merge algorithm that merges the first `warp_size` elements
 * from two ranges, where each thread stores a single element from each range.
 * It assumes that the elements are sorted in ascending order, i.e. for i < j,
 * the value of `a` at thread i is smaller or equal to the value at thread j,
 * and the same holds for `b`.
 *
 * This implementation is based on ideas from Green et al.,
 * "GPU merge path: a GPU merging algorithm", but uses random-access warp
 * shuffles instead of shared memory to exchange values of a and b.
 *
 * @param a      the element from the first range
 * @param b      the element from the second range
 * @param group  the cooperative group that executes the merge
 * @return a structure containing the merge result distributed over the group.
 */
template <int group_size, typename ValueType, typename Group>
__dpct_inline__ detail::merge_result<ValueType> group_merge_step(ValueType a,
                                                                 ValueType b,
                                                                 Group group)
{
    // thread i takes care of ith element of the merged sequence
    auto i = int(group.thread_rank());

    // we want to find the smallest index `x` such that a[x] >= b[i - x - 1]
    // or `i` if no such index exists
    //
    // if x = i then c[0...i - 1] = a[0...i - 1]
    //     => merge a[i] with b[0]
    // if x = 0 then c[0...i - 1] = b[0...i - 1]
    //     => merge a[0] with b[i]
    // otherwise c[0...i - 1] contains a[0...x - 1] and b[0...i - x - 1]
    // because the minimality of `x` implies
    //     b[i - x] >= a[x - 1]
    // and a[x] >= a[0...x - 1], b[0...i - x - 1]
    //     => merge a[x] with b[i - x]
    auto minx = synchronous_fixed_binary_search<group_size>([&](int x) {
        auto a_remote = group.shfl(a, x);
        auto b_remote = group.shfl(b, max(i - x - 1, 0));
        return a_remote >= b_remote || x >= i;
    });

    auto a_idx = minx;
    auto b_idx = max(i - minx, 0);
    auto a_val = group.shfl(a, a_idx);
    auto b_val = group.shfl(b, b_idx);
    auto cmp = a_val < b_val;
    auto a_advance = popcnt(group.ballot(cmp));
    auto b_advance = int(group.size()) - a_advance;

    return {a_val, b_val, a_idx, b_idx, a_advance, b_advance};
}


/**
 * @internal
 * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary
 * size. `merge_fn` will be called for each merged element.
 *
 * @param a        the first range
 * @param a_size   the size of the first range
 * @param b        the second range
 * @param b_size   the size of the second range
 * @param group    the group that executes the merge
 * @param merge_fn the callback that is being called for each merged element.
 *                 It takes six parameters:
 *                 `IndexType a_idx, ValueType a_val, IndexType b_idx,
 *                 ValueType b_val, IndexType c_index, bool valid`.
 *                 `*_val` and `*_idx` are the value and the index,
 *                 respectively, of the element from a/b being compared at
 *                 output index `c_index`.
 *                 `valid` specifies if the current thread has to merge an
 *                 element (this is necessary for shfl and ballot operations).
 *                 It must return `false` on all threads of the group iff the
 *                 merge shouldn't be continued.
 */
template <int group_size, typename ValueType, typename IndexType,
          typename Group, typename Callback>
__dpct_inline__ void group_merge(const ValueType* __restrict__ a,
                                 IndexType a_size,
                                 const ValueType* __restrict__ b,
                                 IndexType b_size, Group group,
                                 Callback merge_fn)
{
    auto c_size = a_size + b_size;
    IndexType a_begin{};
    IndexType b_begin{};
    auto lane = static_cast<IndexType>(group.thread_rank());
    auto sentinel = device_numeric_limits<ValueType>::max;
    auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel);
    auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel);
    for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) {
        auto merge_result = group_merge_step<group_size>(a_cur, b_cur, group);
        auto valid = c_begin + lane < c_size;
        auto cont = merge_fn(merge_result.a_idx + a_begin, merge_result.a_val,
                             merge_result.b_idx + b_begin, merge_result.b_val,
                             c_begin + lane, valid);
        if (!group.any(cont && valid)) {
            break;
        }
        auto a_advance = merge_result.a_advance;
        auto b_advance = merge_result.b_advance;
        a_begin += a_advance;
        b_begin += b_advance;

        // shuffle the unmerged elements to the front
        a_cur = group.shfl_down(a_cur, a_advance);
        b_cur = group.shfl_down(b_cur, b_advance);
        /*
         * To optimize memory access, we load the new elements for `a` and `b`
         * with a single load instruction:
         * the lower part of the group loads new elements for `a`
         * the upper part of the group loads new elements for `b`
         * `load_lane` is the part-local lane idx
         * The elements for `a` have to be shuffled up afterwards.
         */
        auto load_a = lane < a_advance;
        auto load_lane = load_a ? lane : lane - a_advance;
        auto load_source = load_a ? a : b;
        auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance;
        auto load_size = load_a ? a_size : b_size;

        auto load_idx = load_begin + load_lane;
        auto loaded = checked_load(load_source, load_idx, load_size, sentinel);
        // shuffle the `a` values to the end of the warp
        auto lower_loaded = group.shfl_up(loaded, b_advance);
        a_cur = lane < b_advance ? a_cur : lower_loaded;
        b_cur = lane < a_advance ? b_cur : loaded;
    }
}


/**
 * @internal
 * Warp-parallel merge algorithm that reports matching elements from two
 * sorted ranges of arbitrary size. `match_fn` will be called for each pair of
 * matching elements.
 *
 * @param a        the first range
 * @param a_size   the size of the first range
 * @param b        the second range
 * @param b_size   the size of the second range
 * @param group    the group that executes the merge
 * @param match_fn the callback that is being called for each matching pair.
 *                 It takes five parameters:
 *                 `ValueType val, IndexType a_idx, IndexType b_idx,
 *                 lane_mask_type match_mask, bool valid`.
 *                 `val` is the matching element, `*_idx` are the indices of
 *                 the matching values from a and b, match_mask is a lane mask
 *                 that is 1 for every subwarp lane that found a match.
 *                 `valid` is true iff there is actually a match
 *                 (necessary for warp-synchronous operations)
 */
template <int group_size, typename ValueType, typename IndexType,
          typename Group, typename Callback>
__dpct_inline__ void group_match(const ValueType* __restrict__ a,
                                 IndexType a_size,
                                 const ValueType* __restrict__ b,
                                 IndexType b_size, Group group,
                                 Callback match_fn)
{
    group_merge<group_size>(
        a, a_size, b, b_size, group,
        [&](IndexType a_idx, ValueType a_val, IndexType b_idx, ValueType b_val,
            IndexType, bool valid) {
            auto matchmask = group.ballot(a_val == b_val && valid);
            match_fn(a_val, a_idx, b_idx, matchmask, a_val == b_val && valid);
            return a_idx < a_size && b_idx < b_size;
        });
}


/**
 * @internal
 * Sequential merge algorithm that merges two sorted ranges of arbitrary
 * size. `merge_fn` will be called for each merged element.
 *
 * @param a        the first range
 * @param a_size   the size of the first range
 * @param b        the second range
 * @param b_size   the size of the second range
 * @param merge_fn the callback that will be called for each merge step.
 *                 It takes five parameters:
 *                 `IndexType a_idx, ValueType a_val,
 *                 IndexType b_idx, ValueType b_val, IndexType c_idx`.
 *                 `*_val` and `*_idx` are the value and the index,
 *                 respectively, of the element from a/b being compared in
 *                 step `c_idx`.
 *                 It must return `false` iff the merge should stop.
 */
template <typename ValueType, typename IndexType, typename Callback>
__dpct_inline__ void sequential_merge(const ValueType* __restrict__ a,
                                      IndexType a_size,
                                      const ValueType* __restrict__ b,
                                      IndexType b_size, Callback merge_fn)
{
    auto c_size = a_size + b_size;
    IndexType a_begin{};
    IndexType b_begin{};
    auto sentinel = device_numeric_limits<ValueType>::max;
    auto a_cur = checked_load(a, a_begin, a_size, sentinel);
    auto b_cur = checked_load(b, b_begin, b_size, sentinel);
    for (IndexType c_begin{}; c_begin < c_size; c_begin++) {
        auto cont = merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin);
        if (!cont) {
            break;
        }
        auto a_advance = a_cur < b_cur;
        auto b_advance = !a_advance;
        a_begin += a_advance;
        b_begin += b_advance;

        auto load = a_advance ? a : b;
        auto load_size = a_advance ? a_size : b_size;
        auto load_idx = a_advance ? a_begin : b_begin;
        auto loaded = checked_load(load, load_idx, load_size, sentinel);
        a_cur = a_advance ? loaded : a_cur;
        b_cur = b_advance ? loaded : b_cur;
    }
}


/**
 * @internal
 * Sequential algorithm that finds matching elements in two sorted ranges of
 * arbitrary size. `match_fn` will be called for each pair of matching
 * elements.
 *
 * @param a        the first range
 * @param a_size   the size of the first range
 * @param b        the second range
 * @param b_size   the size of the second range
 * @param match_fn the callback that is being called for each match.
 *                 It takes three parameters:
 *                 `ValueType val, IndexType a_idx, IndexType b_idx`.
 *                 `val` is the matching element, `*_idx` are the
 *                 indices of the matching values from a and b.
 */
template <typename ValueType, typename IndexType, typename Callback>
__dpct_inline__ void sequential_match(const ValueType* a, IndexType a_size,
                                      const ValueType* b, IndexType b_size,
                                      Callback match_fn)
{
    sequential_merge(a, a_size, b, b_size,
                     [&](IndexType a_idx, ValueType a_val, IndexType b_idx,
                         ValueType b_val, IndexType) {
                         if (a_val == b_val) {
                             match_fn(a_val, a_idx, b_idx);
                         }
                         return a_idx < a_size && b_idx < b_size;
                     });
}

}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko


#endif  // GKO_DPCPP_COMPONENTS_MERGING_DP_HPP_
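As a concrete illustration of the semantics shared by group_match and
sequential_match: merging a = {1, 3, 5} with b = {2, 3, 6} reports the single
match 3 at a_idx = 1, b_idx = 1. A simplified host-side analogue (hypothetical
reference code, not part of the patch; it advances both ranges on a match
instead of emulating the sentinel-based loop above):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    template <typename T, typename Fn>
    void sequential_match_ref(const std::vector<T>& a, const std::vector<T>& b,
                              Fn match_fn)
    {
        std::size_t i = 0;
        std::size_t j = 0;
        while (i < a.size() && j < b.size()) {
            if (a[i] == b[j]) {
                match_fn(a[i], i, j);
                ++i;
                ++j;
            } else if (a[i] < b[j]) {
                ++i;  // advance the range with the smaller front element
            } else {
                ++j;
            }
        }
    }

    int main()
    {
        sequential_match_ref<int>(
            {1, 3, 5}, {2, 3, 6}, [](int val, std::size_t i, std::size_t j) {
                // prints: match 3 at a[1], b[1]
                std::printf("match %d at a[%zu], b[%zu]\n", val, i, j);
            });
    }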
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index b1eb86a1992..74cbb66c451 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -114,8 +114,8 @@ __dpct_inline__ int choose_pivot(const Group& group, ValueType local_data,
 {
     using real = remove_complex<ValueType>;
     real lmag = is_pivoted ? -one<real>() : abs(local_data);
-    const auto pivot =
-        reduce(group, group.thread_rank(), [&](int lidx, int ridx) {
+    const auto pivot = ::gko::kernels::dpcpp::reduce(
+        group, group.thread_rank(), [&](int lidx, int ridx) {
             const auto rmag = group.shfl(lmag, ridx);
             if (rmag > lmag) {
                 lmag = rmag;
diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp
new file mode 100644
index 00000000000..4db6f1f7073
--- /dev/null
+++ b/dpcpp/components/searching.dp.hpp
@@ -0,0 +1,260 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_COMPONENTS_SEARCHING_DP_HPP_
#define GKO_DPCPP_COMPONENTS_SEARCHING_DP_HPP_


#include <CL/sycl.hpp>


#include "dpcpp/base/config.hpp"
#include "dpcpp/base/dpct.hpp"
#include "dpcpp/components/intrinsics.dp.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {


/**
 * @internal
 * Generic binary search that finds the first index where a predicate is true.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 * The implementation is based on Stepanov & McJones, "Elements of
 * Programming".
 *
 * @param offset the starting index of the partitioned range
 * @param length the length of the partitioned range
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `IndexType` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <typename IndexType, typename Predicate>
__dpct_inline__ IndexType binary_search(IndexType offset, IndexType length,
                                        Predicate p)
{
    while (length > 0) {
        auto half_length = length / 2;
        auto mid = offset + half_length;
        auto pred = p(mid);
        length = pred ? half_length : length - (half_length + 1);
        offset = pred ? offset : mid + 1;
    }
    return offset;
}


/**
 * @internal
 * Generic implementation of a fixed-size binary search.
 * The implementation makes sure that the number of predicate evaluations only
 * depends on `length` and not on the actual position of the partition point.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 *
 * @tparam size the length of the partitioned range - must be a power of two
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `int` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <int size, typename Predicate>
__dpct_inline__ int synchronous_fixed_binary_search(Predicate p)
{
    if (size == 0) {
        return 0;
    }
    int begin{};
    static_assert(size > 0, "size must be positive");
    static_assert(!(size & (size - 1)), "size must be a power of two");
#pragma unroll
    for (auto cur_size = size; cur_size > 1; cur_size /= 2) {
        auto half_size = cur_size / 2;
        auto mid = begin + half_size;
        // invariant: [begin, begin + cur_size] contains partition point
        begin = p(mid) ? begin : mid;
    }
    // cur_size is now 1, so the partition point is either begin or begin + 1
    return p(begin) ? begin : begin + 1;
}


/**
 * @internal
 * Generic implementation of a synchronous binary search.
 * The implementation makes sure that the number of predicate evaluations only
 * depends on `length` and not on the actual position of the partition point.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 *
 * @param size the length of the partitioned range - must be a power of two
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `int` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <typename Predicate>
__dpct_inline__ int synchronous_binary_search(int size, Predicate p)
{
    if (size == 0) {
        return 0;
    }
    int begin{};
    for (auto cur_size = size; cur_size > 1; cur_size /= 2) {
        auto half_size = cur_size / 2;
        auto mid = begin + half_size;
        // invariant: [begin, begin + cur_size] contains partition point
        begin = p(mid) ? begin : mid;
    }
    // cur_size is now 1, so the partition point is either begin or begin + 1
    return p(begin) ? begin : begin + 1;
}


/**
 * @internal
 * Generic search that finds the first index where a predicate is true.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 *
 * It executes `log2(length / group.size())` coalescing calls to `p`.
 *
 * This implementation is based on the w-wide search mentioned in
 * Green et al., "GPU merge path: a GPU merging algorithm"
 *
 * @param offset the starting index of the partitioned range
 * @param length the length of the partitioned range
 * @param group the coalescing group executing the search
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `IndexType` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <typename IndexType, typename Group, typename Predicate>
__dpct_inline__ IndexType group_wide_search(IndexType offset, IndexType length,
                                            Group group, Predicate p)
{
    // binary search on the group-sized blocks
    IndexType num_blocks = (length + group.size() - 1) / group.size();
    auto group_pos = binary_search(IndexType{}, num_blocks, [&](IndexType i) {
        auto idx = i * group.size();
        return p(offset + idx);
    });
    // case 1: p is true everywhere: middle is at the beginning
    if (group_pos == 0) {
        return offset;
    }
    /*
     * case 2: p is false somewhere:
     *
     * p(group_pos * g.size()) is true, so either this is the partition point,
     * or the partition point is one of the g.size() - 1 previous indices.
     *     |block group_pos-1|
     * 0 | 0 * * * * * * * | 1
     *       ^           ^
     * we load this range, with the 1 acting as a sentinel for ffs(...)
     *
     * additionally, this means that we can't call p out-of-bounds
     */
    auto base_idx = (group_pos - 1) * group.size() + 1;
    auto idx = base_idx + group.thread_rank();
    auto pos = ffs(group.ballot(idx >= length || p(offset + idx))) - 1;
    return offset + base_idx + pos;
}


/**
 * @internal
 * Generic search that finds the first index where a predicate is true.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 *
 * It executes `log2(length) / log2(group.size())` calls to `p` that
 * effectively follow a random-access pattern.
 *
 * This implementation is based on the w-partition search mentioned in
 * Green et al., "GPU merge path: a GPU merging algorithm"
 *
 * @param offset the starting index of the partitioned range
 * @param length the length of the partitioned range
 * @param group the coalescing group executing the search
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `IndexType` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <typename IndexType, typename Group, typename Predicate>
__dpct_inline__ IndexType group_ary_search(IndexType offset, IndexType length,
                                           Group group, Predicate p)
{
    IndexType end = offset + length;
    // invariant: [offset, offset + length] contains middle
    while (length > group.size()) {
        auto stride = length / group.size();
        auto idx = offset + group.thread_rank() * stride;
        auto mask = group.ballot(p(idx));
        // if the mask is 0, the partition point is in the last block
        // if the mask is ~0, the partition point is in the first block
        // otherwise, we go to the last block that returned a 0.
        auto pos = mask == 0 ? group.size() - 1 : ffs(mask >> 1) - 1;
        auto last_length = length - stride * (group.size() - 1);
        length = pos == group.size() - 1 ? last_length : stride;
        offset += stride * pos;
    }
    auto idx = offset + group.thread_rank();
    // if the mask is 0, the partition point is at the end
    // otherwise it is the first set bit
    auto mask = group.ballot(idx >= end || p(idx));
    auto pos = mask == 0 ? group.size() : ffs(mask) - 1;
    return offset + pos;
}


}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko


#endif  // GKO_DPCPP_COMPONENTS_SEARCHING_DP_HPP_
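All of the searches above share one contract: the predicate partitions the
range and the routine returns the partition point. A host-side reference of
that contract (hypothetical code for illustration, mirroring binary_search):

    #include <cassert>

    template <typename IndexType, typename Predicate>
    IndexType binary_search_ref(IndexType offset, IndexType length,
                                Predicate p)
    {
        while (length > 0) {
            auto half_length = length / 2;
            auto mid = offset + half_length;
            auto pred = p(mid);
            length = pred ? half_length : length - (half_length + 1);
            offset = pred ? offset : mid + 1;
        }
        return offset;
    }

    int main()
    {
        const int data[] = {1, 2, 4, 7, 9};
        // first index whose element is >= 5: data[3] == 7
        assert(binary_search_ref(0, 5, [&](int i) { return data[i] >= 5; }) ==
               3);
        // predicate false everywhere -> offset + length
        assert(binary_search_ref(0, 5, [&](int i) { return data[i] >= 10; }) ==
               5);
    }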
diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp
new file mode 100644
index 00000000000..7754cfb260d
--- /dev/null
+++ b/dpcpp/components/warp_blas.dp.hpp
@@ -0,0 +1,474 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_COMPONENTS_WARP_BLAS_DP_HPP_
#define GKO_DPCPP_COMPONENTS_WARP_BLAS_DP_HPP_


#include <cassert>
// #include
#include <type_traits>


#include <CL/sycl.hpp>


#include <ginkgo/core/base/math.hpp>


#include "dpcpp/base/dpct.hpp"
#include "dpcpp/components/cooperative_groups.dp.hpp"
#include "dpcpp/components/reduction.dp.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {


/**
 * @internal
 *
 * Defines a postprocessing transformation that should be performed on the
 * result of a function call.
 *
 * @note This functionality should become useless once accessors and ranges are
 *       in place, as they will define the storage scheme.
 */
enum postprocess_transformation { and_return, and_transpose };


/**
 * @internal
 *
 * Applies a Gauss-Jordan transformation (single step of Gauss-Jordan
 * elimination) to a `max_problem_size`-by-`max_problem_size` matrix using the
 * thread group `group`. Each thread contributes one `row` of the matrix, and
 * the routine uses warp shuffles to exchange data between rows. The transform
 * is performed by using the `key_row`-th row and `key_col`-th column of the
 * matrix.
 */
template <
    int max_problem_size, typename Group, typename ValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void apply_gauss_jordan_transform(
    const Group& __restrict__ group, int32 key_row, int32 key_col,
    ValueType* __restrict__ row, bool& __restrict__ status)
{
    auto key_col_elem = group.shfl(row[key_col], key_row);
    if (key_col_elem == zero<ValueType>()) {
        // TODO: implement error handling for GPUs to be able to properly
        //       report it here
        status = false;
        return;
    }
    if (group.thread_rank() == key_row) {
        key_col_elem = one<ValueType>() / key_col_elem;
    } else {
        key_col_elem = -row[key_col] / key_col_elem;
    }
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        const auto key_row_elem = group.shfl(row[i], key_row);
        if (group.thread_rank() == key_row) {
            row[i] = zero<ValueType>();
        }
        row[i] += key_col_elem * key_row_elem;
    }
    row[key_col] = key_col_elem;
}


/**
 * @internal
 *
 * Applies a Gauss-Jordan transformation (single step of Gauss-Jordan
 * elimination) to a `max_problem_size`-by-`max_problem_size` matrix using the
 * thread group `group`. Each thread contributes one `row` of the matrix, and
 * the routine uses warp shuffles to exchange data between rows. The transform
 * is performed by using the `key_row`-th row and `key_col`-th column of the
 * matrix.
 * Works with one right hand side vector `rhs` which can be directly worked on
 * when solving Ax = rhs without the need of storing the inverse of A.
 */
template <
    int max_problem_size, typename Group, typename ValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void apply_gauss_jordan_transform_with_rhs(
    const Group& __restrict__ group, int32 key_row, int32 key_col,
    ValueType* __restrict__ row, ValueType* __restrict__ rhs,
    bool& __restrict__ status)
{
    auto key_col_elem = group.shfl(row[key_col], key_row);
    auto key_rhs_elem = group.shfl(rhs[0], key_row);
    if (key_col_elem == zero<ValueType>()) {
        // TODO: implement error handling for GPUs to be able to properly
        //       report it here
        status = false;
        return;
    }
    if (group.thread_rank() == key_row) {
        key_col_elem = one<ValueType>() / key_col_elem;
        rhs[0] = key_rhs_elem * key_col_elem;
    } else {
        key_col_elem = -row[key_col] / key_col_elem;
        rhs[0] += key_rhs_elem * key_col_elem;
    }
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        const auto key_row_elem = group.shfl(row[i], key_row);
        if (group.thread_rank() == key_row) {
            row[i] = zero<ValueType>();
        }
        // rhs[0] += key_rhs_elem * key_row_elem;
        row[i] += key_col_elem * key_row_elem;
    }
    row[key_col] = key_col_elem;
}
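
// As a sanity check on the transform-with-rhs semantics: applying the
// transformation once per pivot reduces A to (a permutation of) the identity
// while rhs turns into the solution of Ax = rhs. A sequential analogue
// (hypothetical host-side sketch, without the pivoting used below):
//
//     void gauss_jordan_solve(double a[2][2], double rhs[2])
//     {
//         for (int piv = 0; piv < 2; ++piv) {
//             const double d = a[piv][piv];
//             for (int row = 0; row < 2; ++row) {
//                 if (row == piv) continue;
//                 const double factor = a[row][piv] / d;
//                 for (int col = 0; col < 2; ++col) {
//                     a[row][col] -= factor * a[piv][col];
//                 }
//                 rhs[row] -= factor * rhs[piv];
//             }
//         }
//         for (int row = 0; row < 2; ++row) {
//             rhs[row] /= a[row][row];
//         }
//     }
//
// For a = [[2, 1], [1, 3]] and rhs = [5, 10] this yields rhs = [1, 3], the
// solution of the system.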


/**
 * @internal
 *
 * Inverts a matrix using Gauss-Jordan elimination. The inversion is
 * done in-place, so the original matrix will be overridden with the inverse.
 * The inversion routine uses implicit pivoting, so the returned matrix will be
 * a permuted inverse (from both sides). To obtain the correct inverse, the
 * rows of the result should be permuted with $P$, and the columns with
 * $ P^T $ (i.e. $ A^{-1} = P X P $, where $ X $ is the returned matrix). These
 * permutation matrices are returned compressed as vectors `perm` and
 * `trans_perm`, respectively. The `i`-th value of each of the vectors is
 * returned to the thread of the group with rank `i`.
 *
 * @tparam max_problem_size the maximum problem size that will be passed to the
 *                          inversion routine (a tighter bound results in
 *                          faster code)
 * @tparam Group type of the group of threads
 * @tparam ValueType type of values stored in the matrix
 *
 * @param group the group of threads which participate in the inversion
 * @param problem_size the actual size of the matrix (cannot be larger than
 *                     max_problem_size)
 * @param row a pointer to the matrix row (i-th thread in the group should
 *            pass the pointer to the i-th row), has to have at least
 *            max_problem_size elements
 * @param perm a value to hold an element of permutation matrix $ P $
 * @param trans_perm a value to hold an element of permutation matrix $ P^T $
 *
 * @return true if the inversion succeeded, false otherwise
 */
template <
    int max_problem_size, typename Group, typename ValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ bool invert_block(const Group& __restrict__ group,
                                  uint32 problem_size,
                                  ValueType* __restrict__ row,
                                  uint32& __restrict__ perm,
                                  uint32& __restrict__ trans_perm)
{
    GKO_ASSERT(problem_size <= max_problem_size);
    // prevent rows after problem_size to become pivots
    auto pivoted = group.thread_rank() >= problem_size;
    auto status = true;
#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
#pragma unroll
#else
#pragma unroll 1
#endif
    for (int32 i = 0; i < max_problem_size; ++i) {
        if (i < problem_size) {
            const auto piv = choose_pivot(group, row[i], pivoted);
            if (group.thread_rank() == piv) {
                perm = i;
                pivoted = true;
            }
            if (group.thread_rank() == i) {
                trans_perm = piv;
            }
            apply_gauss_jordan_transform<max_problem_size>(group, piv, i, row,
                                                           status);
        }
    }
    return status;
}


/**
 * @internal
 *
 * Performs the correct index calculation for the given postprocess operation.
 */
template <postprocess_transformation mod, typename T1, typename T2,
          typename T3>
__dpct_inline__ auto get_row_major_index(T1 row, T2 col, T3 stride) ->
    typename std::enable_if<
        mod != and_transpose,
        typename std::decay<decltype(row * stride + col)>::type>::type
{
    return row * stride + col;
}


template <postprocess_transformation mod, typename T1, typename T2,
          typename T3>
__dpct_inline__ auto get_row_major_index(T1 row, T2 col, T3 stride) ->
    typename std::enable_if<
        mod == and_transpose,
        typename std::decay<decltype(col * stride + row)>::type>::type
{
    return col * stride + row;
}


/**
 * @internal
 *
 * Copies a matrix stored as a collection of rows in different threads of the
 * warp in a block of memory accessible by all threads in row-major order.
 * Optionally permutes rows and columns of the matrix in the process.
 *
 * @tparam max_problem_size maximum problem size passed to the routine
 * @tparam mod the transformation to perform on the return data
 * @tparam Group type of the group of threads
 * @tparam SourceValueType type of values stored in the source matrix
 * @tparam ResultValueType type of values stored in the result matrix
 *
 * @param group group of threads participating in the copy
 * @param problem_size actual size of the matrix
 *                     (`problem_size <= max_problem_size`)
 * @param source_row pointer to memory used to store a row of the source
 *                   matrix, the `i`-th thread of the sub-warp should pass in
 *                   the `i`-th row of the matrix
 * @param increment offset between two consecutive elements of the row
 * @param row_perm permutation vector to apply on the rows of the matrix
 *                 (thread `i` supplies the `i`-th value of the vector)
 * @param col_perm permutation vector to apply on the columns of the matrix
 *                 (thread `i` supplies the `i`-th value of the vector)
 * @param destination pointer to memory where the result will be stored
 *                    (all threads supply the same value)
 * @param stride offset between two consecutive rows of the matrix
 */
template <
    int max_problem_size, postprocess_transformation mod = and_return,
    typename Group, typename SourceValueType, typename ResultValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void copy_matrix(const Group& __restrict__ group,
                                 uint32 problem_size,
                                 const SourceValueType* __restrict__ source_row,
                                 uint32 increment, uint32 row_perm,
                                 uint32 col_perm,
                                 ResultValueType* __restrict__ destination,
                                 size_type stride)
{
    GKO_ASSERT(problem_size <= max_problem_size);
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        if (i < problem_size) {
            const auto idx = group.shfl(col_perm, i);
            if (group.thread_rank() < problem_size) {
                // Need to assign a variable for the source_row, or hip
                // will use a lot of VGPRs in unroll. This might lead to
                // problems.
                const auto val = source_row[i * increment];
                destination[get_row_major_index<mod>(idx, row_perm, stride)] =
                    static_cast<ResultValueType>(val);
            }
        }
    }
}


/**
 * @internal
 *
 * Multiplies a transposed vector and a matrix stored in column-major order.
 *
 * In mathematical terms, performs the operation $ res^T = vec^T \cdot mtx$.
 *
 * @tparam max_problem_size maximum problem size passed to the routine
 * @tparam Group type of the group of threads
 * @tparam MatrixValueType type of values stored in the matrix
 * @tparam VectorValueType type of values stored in the vectors
 *
 * @param group group of threads participating in the operation
 * @param problem_size actual size of the matrix
 *                     (`problem_size <= max_problem_size`)
 * @param vec input vector to multiply (thread `i` supplies the `i`-th value of
 *            the vector)
 * @param mtx_row pointer to memory used to store a row of the input matrix,
 *                `i`-th thread of the sub-warp should pass in the
 *                `i`-th row of the matrix
 * @param mtx_increment offset between two consecutive elements of the row
 * @param res pointer to a block of memory where the result will be written
 *            (only thread 0 of the group has to supply a valid value)
 * @param res_increment offset between two consecutive elements of the result
 */
template <
    int max_problem_size, typename Group, typename MatrixValueType,
    typename VectorValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void multiply_transposed_vec(
    const Group& __restrict__ group, uint32 problem_size,
    const VectorValueType& __restrict__ vec,
    const MatrixValueType* __restrict__ mtx_row, uint32 mtx_increment,
    VectorValueType* __restrict__ res, uint32 res_increment)
{
    GKO_ASSERT(problem_size <= max_problem_size);
    auto mtx_elem = zero<VectorValueType>();
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        if (i < problem_size) {
            if (group.thread_rank() < problem_size) {
                mtx_elem =
                    static_cast<VectorValueType>(mtx_row[i * mtx_increment]);
            }
            const auto out = ::gko::kernels::dpcpp::reduce(
                group, mtx_elem * vec,
                [](VectorValueType x, VectorValueType y) { return x + y; });
            if (group.thread_rank() == 0) {
                res[i * res_increment] = out;
            }
        }
    }
}


/**
 * @internal
 *
 * Multiplies a matrix and a vector stored in column-major order.
 *
 * In mathematical terms, performs the operation $res = mtx \cdot vec$.
 *
 * @tparam max_problem_size maximum problem size passed to the routine
 * @tparam Group type of the group of threads
 * @tparam MatrixValueType type of values stored in the matrix
 * @tparam VectorValueType type of values stored in the vectors
 * @tparam Closure type of the function used to write the result
 *
 * @param group group of threads participating in the operation
 * @param problem_size actual size of the matrix
 *                     (`problem_size <= max_problem_size`)
 * @param vec input vector to multiply (thread `i` supplies the `i`-th value of
 *            the vector)
 * @param mtx_row pointer to memory used to store a row of the input matrix,
 *                `i`-th thread of the sub-warp should pass in the
 *                `i`-th row of the matrix
 * @param mtx_increment offset between two consecutive elements of the row
 * @param res pointer to a block of memory where the result will be written
 *            (only thread 0 of the group has to supply a valid value)
 * @param res_increment offset between two consecutive elements of the result
 * @param closure_op Operation that is performed when writing to
 *                   `res[group.thread_rank() * res_increment]` as
 *                   `closure_op(res[group.thread_rank() * res_increment], out)`
 *                   where `out` is the result of $mtx \cdot vec$.
 */
template <
    int max_problem_size, typename Group, typename MatrixValueType,
    typename VectorValueType, typename Closure,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void multiply_vec(const Group& __restrict__ group,
                                  uint32 problem_size,
                                  const VectorValueType& __restrict__ vec,
                                  const MatrixValueType* __restrict__ mtx_row,
                                  uint32 mtx_increment,
                                  VectorValueType* __restrict__ res,
                                  uint32 res_increment, Closure closure_op)
{
    GKO_ASSERT(problem_size <= max_problem_size);
    auto mtx_elem = zero<VectorValueType>();
    auto out = zero<VectorValueType>();
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        if (i < problem_size) {
            if (group.thread_rank() < problem_size) {
                mtx_elem =
                    static_cast<VectorValueType>(mtx_row[i * mtx_increment]);
            }
            out += mtx_elem * group.shfl(vec, i);
        }
    }
    if (group.thread_rank() < problem_size) {
        closure_op(res[group.thread_rank() * res_increment], out);
    }
}


/**
 * @internal
 *
 * Computes the infinity norm of a matrix. Each thread in the group supplies
 * one row of the matrix.
 *
 * @tparam max_problem_size maximum problem size passed to the routine
 * @tparam Group type of the group of threads
 * @tparam ValueType type of values stored in the matrix
 *
 * @param group group of threads participating in the operation
 * @param num_rows number of rows of the matrix
 *                 (`num_rows <= max_problem_size`)
 * @param num_cols number of columns of the matrix
 * @param row pointer to memory used to store a row of the input matrix,
 *            `i`-th thread of the group should pass in the `i`-th row of the
 *            matrix
 *
 * @return the infinity norm of the matrix
 */
template <
    int max_problem_size, typename Group, typename ValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ remove_complex<ValueType> compute_infinity_norm(
    const Group& group, uint32 num_rows, uint32 num_cols, const ValueType* row)
{
    using result_type = remove_complex<ValueType>;
    auto sum = zero<result_type>();
    if (group.thread_rank() < num_rows) {
#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
#pragma unroll
#else
#pragma unroll 1
#endif
        for (uint32 i = 0; i < max_problem_size; ++i) {
            if (i < num_cols) {
                sum += abs(row[i]);
            }
        }
    }
    return ::gko::kernels::dpcpp::reduce(
        group, sum, [](result_type x, result_type y) { return max(x, y); });
}


}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko


#endif  // GKO_DPCPP_COMPONENTS_WARP_BLAS_DP_HPP_
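compute_infinity_norm reduces the per-row absolute sums with a max, i.e.
||A||_inf = max_i sum_j |a_ij|. A host-side reference (hypothetical code for
illustration only):

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    // infinity norm = maximum absolute row sum, row-major storage
    double infinity_norm_ref(const double* mtx, int num_rows, int num_cols)
    {
        double result = 0.0;
        for (int row = 0; row < num_rows; ++row) {
            double sum = 0.0;
            for (int col = 0; col < num_cols; ++col) {
                sum += std::abs(mtx[row * num_cols + col]);
            }
            result = std::max(result, sum);
        }
        return result;
    }

    int main()
    {
        const double mtx[] = {1.0, -2.0, 3.0, 0.5};   // [[1, -2], [3, 0.5]]
        assert(infinity_norm_ref(mtx, 2, 2) == 3.5);  // max(3.0, 3.5)
    }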
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index 63812f8ba2b..ef4a64c59e2 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -33,21 +33,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/preconditioner/isai_kernels.hpp"
 
 
-#include 
-#include 
-
-
 #include 
-#include 
 #include 
-#include 
+#include 
 #include 
 
 
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/merging.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/components/uninitialized_array.hpp"
+#include "dpcpp/components/warp_blas.dp.hpp"
 
 
 namespace gko {
@@ -55,32 +59,593 @@ namespace kernels {
 namespace dpcpp {
 /**
  * @brief The Isai preconditioner namespace.
- *
+ * @ref Isai
  * @ingroup isai
  */
 namespace isai {
 
 
-template <typename IndexType, typename Callback>
-void forall_matching(const IndexType* fst, IndexType fst_size,
-                     const IndexType* snd, IndexType snd_size,
-                     Callback cb) GKO_NOT_IMPLEMENTED;
+constexpr int subwarp_size{row_size_limit};
+constexpr int subwarps_per_block{2};
+constexpr int default_block_size{subwarps_per_block * subwarp_size};
+
+
+namespace kernel {
 
 
-template <typename ValueType, typename IndexType, typename Callable>
-void generic_generate(std::shared_ptr<const DpcppExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* mtx,
-                      matrix::Csr<ValueType, IndexType>* inverse_mtx,
-                      IndexType* excess_rhs_ptrs, IndexType* excess_nz_ptrs,
-                      Callable trs_solve) GKO_NOT_IMPLEMENTED;
+/**
+ * @internal
+ *
+ * This kernel supports at most `subwarp_size` (< `warp_size`) elements per
+ * row. If there are more elements, they are simply ignored. Only the first
+ * `subwarp_size` elements are considered both for the values and for the
+ * sparsity pattern.
+ */
template <int subwarp_size, typename ValueType, typename IndexType,
          typename Callable>
__dpct_inline__ void generic_generate(
    IndexType num_rows, const IndexType* __restrict__ m_row_ptrs,
    const IndexType* __restrict__ m_col_idxs,
    const ValueType* __restrict__ m_values,
    const IndexType* __restrict__ i_row_ptrs,
    const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values,
    IndexType* __restrict__ excess_rhs_sizes,
    IndexType* __restrict__ excess_nnz, Callable direct_solve,
    sycl::nd_item<3> item_ct1,
    UninitializedArray<ValueType, subwarp_size * subwarp_size *
                                      subwarps_per_block>* storage)
{
    static_assert(subwarp_size >= row_size_limit, "incompatible subwarp_size");
    const auto row =
        thread::get_subwarp_id_flat<subwarp_size, IndexType>(item_ct1);

    if (row >= num_rows) {
        return;
    }

    const auto i_row_begin = i_row_ptrs[row];
    const auto i_row_size = i_row_ptrs[row + 1] - i_row_begin;

    auto subwarp = group::tiled_partition<subwarp_size>(
        group::this_thread_block(item_ct1));
    const int local_id = subwarp.thread_rank();

    if (i_row_size > subwarp_size) {
        // defer long rows: store their nnz and number of matches
        IndexType count{};
        for (IndexType nz = 0; nz < i_row_size; ++nz) {
            auto col = i_col_idxs[i_row_begin + nz];
            auto m_row_begin = m_row_ptrs[col];
            auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
            // extract the sparse submatrix consisting of the entries whose
            // columns/rows match column indices from this row
            group_match<subwarp_size>(
                m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
                i_row_size, subwarp,
                [&](IndexType, IndexType, IndexType,
                    config::lane_mask_type matchmask,
                    bool) { count += popcnt(matchmask); });
        }
        // store the dim and nnz of this sparse block
        if (local_id == 0) {
            excess_rhs_sizes[row] = i_row_size;
            excess_nnz[row] = count;
        }
    } else {
        // handle short rows directly: no excess
        if (local_id == 0) {
            excess_rhs_sizes[row] = 0;
            excess_nnz[row] = 0;
        }

        // subwarp_size^2 storage per subwarp
        auto dense_system_ptr =
            *storage + (item_ct1.get_local_id(2) / subwarp_size) *
                           subwarp_size * subwarp_size;
        // row-major accessor
        auto dense_system = [&](IndexType row, IndexType col) -> ValueType& {
            return dense_system_ptr[row * subwarp_size + col];
        };

#pragma unroll
        for (int i = 0; i < subwarp_size; ++i) {
            dense_system(i, local_id) = zero<ValueType>();
        }

        subwarp.sync();

        IndexType rhs_one_idx{};
        for (IndexType nz = 0; nz < i_row_size; ++nz) {
            auto col = i_col_idxs[i_row_begin + nz];
            auto m_row_begin = m_row_ptrs[col];
            auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
            // extract the dense submatrix consisting of the entries whose
            // columns/rows match column indices from this row
            group_match<subwarp_size>(
                m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
                i_row_size, subwarp,
                [&](IndexType, IndexType m_idx, IndexType i_idx,
                    config::lane_mask_type, bool valid) {
                    rhs_one_idx += popcnt(subwarp.ballot(
                        valid && m_col_idxs[m_row_begin + m_idx] < row &&
                        col == row));
                    if (valid) {
                        dense_system(nz, i_idx) = m_values[m_row_begin + m_idx];
                    }
                });
        }

        subwarp.sync();

        // Now, read a full col of `dense_system` into local registers, which
        // will be row elements after this (implicit) transpose
        ValueType local_row[subwarp_size];
#pragma unroll
        for (int i = 0; i < subwarp_size; ++i) {
            local_row[i] = dense_system(i, local_id);
        }

        const auto rhs =
            direct_solve(i_row_size, local_row, subwarp, rhs_one_idx);

        // Write back:
        if (local_id < i_row_size) {
            const auto idx = i_row_begin + local_id;
            if (is_finite(rhs)) {
                i_values[idx] = rhs;
            } else {
                i_values[idx] = i_col_idxs[idx] == row ? one<ValueType>()
                                                       : zero<ValueType>();
            }
        }
    }
}


template <int subwarp_size, typename ValueType, typename IndexType>
void generate_l_inverse(
    IndexType num_rows, const IndexType* __restrict__ m_row_ptrs,
    const IndexType* __restrict__ m_col_idxs,
    const ValueType* __restrict__ m_values,
    const IndexType* __restrict__ i_row_ptrs,
    const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values,
    IndexType* __restrict__ excess_rhs_sizes,
    IndexType* __restrict__ excess_nnz, sycl::nd_item<3> item_ct1,
    UninitializedArray<ValueType, subwarp_size * subwarp_size *
                                      subwarps_per_block>* storage)
{
    auto trs_solve =
        [](IndexType num_elems, const ValueType* __restrict__ local_row,
           group::thread_block_tile<subwarp_size>& subwarp, size_type) {
            const int local_id = subwarp.thread_rank();
            ValueType rhs = local_id == num_elems - 1 ? one<ValueType>()
                                                      : zero<ValueType>();
            // Solve Triangular system
            for (int d_col = num_elems - 1; d_col >= 0; --d_col) {
                const auto elem = local_row[d_col];
                if (d_col == local_id) {
                    rhs /= elem;
                }

                const ValueType bot = subwarp.shfl(rhs, d_col);
                if (local_id < d_col) {
                    rhs -= bot * elem;
                }
            }

            return rhs;
        };

    generic_generate<subwarp_size>(
        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
        i_values, excess_rhs_sizes, excess_nnz, trs_solve, item_ct1, storage);
}

template <int subwarp_size, typename ValueType, typename IndexType>
void generate_l_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                        sycl::queue* queue, IndexType num_rows,
                        const IndexType* m_row_ptrs,
                        const IndexType* m_col_idxs, const ValueType* m_values,
                        const IndexType* i_row_ptrs,
                        const IndexType* i_col_idxs, ValueType* i_values,
                        IndexType* excess_rhs_sizes, IndexType* excess_nnz)
{
    queue->submit([&](sycl::handler& cgh) {
        sycl::accessor<
            UninitializedArray<ValueType, subwarp_size * subwarp_size *
                                              subwarps_per_block>,
            0, sycl::access_mode::read_write, sycl::access::target::local>
            storage_acc_ct1(cgh);

        cgh.parallel_for(
            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
                generate_l_inverse<subwarp_size>(
                    num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs,
                    i_col_idxs, i_values, excess_rhs_sizes, excess_nnz,
                    item_ct1, storage_acc_ct1.get_pointer().get());
            });
    });
}
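
// The trs_solve lambda above performs a subwarp-cooperative substitution:
// thread t holds column t of the dense block D (after the implicit transpose)
// and computes the t-th entry of the solution of x^T D = e_n^T, column by
// column from the back. A sequential analogue (hypothetical host-side sketch):
//
//     // mat is n x n, row-major, lower triangular
//     void lower_trs_ref(int n, const double* mat, double* x)
//     {
//         for (int i = 0; i < n; ++i) {
//             x[i] = (i == n - 1) ? 1.0 : 0.0;  // last unit vector as rhs
//         }
//         for (int col = n - 1; col >= 0; --col) {
//             x[col] /= mat[col * n + col];
//             for (int row = 0; row < col; ++row) {
//                 x[row] -= x[col] * mat[col * n + row];  // transposed access
//             }
//         }
//     }
//
// For the 2x2 block [[2, 0], [1, 4]] this yields x = [-0.125, 0.25], which
// satisfies x^T * L = [0, 1].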
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_u_inverse(
+    IndexType num_rows, const IndexType* __restrict__ m_row_ptrs,
+    const IndexType* __restrict__ m_col_idxs,
+    const ValueType* __restrict__ m_values,
+    const IndexType* __restrict__ i_row_ptrs,
+    const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values,
+    IndexType* __restrict__ excess_rhs_sizes,
+    IndexType* __restrict__ excess_nnz, sycl::nd_item<3> item_ct1,
+    UninitializedArray<ValueType, subwarp_size * subwarp_size>* storage)
+{
+    auto trs_solve = [](IndexType num_elems,
+                        const ValueType* __restrict__ local_row,
+                        group::thread_block_tile<subwarp_size>& subwarp,
+                        size_type) {
+        const int local_id = subwarp.thread_rank();
+        ValueType rhs =
+            local_id == 0 ? one<ValueType>() : zero<ValueType>();
+        // Solve Triangular system
+        for (int d_col = 0; d_col < num_elems; ++d_col) {
+            const auto elem = local_row[d_col];
+            if (d_col == local_id) {
+                rhs /= elem;
+            }
+
+            const ValueType top = subwarp.shfl(rhs, d_col);
+            if (d_col < local_id) {
+                rhs -= top * elem;
+            }
+        }
+
+        return rhs;
+    };
+
+    generic_generate<subwarp_size>(
+        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
+        i_values, excess_rhs_sizes, excess_nnz, trs_solve, item_ct1, storage);
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_u_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory,
+                        sycl::queue* queue, IndexType num_rows,
+                        const IndexType* m_row_ptrs,
+                        const IndexType* m_col_idxs, const ValueType* m_values,
+                        const IndexType* i_row_ptrs,
+                        const IndexType* i_col_idxs, ValueType* i_values,
+                        IndexType* excess_rhs_sizes, IndexType* excess_nnz)
+{
+    queue->submit([&](sycl::handler& cgh) {
+        sycl::accessor<
+            UninitializedArray<ValueType, subwarp_size * subwarp_size>,
+            0, sycl::access_mode::read_write, sycl::access::target::local>
+            storage_acc_ct1(cgh);
+
+        cgh.parallel_for(
+            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                generate_u_inverse<subwarp_size>(
+                    num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs,
+                    i_col_idxs, i_values, excess_rhs_sizes, excess_nnz,
+                    item_ct1, storage_acc_ct1.get_pointer().get());
+            });
+    });
+}
+
+
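These device-side launchers are sized by the host wrappers near the end of this file, one subwarp per row; schematically (a sketch using the ceildiv, default_block_size and subwarp_size constants defined earlier in the file, and already including the empty-grid guard that the second commit in this series introduces):

    // Host-side launch sizing (sketch): one subwarp of subwarp_size threads
    // per matrix row, subwarps_per_block of them per work-group.
    const auto block = default_block_size;
    const auto grid = ceildiv(num_rows, block / subwarp_size);
    if (grid > 0) {  // skip the launch entirely for empty matrices
        kernel::generate_u_inverse<subwarp_size>(
            grid, block, 0, exec->get_queue(), num_rows,
            /* CSR arrays of the input matrix and its inverse ... */);
    }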
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_general_inverse(
+    IndexType num_rows, const IndexType* __restrict__ m_row_ptrs,
+    const IndexType* __restrict__ m_col_idxs,
+    const ValueType* __restrict__ m_values,
+    const IndexType* __restrict__ i_row_ptrs,
+    const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values,
+    IndexType* __restrict__ excess_rhs_sizes,
+    IndexType* __restrict__ excess_nnz, bool spd, sycl::nd_item<3> item_ct1,
+    UninitializedArray<ValueType, subwarp_size * subwarp_size>* storage)
+{
+    auto general_solve = [spd](IndexType num_elems,
+                               ValueType* __restrict__ local_row,
+                               group::thread_block_tile<subwarp_size>& subwarp,
+                               size_type rhs_one_idx) {
+        const int local_id = subwarp.thread_rank();
+        ValueType rhs =
+            local_id == rhs_one_idx ? one<ValueType>() : zero<ValueType>();
+        size_type perm = local_id;
+        auto pivoted = subwarp.thread_rank() >= num_elems;
+        auto status = true;
+        for (size_type i = 0; i < num_elems; i++) {
+            const auto piv = choose_pivot(subwarp, local_row[i], pivoted);
+            if (local_id == piv) {
+                pivoted = true;
+            }
+            if (local_id == i) {
+                perm = piv;
+            }
+
+            apply_gauss_jordan_transform_with_rhs<subwarp_size>(
+                subwarp, piv, i, local_row, &rhs, status);
+        }
+
+        ValueType sol = subwarp.shfl(rhs, perm);
+
+        if (spd) {
+            auto diag = subwarp.shfl(sol, num_elems - 1);
+            sol /= std::sqrt(diag);
+        }
+
+        return sol;
+    };
+
+    generic_generate<subwarp_size>(
+        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
+        i_values, excess_rhs_sizes, excess_nnz, general_solve, item_ct1,
+        storage);
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_general_inverse(
+    dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue,
+    IndexType num_rows, const IndexType* m_row_ptrs,
+    const IndexType* m_col_idxs, const ValueType* m_values,
+    const IndexType* i_row_ptrs, const IndexType* i_col_idxs,
+    ValueType* i_values, IndexType* excess_rhs_sizes, IndexType* excess_nnz,
+    bool spd)
+{
+    queue->submit([&](sycl::handler& cgh) {
+        sycl::accessor<
+            UninitializedArray<ValueType, subwarp_size * subwarp_size>,
+            0, sycl::access_mode::read_write, sycl::access::target::local>
+            storage_acc_ct1(cgh);
+
+        cgh.parallel_for(
+            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                generate_general_inverse<subwarp_size>(
+                    num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs,
+                    i_col_idxs, i_values, excess_rhs_sizes, excess_nnz, spd,
+                    item_ct1, storage_acc_ct1.get_pointer().get());
+            });
+    });
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_excess_system(IndexType num_rows,
+                            const IndexType* __restrict__ m_row_ptrs,
+                            const IndexType* __restrict__ m_col_idxs,
+                            const ValueType* __restrict__ m_values,
+                            const IndexType* __restrict__ i_row_ptrs,
+                            const IndexType* __restrict__ i_col_idxs,
+                            const IndexType* __restrict__ excess_rhs_ptrs,
+                            const IndexType* __restrict__ excess_nz_ptrs,
+                            IndexType* __restrict__ excess_row_ptrs,
+                            IndexType* __restrict__ excess_col_idxs,
+                            ValueType* __restrict__ excess_values,
+                            ValueType* __restrict__ excess_rhs,
+                            size_type e_start, size_type e_end,
+                            sycl::nd_item<3> item_ct1)
+{
+    const auto row =
+        thread::get_subwarp_id_flat<subwarp_size, IndexType>(item_ct1) +
+        e_start;
+
+    if (row >= e_end) {
+        return;
+    }
+
+    const auto i_row_begin = i_row_ptrs[row];
+    const auto i_row_size = i_row_ptrs[row + 1] - i_row_begin;
+
+    auto subwarp = group::tiled_partition<subwarp_size>(
+        group::this_thread_block(item_ct1));
+    const int local_id = subwarp.thread_rank();
+    const auto prefix_mask = (config::lane_mask_type{1} << local_id) - 1;
+
+    if (row == e_start && local_id == 0) {
+        excess_row_ptrs[0] = 0;
+    }
+
+    if (i_row_size <= subwarp_size) {
+        return;
+    }
+
+    auto excess_rhs_begin = excess_rhs_ptrs[row];
+    auto excess_nz_begin = excess_nz_ptrs[row];
+
+    auto out_nz_begin = excess_nz_begin - excess_nz_ptrs[e_start];
+    auto out_ptrs_begin = excess_rhs_begin - excess_rhs_ptrs[e_start];
+
+    // defer long rows: store their nnz and number of matches
+    for (IndexType nz = 0; nz < i_row_size; ++nz) {
+        auto col = i_col_idxs[i_row_begin + nz];
+        auto m_row_begin = m_row_ptrs[col];
+        auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
+        // extract the sparse submatrix consisting of the entries whose
+        // columns/rows match column indices from this row
+        group_match<subwarp_size>(
+            m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
+            i_row_size, subwarp,
+            [&](IndexType col, IndexType m_idx, IndexType i_idx,
+                config::lane_mask_type mask, bool valid) {
+                // dense_system(nz, i_idx) = m_values[m_row_begin + m_idx]
+                // only in sparse :)
+                if (valid) {
+                    auto nz = out_nz_begin + popcnt(mask & prefix_mask);
+                    excess_col_idxs[nz] = out_ptrs_begin + i_idx;
+                    excess_values[nz] = m_values[m_row_begin + m_idx];
+                }
+                out_nz_begin += popcnt(mask);
+            });
+        if (local_id == 0) {
+            // build right-hand side: 1 for diagonal entry, 0 else
+            excess_rhs[out_ptrs_begin + nz] =
+                row == col ? one<ValueType>() : zero<ValueType>();
+            // store row pointers
+            excess_row_ptrs[out_ptrs_begin + nz + 1] = out_nz_begin;
+        }
+    }
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_excess_system(
+    dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue,
+    IndexType num_rows, const IndexType* m_row_ptrs,
+    const IndexType* m_col_idxs, const ValueType* m_values,
+    const IndexType* i_row_ptrs, const IndexType* i_col_idxs,
+    const IndexType* excess_rhs_ptrs, const IndexType* excess_nz_ptrs,
+    IndexType* excess_row_ptrs, IndexType* excess_col_idxs,
+    ValueType* excess_values, ValueType* excess_rhs, size_type e_start,
+    size_type e_end)
+{
+    queue->parallel_for(
+        sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+            generate_excess_system<subwarp_size>(
+                num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs,
+                i_col_idxs, excess_rhs_ptrs, excess_nz_ptrs, excess_row_ptrs,
+                excess_col_idxs, excess_values, excess_rhs, e_start, e_end,
+                item_ct1);
+        });
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void scale_excess_solution(const IndexType* __restrict__ excess_block_ptrs,
+                           ValueType* __restrict__ excess_solution,
+                           size_type e_start, size_type e_end,
+                           sycl::nd_item<3> item_ct1)
+{
+    const auto warp_id =
+        thread::get_subwarp_id_flat<subwarp_size, IndexType>(item_ct1);
+    auto subwarp = group::tiled_partition<subwarp_size>(
+        group::this_thread_block(item_ct1));
+    const int local_id = subwarp.thread_rank();
+    const auto row = warp_id + e_start;
+
+    if (row >= e_end) {
+        return;
+    }
+
+    const IndexType offset = excess_block_ptrs[e_start];
+    const IndexType block_begin = excess_block_ptrs[row] - offset;
+    const IndexType block_end = excess_block_ptrs[row + 1] - offset;
+    if (block_end == block_begin) {
+        return;
+    }
+    const auto diag = excess_solution[block_end - 1];
+    const ValueType scal = one<ValueType>() / std::sqrt(diag);
+
+    for (size_type i = block_begin + local_id; i < block_end;
+         i += subwarp_size) {
+        excess_solution[i] *= scal;
+    }
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void scale_excess_solution(dim3 grid, dim3 block,
+                           size_type dynamic_shared_memory, sycl::queue* queue,
+                           const IndexType* excess_block_ptrs,
+                           ValueType* excess_solution, size_type e_start,
+                           size_type e_end)
+{
+    queue->parallel_for(
+        sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+            scale_excess_solution<subwarp_size>(
+                excess_block_ptrs, excess_solution, e_start, e_end, item_ct1);
+        });
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void copy_excess_solution(IndexType num_rows,
+                          const IndexType* __restrict__ i_row_ptrs,
+                          const IndexType* __restrict__ excess_rhs_ptrs,
+                          const ValueType* __restrict__ excess_solution,
+                          ValueType* __restrict__ i_values, size_type e_start,
+                          size_type e_end, sycl::nd_item<3> item_ct1)
+{
+    const auto excess_row =
+        thread::get_subwarp_id_flat<subwarp_size, IndexType>(item_ct1);
+    const auto row = excess_row + e_start;
+
+    if (row >= e_end) {
+        return;
+    }
+
+    auto local_id = item_ct1.get_local_id(2) % subwarp_size;
+
+    const auto i_row_begin = i_row_ptrs[row];
+
+    const auto excess_begin = excess_rhs_ptrs[row];
+    const auto excess_size = excess_rhs_ptrs[row + 1] - excess_begin;
+
+    // if it was handled separately:
+    if (excess_size > 0) {
+        // copy the values for this row
+        for (IndexType nz = local_id; nz < excess_size; nz += subwarp_size) {
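+            // lanes copy this row's entries in a subwarp-strided fashion;
+            // subtracting excess_rhs_ptrs[e_start] rebases the solution
+            // offset from the global numbering to this batch's block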
i_values[nz + i_row_begin] = + excess_solution[nz + excess_begin - excess_rhs_ptrs[e_start]]; + } + } +} + +template +void copy_excess_solution(dim3 grid, dim3 block, + size_type dynamic_shared_memory, sycl::queue* queue, + IndexType num_rows, const IndexType* i_row_ptrs, + const IndexType* excess_rhs_ptrs, + const ValueType* excess_solution, ValueType* i_values, + size_type e_start, size_type e_end) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + copy_excess_solution( + num_rows, i_row_ptrs, excess_rhs_ptrs, excess_solution, + i_values, e_start, e_end, item_ct1); + }); +} + + +} // namespace kernel template void generate_tri_inverse(std::shared_ptr exec, - const matrix::Csr* mtx, - matrix::Csr* inverse_mtx, + const matrix::Csr* input, + matrix::Csr* inverse, IndexType* excess_rhs_ptrs, IndexType* excess_nz_ptrs, - bool lower) GKO_NOT_IMPLEMENTED; + bool lower) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + if (lower) { + kernel::generate_l_inverse( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_row_ptrs(), + inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, + excess_nz_ptrs); + } else { + kernel::generate_u_inverse( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_row_ptrs(), + inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, + excess_nz_ptrs); + } + components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); + components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); @@ -91,46 +656,87 @@ void generate_general_inverse(std::shared_ptr exec, const matrix::Csr* input, matrix::Csr* inverse, IndexType* excess_rhs_ptrs, - IndexType* excess_nz_ptrs, - bool spd) GKO_NOT_IMPLEMENTED; + IndexType* excess_nz_ptrs, bool spd) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + kernel::generate_general_inverse( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_row_ptrs(), + inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, + excess_nz_ptrs, spd); + components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); + components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL); template -void generate_excess_system(std::shared_ptr, +void generate_excess_system(std::shared_ptr exec, const matrix::Csr* input, const matrix::Csr* inverse, const IndexType* excess_rhs_ptrs, const IndexType* excess_nz_ptrs, matrix::Csr* excess_system, matrix::Dense* excess_rhs, - size_type e_start, - size_type e_end) GKO_NOT_IMPLEMENTED; + size_type e_start, size_type e_end) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + kernel::generate_excess_system( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), 
input->get_const_col_idxs(), + input->get_const_values(), inverse->get_const_row_ptrs(), + inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs, + excess_system->get_row_ptrs(), excess_system->get_col_idxs(), + excess_system->get_values(), excess_rhs->get_values(), e_start, e_end); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); template -void scale_excess_solution(std::shared_ptr, +void scale_excess_solution(std::shared_ptr exec, const IndexType* excess_block_ptrs, matrix::Dense* excess_solution, - size_type e_start, - size_type e_end) GKO_NOT_IMPLEMENTED; + size_type e_start, size_type e_end) +{ + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + kernel::scale_excess_solution( + grid, block, 0, exec->get_queue(), excess_block_ptrs, + excess_solution->get_values(), e_start, e_end); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL); template -void scatter_excess_solution(std::shared_ptr, - const IndexType* excess_block_ptrs, +void scatter_excess_solution(std::shared_ptr exec, + const IndexType* excess_rhs_ptrs, const matrix::Dense* excess_solution, matrix::Csr* inverse, - size_type e_start, - size_type e_end) GKO_NOT_IMPLEMENTED; + size_type e_start, size_type e_end) +{ + const auto num_rows = inverse->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + kernel::copy_excess_solution( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + inverse->get_const_row_ptrs(), excess_rhs_ptrs, + excess_solution->get_const_values(), inverse->get_values(), e_start, + e_end); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); diff --git a/dpcpp/test/CMakeLists.txt b/dpcpp/test/CMakeLists.txt index 8ccd05b0518..7487a520218 100644 --- a/dpcpp/test/CMakeLists.txt +++ b/dpcpp/test/CMakeLists.txt @@ -4,5 +4,6 @@ set(GINKGO_COMPILING_DPCPP_TEST ON) add_subdirectory(base) add_subdirectory(components) add_subdirectory(matrix) +add_subdirectory(preconditioner) add_subdirectory(solver) add_subdirectory(stop) diff --git a/dpcpp/test/preconditioner/CMakeLists.txt b/dpcpp/test/preconditioner/CMakeLists.txt new file mode 100644 index 00000000000..5f5c09e3129 --- /dev/null +++ b/dpcpp/test/preconditioner/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(isai_kernels) \ No newline at end of file diff --git a/dpcpp/test/preconditioner/isai_kernels.cpp b/dpcpp/test/preconditioner/isai_kernels.cpp new file mode 100644 index 00000000000..73664274efe --- /dev/null +++ b/dpcpp/test/preconditioner/isai_kernels.cpp @@ -0,0 +1,683 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/preconditioner/isai_kernels.hpp" +#include "core/test/utils.hpp" +#include "dpcpp/base/config.hpp" +#include "matrices/config.hpp" + + +namespace { + + +enum struct matrix_type { lower, upper, general, spd }; + + +class Isai : public ::testing::Test { +protected: +#if GINKGO_DPCPP_SINGLE_MODE + using value_type = float; +#else + using value_type = double; +#endif + using index_type = gko::int32; + using Csr = gko::matrix::Csr; + using Dense = gko::matrix::Dense; + Isai() : rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::DpcppExecutor::get_num_devices("all"), 0); + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + } + + std::unique_ptr clone_allocations(const Csr* csr_mtx) + { + if (csr_mtx->get_executor() != ref) { + return {nullptr}; + } + const auto num_elems = csr_mtx->get_num_stored_elements(); + auto sparsity = csr_mtx->clone(); + + // values are now filled with invalid data to catch potential errors + auto begin_values = sparsity->get_values(); + auto end_values = begin_values + num_elems; + std::fill(begin_values, end_values, -gko::one()); + return sparsity; + } + + void initialize_data(matrix_type type, gko::size_type n, + gko::size_type row_limit) + { + const bool for_lower_tm = type == matrix_type::lower; + auto nz_dist = std::uniform_int_distribution(1, row_limit); + auto val_dist = std::uniform_real_distribution(-1., 1.); + mtx = Csr::create(ref); + if (type == matrix_type::general) { + auto dense_mtx = gko::test::generate_random_matrix( + n, n, nz_dist, val_dist, rand_engine, ref, gko::dim<2>{n, n}); + ensure_diagonal(dense_mtx.get()); + mtx->copy_from(dense_mtx.get()); + } else if (type == matrix_type::spd) { + auto dense_mtx = gko::test::generate_random_band_matrix( + n, row_limit / 4, row_limit / 4, val_dist, rand_engine, ref, + gko::dim<2>{n, n}); + auto transp = gko::as(dense_mtx->transpose()); + auto spd_mtx = Dense::create(ref, gko::dim<2>{n, n}); + dense_mtx->apply(transp.get(), spd_mtx.get()); + mtx->copy_from(spd_mtx.get()); + } else { + mtx = gko::test::generate_random_triangular_matrix( + n, n, true, for_lower_tm, nz_dist, val_dist, rand_engine, ref, + gko::dim<2>{n, n}); + } + inverse = clone_allocations(mtx.get()); + + d_mtx = gko::clone(dpcpp, mtx); + d_inverse = gko::clone(dpcpp, inverse); + } + + template + std::unique_ptr read(const char* name) + { + std::ifstream 
mtxstream{std::string{gko::matrices::location_isai_mtxs} + + name}; + auto result = gko::read(mtxstream, ref); + // to avoid removing 0s, the matrices store 12345 instead + for (gko::size_type i = 0; i < result->get_num_stored_elements(); ++i) { + auto& val = result->get_values()[i]; + if (val == static_cast(12345.0)) { + val = 0; + } + } + return std::move(result); + } + + void ensure_diagonal(Dense* mtx) + { + for (int i = 0; i < mtx->get_size()[0]; ++i) { + mtx->at(i, i) = gko::one(); + } + } + + + std::shared_ptr ref; + std::shared_ptr dpcpp; + + std::default_random_engine rand_engine; + + std::unique_ptr mtx; + std::unique_ptr inverse; + + std::unique_ptr d_mtx; + std::unique_ptr d_inverse; +}; + + +TEST_F(Isai, DpcppIsaiGenerateLinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 536, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::dpcpp::isai::generate_tri_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateUinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 615, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_tri_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateAinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 615, 15); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_general_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateSpdinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 15); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::dpcpp::isai::generate_general_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, 
d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 15 * r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 554, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::dpcpp::isai::generate_tri_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 695, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_tri_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateAinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 695, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_general_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 10 * r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateSpdinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_general_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 10 * r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateExcessLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 518, 40); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + 
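+    // after the reference run, a1/a2 hold the prefix-summed excess RHS sizes
+    // and excess nnz counts, so their last entries give e_dim and e_nnz below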
gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateExcessUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 673, 51); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateExcessAinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 100, 51); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateExcessSpdinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 
64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiGeneratePartialExcessIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[10] - a1.get_data()[5]; + auto e_nnz = a2.get_data()[10] - a2.get_data()[5]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 5u, 10u); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 5u, 10u); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiScaleExcessSolutionIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scale_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scale_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); +} + + +TEST_F(Isai, DpcppIsaiScalePartialExcessSolutionIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + 
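+    // the reference kernel fills a1 with the (prefix-summed) excess block
+    // pointers that scale_excess_solution consumes below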
gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[10] - a1.get_data()[5]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + + gko::kernels::reference::isai::scale_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), 5u, 10u); + gko::kernels::dpcpp::isai::scale_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), 5u, 10u); + + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); +} + + +TEST_F(Isai, DpcppIsaiScatterExcessSolutionLIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 572, 52); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, + num_rows); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiScatterExcessSolutionUIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 702, 45); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, + num_rows); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiScatterExcessSolutionAIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 702, 45); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, + num_rows); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, 
DpcppIsaiScatterExcessSolutionSpdIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, + num_rows); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiScatterPartialExcessSolutionIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[10] - a1.get_data()[5]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 5u, 10u); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 5u, 10u); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +} // namespace diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp index 77090304654..31d7140692d 100644 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ b/hip/preconditioner/isai_kernels.hip.cpp @@ -46,11 +46,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" namespace gko { @@ -69,8 +71,6 @@ constexpr int subwarps_per_block{2}; constexpr int default_block_size{subwarps_per_block * subwarp_size}; -#include "common/cuda_hip/components/atomic.hpp.inc" -#include "common/cuda_hip/components/warp_blas.hpp.inc" #include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc" From 26e391f3cb56bc036ff25784627d76d75c11a258 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Wed, 9 Feb 2022 15:31:34 +0100 Subject: [PATCH 2/6] use std for limit, fix ffs, kernel guard --- dev_tools/oneapi/convert_source.sh | 2 + dpcpp/base/math.hpp | 61 -------------- dpcpp/components/intrinsics.dp.hpp | 12 ++- dpcpp/components/merging.dp.hpp | 5 +- dpcpp/preconditioner/isai_kernels.dp.cpp | 101 +++++++++++++---------- 5 files changed, 70 insertions(+), 111 deletions(-) delete mode 100644 dpcpp/base/math.hpp diff --git a/dev_tools/oneapi/convert_source.sh b/dev_tools/oneapi/convert_source.sh index 209b9d7e8e6..fe55181a52f 100755 --- a/dev_tools/oneapi/convert_source.sh +++ b/dev_tools/oneapi/convert_source.sh @@ -301,6 +301,8 @@ replace_regex="${replace_regex};s|trick/thread_ids.hpp|dpcpp/components/thread_i replace_regex="${replace_regex};s|trick/cooperative_groups\.hpp|dpcpp/components/cooperative_groups.dp.hpp|g" replace_regex="${replace_regex};s|trick/sorting\.hpp|dpcpp/components/sorting.dp.hpp|g" replace_regex="${replace_regex};s|trick/reduction\.hpp|dpcpp/components/reduction.dp.hpp|g" +replace_regex="${replace_regex};s|cuda/base/math\.hpp|limits|g" +replace_regex="${replace_regex};s|device_numeric_limits(<.*>)::(.*)(,|;)|std::numeric_limits\1::\2()\3|g" replace_regex="${replace_regex};s/#define GET_QUEUE 0//g" replace_regex="${replace_regex};s/GET_QUEUE/exec->get_queue()/g" replace_regex="${replace_regex};s/cuda/dpcpp/g" diff --git a/dpcpp/base/math.hpp b/dpcpp/base/math.hpp deleted file mode 100644 index a9fa1a38a39..00000000000 --- a/dpcpp/base/math.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
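To make the new conversion rule above concrete: the added regex rewrites uses of the trait deleted here into the standard-library form, for example (the pair below is taken from the merging header updated later in this commit):

    auto sentinel = device_numeric_limits<IndexType>::max;    // before
    auto sentinel = std::numeric_limits<IndexType>::max();    // after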
-*************************************************************/ - -#ifndef GKO_DPCPP_BASE_MATH_HPP_ -#define GKO_DPCPP_BASE_MATH_HPP_ - - -#include - - -#include -#include - - -namespace gko { -namespace kernels { -namespace dpcpp { - - -template -struct device_numeric_limits { - static constexpr auto inf = std::numeric_limits::infinity(); - static constexpr auto max = std::numeric_limits::max(); - static constexpr auto min = std::numeric_limits::min(); -}; - - -} // namespace dpcpp -} // namespace kernels -} // namespace gko - -#endif // GKO_DPCPP_BASE_MATH_HPP_ diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp index c0e5a445120..2466666b37d 100644 --- a/dpcpp/components/intrinsics.dp.hpp +++ b/dpcpp/components/intrinsics.dp.hpp @@ -62,11 +62,19 @@ __dpct_inline__ int popcnt(uint64 mask) { return sycl::popcount(mask); } * @internal * Returns the (1-based!) index of the first set bit in the given mask, * starting from the least significant bit. + * + * @note can not use length - clz because subgroup size is not always 32 or 64 */ -__dpct_inline__ int ffs(uint32 mask) { return 32 - sycl::clz(mask); } +__dpct_inline__ int ffs(uint32 mask) +{ + return (mask == 0) ? 0 : (sycl::ext::intel::ctz(mask) + 1); +} /** @copydoc ffs */ -__dpct_inline__ int ffs(uint64 mask) { return 64 - sycl::clz(mask); } +__dpct_inline__ int ffs(uint64 mask) +{ + return (mask == 0) ? 0 : (sycl::ext::intel::ctz(mask) + 1); +} /** diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp index 67ee87f1b60..5303b11ea0f 100644 --- a/dpcpp/components/merging.dp.hpp +++ b/dpcpp/components/merging.dp.hpp @@ -42,7 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/utils.hpp" #include "dpcpp/base/dpct.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/searching.dp.hpp" @@ -165,7 +164,7 @@ __dpct_inline__ void group_merge(const ValueType* __restrict__ a, IndexType a_begin{}; IndexType b_begin{}; auto lane = static_cast(group.thread_rank()); - auto sentinel = device_numeric_limits::max; + auto sentinel = std::numeric_limits::max(); auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel); auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel); for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) { @@ -275,7 +274,7 @@ __dpct_inline__ void sequential_merge(const ValueType* __restrict__ a, auto c_size = a_size + b_size; IndexType a_begin{}; IndexType b_begin{}; - auto sentinel = device_numeric_limits::max; + auto sentinel = std::numeric_limits::max(); auto a_cur = checked_load(a, a_begin, a_size, sentinel); auto b_cur = checked_load(b, b_begin, b_size, sentinel); for (IndexType c_begin{}; c_begin < c_size; c_begin++) { diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp index ef4a64c59e2..a6548d77aab 100644 --- a/dpcpp/preconditioner/isai_kernels.dp.cpp +++ b/dpcpp/preconditioner/isai_kernels.dp.cpp @@ -626,22 +626,24 @@ void generate_tri_inverse(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); - if (lower) { - kernel::generate_l_inverse( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - input->get_const_values(), inverse->get_row_ptrs(), - inverse->get_col_idxs(), inverse->get_values(), 
excess_rhs_ptrs, - excess_nz_ptrs); - } else { - kernel::generate_u_inverse( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - input->get_const_values(), inverse->get_row_ptrs(), - inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, - excess_nz_ptrs); + const auto block = default_block_size; + const auto grid = ceildiv(num_rows, block / subwarp_size); + if (grid > 0) { + if (lower) { + kernel::generate_l_inverse( + grid, block, 0, exec->get_queue(), + static_cast(num_rows), input->get_const_row_ptrs(), + input->get_const_col_idxs(), input->get_const_values(), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + inverse->get_values(), excess_rhs_ptrs, excess_nz_ptrs); + } else { + kernel::generate_u_inverse( + grid, block, 0, exec->get_queue(), + static_cast(num_rows), input->get_const_row_ptrs(), + input->get_const_col_idxs(), input->get_const_values(), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + inverse->get_values(), excess_rhs_ptrs, excess_nz_ptrs); + } } components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); @@ -660,14 +662,16 @@ void generate_general_inverse(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); - kernel::generate_general_inverse( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - input->get_const_values(), inverse->get_row_ptrs(), - inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, - excess_nz_ptrs, spd); + const auto block = default_block_size; + const auto grid = ceildiv(num_rows, block / subwarp_size); + if (grid > 0) { + kernel::generate_general_inverse( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_row_ptrs(), + inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, + excess_nz_ptrs, spd); + } components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); } @@ -688,15 +692,18 @@ void generate_excess_system(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::generate_excess_system( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - input->get_const_values(), inverse->get_const_row_ptrs(), - inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs, - excess_system->get_row_ptrs(), excess_system->get_col_idxs(), - excess_system->get_values(), excess_rhs->get_values(), e_start, e_end); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { + kernel::generate_excess_system( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_const_row_ptrs(), + inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs, + excess_system->get_row_ptrs(), excess_system->get_col_idxs(), + excess_system->get_values(), excess_rhs->get_values(), e_start, + e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -709,11 +716,13 @@ 
void scale_excess_solution(std::shared_ptr exec, matrix::Dense* excess_solution, size_type e_start, size_type e_end) { - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::scale_excess_solution( - grid, block, 0, exec->get_queue(), excess_block_ptrs, - excess_solution->get_values(), e_start, e_end); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { + kernel::scale_excess_solution( + grid, block, 0, exec->get_queue(), excess_block_ptrs, + excess_solution->get_values(), e_start, e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -729,13 +738,15 @@ void scatter_excess_solution(std::shared_ptr exec, { const auto num_rows = inverse->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::copy_excess_solution( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - inverse->get_const_row_ptrs(), excess_rhs_ptrs, - excess_solution->get_const_values(), inverse->get_values(), e_start, - e_end); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { + kernel::copy_excess_solution( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + inverse->get_const_row_ptrs(), excess_rhs_ptrs, + excess_solution->get_const_values(), inverse->get_values(), e_start, + e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( From 69ec659d6ee54f15d6847083ab715b4ac62e8808 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 7 Mar 2022 14:55:21 +0100 Subject: [PATCH 3/6] sycl 2020 - reqd subgroup in parallel_for --- dpcpp/preconditioner/isai_kernels.dp.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp index a6548d77aab..1b6c33e4500 100644 --- a/dpcpp/preconditioner/isai_kernels.dp.cpp +++ b/dpcpp/preconditioner/isai_kernels.dp.cpp @@ -262,7 +262,9 @@ void generate_l_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory, storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { generate_l_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs, i_values, excess_rhs_sizes, excess_nnz, @@ -330,7 +332,9 @@ void generate_u_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory, storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { generate_u_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs, i_values, excess_rhs_sizes, excess_nnz, @@ -410,7 +414,9 @@ void generate_general_inverse( storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { generate_general_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs, i_values, excess_rhs_sizes, excess_nnz, spd, @@ -509,7 +515,8 @@ void generate_excess_system( size_type e_end) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> 
item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { generate_excess_system( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs, excess_rhs_ptrs, excess_nz_ptrs, excess_row_ptrs, @@ -558,7 +565,8 @@ void scale_excess_solution(dim3 grid, dim3 block, size_type e_end) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { scale_excess_solution( excess_block_ptrs, excess_solution, e_start, e_end, item_ct1); }); @@ -606,7 +614,8 @@ void copy_excess_solution(dim3 grid, dim3 block, size_type e_start, size_type e_end) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { copy_excess_solution( num_rows, i_row_ptrs, excess_rhs_ptrs, excess_solution, i_values, e_start, e_end, item_ct1); From c20b44d38c2d321b20c1eccd6a70f4bdeebc7b03 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 7 Mar 2022 14:59:14 +0100 Subject: [PATCH 4/6] update documentation Co-authored-by: Fritz Goebel --- common/cuda_hip/components/searching.hpp.inc | 24 ++++++++++---------- dpcpp/components/searching.dp.hpp | 24 ++++++++++---------- dpcpp/preconditioner/isai_kernels.dp.cpp | 2 -- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/common/cuda_hip/components/searching.hpp.inc b/common/cuda_hip/components/searching.hpp.inc index 37c41c8aab6..acc06a5d5a4 100644 --- a/common/cuda_hip/components/searching.hpp.inc +++ b/common/cuda_hip/components/searching.hpp.inc @@ -66,12 +66,12 @@ __forceinline__ __device__ IndexType binary_search(IndexType offset, * @internal * Generic implementation of a fixed-size binary search. * The implementation makes sure that the number of predicate evaluations only - * depends on `length` and not on the actual position of the partition point. - * It assumes that the predicate partitions the range [offset, offset + length) - * into two subranges [offset, middle), [middle, offset + length) such that - * the predicate is `false` for all elements in the first range and `true` for - * all elements in the second range. `middle` is called the partition point. - * If the predicate is `false` everywhere, `middle` equals `offset + length`. + * depends on `size` and not on the actual position of the partition point. + * It assumes that the predicate partitions the range [0, size) into two + * subranges [0, middle), [middle, size) such that the predicate is `false` for + * all elements in the first range and `true` for all elements in the second + * range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `size`. * * @tparam size the length of the partitioned range - must be a power of two * @param p the predicate to be evaluated on the range - it should not have @@ -103,12 +103,12 @@ __forceinline__ __device__ int synchronous_fixed_binary_search(Predicate p) * @internal * Generic implementation of a synchronous binary search. * The implementation makes sure that the number of predicate evaluations only - * depends on `length` and not on the actual position of the partition point. 
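For illustration, one concrete instance of the contract described here: with size = 8, data = {1, 2, 3, 5, 7, 8, 9, 9} and the predicate p(i) = (data[i] >= 5), p is false on [0, 3) and true on [3, 8), so the search returns the partition point middle = 3; if p were false everywhere, it would return 8 (= size).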

From c20b44d38c2d321b20c1eccd6a70f4bdeebc7b03 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai"
Date: Mon, 7 Mar 2022 14:59:14 +0100
Subject: [PATCH 4/6] update documentation

Co-authored-by: Fritz Goebel
---
 common/cuda_hip/components/searching.hpp.inc | 24 ++++++++++----------
 dpcpp/components/searching.dp.hpp            | 24 ++++++++++----------
 dpcpp/preconditioner/isai_kernels.dp.cpp     |  2 --
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/common/cuda_hip/components/searching.hpp.inc b/common/cuda_hip/components/searching.hpp.inc
index 37c41c8aab6..acc06a5d5a4 100644
--- a/common/cuda_hip/components/searching.hpp.inc
+++ b/common/cuda_hip/components/searching.hpp.inc
@@ -66,12 +66,12 @@ __forceinline__ __device__ IndexType binary_search(IndexType offset,
  * @internal
  * Generic implementation of a fixed-size binary search.
  * The implementation makes sure that the number of predicate evaluations only
- * depends on `length` and not on the actual position of the partition point.
- * It assumes that the predicate partitions the range [offset, offset + length)
- * into two subranges [offset, middle), [middle, offset + length) such that
- * the predicate is `false` for all elements in the first range and `true` for
- * all elements in the second range. `middle` is called the partition point.
- * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false` for
+ * all elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
  *
  * @tparam size the length of the partitioned range - must be a power of two
  * @param p the predicate to be evaluated on the range - it should not have
@@ -103,12 +103,12 @@ __forceinline__ __device__ int synchronous_fixed_binary_search(Predicate p)
  * @internal
  * Generic implementation of a synchronous binary search.
  * The implementation makes sure that the number of predicate evaluations only
- * depends on `length` and not on the actual position of the partition point.
- * It assumes that the predicate partitions the range [offset, offset + length)
- * into two subranges [offset, middle), [middle, offset + length) such that
- * the predicate is `false` for all elements in the first range and `true` for
- * all elements in the second range. `middle` is called the partition point.
- * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false` for
+ * all elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
  *
  * @param size the length of the partitioned range - must be a power of two
  * @param p the predicate to be evaluated on the range - it should not have
diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp
index 4db6f1f7073..8a28579f6c5 100644
--- a/dpcpp/components/searching.dp.hpp
+++ b/dpcpp/components/searching.dp.hpp
@@ -82,12 +82,12 @@ __dpct_inline__ IndexType binary_search(IndexType offset, IndexType length,
  * @internal
  * Generic implementation of a fixed-size binary search.
  * The implementation makes sure that the number of predicate evaluations only
- * depends on `length` and not on the actual position of the partition point.
- * It assumes that the predicate partitions the range [offset, offset + length)
- * into two subranges [offset, middle), [middle, offset + length) such that
- * the predicate is `false` for all elements in the first range and `true` for
- * all elements in the second range. `middle` is called the partition point.
- * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false` for
+ * all elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
  *
  * @tparam size the length of the partitioned range - must be a power of two
  * @param p the predicate to be evaluated on the range - it should not have
@@ -119,12 +119,12 @@ __dpct_inline__ int synchronous_fixed_binary_search(Predicate p)
  * @internal
  * Generic implementation of a synchronous binary search.
  * The implementation makes sure that the number of predicate evaluations only
- * depends on `length` and not on the actual position of the partition point.
- * It assumes that the predicate partitions the range [offset, offset + length)
- * into two subranges [offset, middle), [middle, offset + length) such that
- * the predicate is `false` for all elements in the first range and `true` for
- * all elements in the second range. `middle` is called the partition point.
- * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false` for
+ * all elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
  *
  * @param size the length of the partitioned range - must be a power of two
  * @param p the predicate to be evaluated on the range - it should not have
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index 1b6c33e4500..be16e3f7070 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -139,8 +139,6 @@ __dpct_inline__ void generic_generate(
     }
 
     // subwarp_size^2 storage per subwarp
-
-
     auto dense_system_ptr =
         *storage + (item_ct1.get_local_id(2) / subwarp_size) * subwarp_size *
                        subwarp_size;
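
The rewritten comments describe the standard partition-point contract: the
predicate is `false` on `[0, middle)` and `true` on `[middle, size)`, and the
search returns `middle`, or `size` if the predicate is never `true`. Below is
a host-side sketch of that contract, not the warp-synchronous device
implementation; like the device code, it evaluates the predicate
`log2(size) + 1` times regardless of where the partition point lies, so `size`
must be a power of two.

    #include <cassert>

    // Returns the partition point of [0, size): the first index at which the
    // predicate is true, or size if the predicate is false everywhere.
    template <typename Predicate>
    int partition_point(int size, Predicate p)
    {
        int pos = 0;
        for (int step = size / 2; step > 0; step /= 2) {
            if (!p(pos + step)) {
                pos += step;  // the partition point lies beyond pos + step
            }
        }
        // pos is the last index of the false prefix, unless there is none
        return p(pos) ? pos : pos + 1;
    }

    int main()
    {
        const int data[] = {1, 3, 5, 8, 13, 21, 34, 55};
        // false for data[0..3], true for data[4..7] -> middle == 4
        assert(partition_point(8, [&](int i) { return data[i] >= 10; }) == 4);
        // predicate false everywhere -> middle == size
        assert(partition_point(8, [&](int i) { return data[i] > 100; }) == 8);
        return 0;
    }
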

From 3cd455066d3b0a2d99562c56dba27d5c790218b0 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai"
Date: Mon, 7 Mar 2022 17:15:04 +0100
Subject: [PATCH 5/6] cygwin -j2 for out of memory allocation issue

---
 .github/workflows/windows-cygwin.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/windows-cygwin.yml b/.github/workflows/windows-cygwin.yml
index 43b87414cf3..16c48aaf425 100644
--- a/.github/workflows/windows-cygwin.yml
+++ b/.github/workflows/windows-cygwin.yml
@@ -53,7 +53,7 @@ jobs:
         mkdir build
         cd build
         bash -c "cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_COMPILER_FLAGS=${{ matrix.config.cflags }} .."
-        bash -c "make -j4"
+        bash -c "make -j2"
         bash -c "ctest . --output-on-failure"
       shell: cmd
 

From a5b28d7732529cb107f2c7194ff8b7854f9e5716 Mon Sep 17 00:00:00 2001
From: ginkgo-bot
Date: Tue, 8 Mar 2022 09:05:09 +0000
Subject: [PATCH 6/6] Format files

Co-authored-by: Yuhsiang M. Tsai
---
 dpcpp/components/intrinsics.dp.hpp         | 2 +-
 dpcpp/components/merging.dp.hpp            | 2 +-
 dpcpp/components/searching.dp.hpp          | 2 +-
 dpcpp/components/warp_blas.dp.hpp          | 2 +-
 dpcpp/test/preconditioner/isai_kernels.cpp | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp
index 2466666b37d..61a26e7555b 100644
--- a/dpcpp/components/intrinsics.dp.hpp
+++ b/dpcpp/components/intrinsics.dp.hpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp
index 5303b11ea0f..3b20b6fd5c6 100644
--- a/dpcpp/components/merging.dp.hpp
+++ b/dpcpp/components/merging.dp.hpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp
index 8a28579f6c5..df65f8ae93c 100644
--- a/dpcpp/components/searching.dp.hpp
+++ b/dpcpp/components/searching.dp.hpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp
index 7754cfb260d..b31fd0ef4fd 100644
--- a/dpcpp/components/warp_blas.dp.hpp
+++ b/dpcpp/components/warp_blas.dp.hpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/dpcpp/test/preconditioner/isai_kernels.cpp b/dpcpp/test/preconditioner/isai_kernels.cpp
index 73664274efe..fe166c13f59 100644
--- a/dpcpp/test/preconditioner/isai_kernels.cpp
+++ b/dpcpp/test/preconditioner/isai_kernels.cpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without