From 5e7a858a0e1573e16b3b8fecf4fa54339b4718a5 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai"
Date: Wed, 24 Nov 2021 15:33:50 +0800
Subject: [PATCH 1/6] dpcpp port isai (row_limit=32)

---
 cuda/preconditioner/isai_kernels.cu        |   3 +-
 dev_tools/oneapi/convert_source.sh         |   3 +-
 dpcpp/base/math.hpp                        |  61 ++
 dpcpp/components/intrinsics.dp.hpp         |  88 +++
 dpcpp/components/merging.dp.hpp            | 337 ++++++++++
 dpcpp/components/reduction.dp.hpp          |   4 +-
 dpcpp/components/searching.dp.hpp          | 260 ++++++++
 dpcpp/components/warp_blas.dp.hpp          | 474 ++++++++++++++
 dpcpp/preconditioner/isai_kernels.dp.cpp   | 670 +++++++++++++++++++-
 dpcpp/test/CMakeLists.txt                  |   1 +
 dpcpp/test/preconditioner/CMakeLists.txt   |   1 +
 dpcpp/test/preconditioner/isai_kernels.cpp | 683 +++++++++++++++++++++
 hip/preconditioner/isai_kernels.hip.cpp    |   4 +-
 13 files changed, 2550 insertions(+), 39 deletions(-)
 create mode 100644 dpcpp/base/math.hpp
 create mode 100644 dpcpp/components/intrinsics.dp.hpp
 create mode 100644 dpcpp/components/merging.dp.hpp
 create mode 100644 dpcpp/components/searching.dp.hpp
 create mode 100644 dpcpp/components/warp_blas.dp.hpp
 create mode 100644 dpcpp/test/preconditioner/CMakeLists.txt
 create mode 100644 dpcpp/test/preconditioner/isai_kernels.cpp

diff --git a/cuda/preconditioner/isai_kernels.cu b/cuda/preconditioner/isai_kernels.cu
index 3cc0bebf123..96808c8bf03 100644
--- a/cuda/preconditioner/isai_kernels.cu
+++ b/cuda/preconditioner/isai_kernels.cu
@@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/thread_ids.cuh"
 #include "cuda/components/uninitialized_array.hpp"
+#include "cuda/components/warp_blas.cuh"
 
 
 namespace gko {
@@ -66,8 +67,6 @@ constexpr int subwarps_per_block{2};
 constexpr int default_block_size{subwarps_per_block * subwarp_size};
 
 
-#include "common/cuda_hip/components/atomic.hpp.inc"
-#include "common/cuda_hip/components/warp_blas.hpp.inc"
 #include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc"
 
diff --git a/dev_tools/oneapi/convert_source.sh b/dev_tools/oneapi/convert_source.sh
index b0ed62252f7..209b9d7e8e6 100755
--- a/dev_tools/oneapi/convert_source.sh
+++ b/dev_tools/oneapi/convert_source.sh
@@ -16,8 +16,9 @@
 # If GTEST_HEADER_DIR is available elsewhere GINKGO_BUILD_TESTS is not required.
 # CMake's step is not required if copying the ginkgo config.hpp from another ginkgo build into "${ROOT_DIR}/include/ginkgo/".
 # ROOT_BUILD_DIR: the complete path for build folder. The default is "${ROOT_DIR}/${BUILD_DIR}"
-# GTEST_HEADER_DIR: the gtest header folder. The default is "${ROOT_BUILD_DIR}/third_party_gtest/src/googletest/include"
+# GTEST_HEADER_DIR: the gtest header folder. The default is "${ROOT_BUILD_DIR}/_deps/googletest-src/googletest/include"
 # CLANG_FORMAT: the clang-format exec. The default is "clang-format"
+# VERBOSE: if it is set to 1, the script will output the path information
 CURRENT_DIR="$( pwd )"
 cd "$( dirname "${BASH_SOURCE[0]}" )"
 SCRIPT_DIR="$( pwd )"
diff --git a/dpcpp/base/math.hpp b/dpcpp/base/math.hpp
new file mode 100644
index 00000000000..a9fa1a38a39
--- /dev/null
+++ b/dpcpp/base/math.hpp
@@ -0,0 +1,61 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_BASE_MATH_HPP_
#define GKO_DPCPP_BASE_MATH_HPP_


#include <limits>


#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/base/types.hpp>


namespace gko {
namespace kernels {
namespace dpcpp {


template <typename T>
struct device_numeric_limits {
    static constexpr auto inf = std::numeric_limits<T>::infinity();
    static constexpr auto max = std::numeric_limits<T>::max();
    static constexpr auto min = std::numeric_limits<T>::min();
};


}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko

#endif  // GKO_DPCPP_BASE_MATH_HPP_
diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp
new file mode 100644
index 00000000000..c0e5a445120
--- /dev/null
+++ b/dpcpp/components/intrinsics.dp.hpp
@@ -0,0 +1,88 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_COMPONENTS_INTRINSICS_DP_HPP_
#define GKO_DPCPP_COMPONENTS_INTRINSICS_DP_HPP_


#include <CL/sycl.hpp>


#include <ginkgo/core/base/types.hpp>


#include "dpcpp/base/dpct.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {


/**
 * @internal
 * Returns the number of set bits in the given mask.
 */
__dpct_inline__ int popcnt(uint32 mask) { return sycl::popcount(mask); }

/** @copydoc popcnt */
__dpct_inline__ int popcnt(uint64 mask) { return sycl::popcount(mask); }


/**
 * @internal
 * Returns the (1-based!) index of the first set bit in the given mask,
 * starting from the least significant bit.
 */
__dpct_inline__ int ffs(uint32 mask)
{
    // isolate the lowest set bit before locating it with clz, so the result
    // refers to the least significant set bit; sycl::clz(0) == 32, so an
    // empty mask correctly yields 0
    return 32 - sycl::clz(mask & (~mask + 1));
}

/** @copydoc ffs */
__dpct_inline__ int ffs(uint64 mask)
{
    // same bit isolation as above; sycl::clz(0) == 64 for 64-bit masks
    return 64 - sycl::clz(mask & (~mask + 1));
}


/**
 * @internal
 * Returns the number of zero bits before the first set bit in the given mask,
 * starting from the most significant bit.
 */
__dpct_inline__ int clz(uint32 mask) { return sycl::clz(mask); }

/** @copydoc clz */
__dpct_inline__ int clz(uint64 mask) { return sycl::clz(mask); }


}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko


#endif  // GKO_DPCPP_COMPONENTS_INTRINSICS_DP_HPP_
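To make the bit conventions above concrete: for mask = 0b101000u (bits 3 and
5 set), popcnt returns 2, ffs returns 4 (the 1-based position of bit 3), and
clz returns 26 (the number of leading zeros above bit 5). A hypothetical
host-side check, assuming the SYCL built-ins are callable on the host as in
DPC++ (not part of the patch):

    #include <cassert>
    #include <cstdint>
    #include <CL/sycl.hpp>

    int main()
    {
        const std::uint32_t mask = 0b101000u;  // bits 3 and 5 set
        assert(sycl::popcount(mask) == 2);     // two set bits
        // lowest set bit is bit 3 -> 1-based index 4
        assert(32 - sycl::clz(mask & (~mask + 1u)) == 4);
        assert(sycl::clz(mask) == 26);         // 26 zeros above bit 5
    }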
diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp
new file mode 100644
index 00000000000..67ee87f1b60
--- /dev/null
+++ b/dpcpp/components/merging.dp.hpp
@@ -0,0 +1,337 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_COMPONENTS_MERGING_DP_HPP_
#define GKO_DPCPP_COMPONENTS_MERGING_DP_HPP_


#include <CL/sycl.hpp>


#include <ginkgo/core/base/math.hpp>


#include "core/base/utils.hpp"
#include "dpcpp/base/dpct.hpp"
#include "dpcpp/base/math.hpp"
#include "dpcpp/components/intrinsics.dp.hpp"
#include "dpcpp/components/searching.dp.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {
namespace detail {


/**
 * @internal
 * The result from the @ref group_merge_step function.
 */
template <typename ValueType>
struct merge_result {
    /** The element of a being merged in the current thread. */
    ValueType a_val;
    /** The element of b being merged in the current thread. */
    ValueType b_val;
    /** The index from a that is being merged in the current thread. */
    int a_idx;
    /** The index from b that is being merged in the current thread. */
    int b_idx;
    /** The number of elements from a that have been merged in total. */
    int a_advance;
    /** The number of elements from b that have been merged in total. */
    int b_advance;
};

}  // namespace detail


/**
 * @internal
 * Warp-parallel merge algorithm that merges the first `warp_size` elements
 * from two ranges, where each thread stores a single element from each range.
 * It assumes that the elements are sorted in ascending order, i.e. for i < j,
 * the value of `a` at thread i is smaller or equal to the value at thread j,
 * and the same holds for `b`.
 *
 * This implementation is based on ideas from Green et al.,
 * "GPU merge path: a GPU merging algorithm", but uses random-access warp
 * shuffles instead of shared memory to exchange values of a and b.
 *
 * @param a      the element from the first range
 * @param b      the element from the second range
 * @param group  the cooperative group that executes the merge
 * @return a structure containing the merge result distributed over the group.
 */
template <int group_size, typename ValueType, typename Group>
__dpct_inline__ detail::merge_result<ValueType> group_merge_step(ValueType a,
                                                                 ValueType b,
                                                                 Group group)
{
    // thread i takes care of ith element of the merged sequence
    auto i = int(group.thread_rank());

    // we want to find the smallest index `x` such that a[x] >= b[i - x - 1]
    // or `i` if no such index exists
    //
    // if x = i then c[0...i - 1] = a[0...i - 1]
    //     => merge a[i] with b[0]
    // if x = 0 then c[0...i - 1] = b[0...i - 1]
    //     => merge a[0] with b[i]
    // otherwise c[0...i - 1] contains a[0...x - 1] and b[0...i - x - 1]
    // because the minimality of `x` implies
    //     b[i - x] >= a[x - 1]
    // and a[x] >= a[0...x - 1], b[0...i - x - 1]
    //     => merge a[x] with b[i - x]
    auto minx = synchronous_fixed_binary_search<group_size>([&](int x) {
        auto a_remote = group.shfl(a, x);
        auto b_remote = group.shfl(b, max(i - x - 1, 0));
        return a_remote >= b_remote || x >= i;
    });

    auto a_idx = minx;
    auto b_idx = max(i - minx, 0);
    auto a_val = group.shfl(a, a_idx);
    auto b_val = group.shfl(b, b_idx);
    auto cmp = a_val < b_val;
    auto a_advance = popcnt(group.ballot(cmp));
    auto b_advance = int(group.size()) - a_advance;

    return {a_val, b_val, a_idx, b_idx, a_advance, b_advance};
}


/**
 * @internal
 * Warp-parallel merge algorithm that merges two sorted ranges of arbitrary
 * size. `merge_fn` will be called for each merged element.
 *
 * @param a        the first range
 * @param a_size   the size of the first range
 * @param b        the second range
 * @param b_size   the size of the second range
 * @param group    the group that executes the merge
 * @param merge_fn the callback that is being called for each merged element.
 *                 It takes six parameters:
 *                 `IndexType a_idx, ValueType a_val, IndexType b_idx,
 *                 ValueType b_val, IndexType c_index, bool valid`.
 *                 `*_val` and `*_idx` are the value and the index,
 *                 respectively, of the element from a/b being compared at
 *                 output index `c_index`.
 *                 `valid` specifies if the current thread has to merge an
 *                 element (this is necessary for shfl and ballot operations).
 *                 It must return `false` on all threads of the group iff the
 *                 merge shouldn't be continued.
 */
template <int group_size, typename ValueType, typename IndexType,
          typename Group, typename Callback>
__dpct_inline__ void group_merge(const ValueType* __restrict__ a,
                                 IndexType a_size,
                                 const ValueType* __restrict__ b,
                                 IndexType b_size, Group group,
                                 Callback merge_fn)
{
    auto c_size = a_size + b_size;
    IndexType a_begin{};
    IndexType b_begin{};
    auto lane = static_cast<IndexType>(group.thread_rank());
    auto sentinel = device_numeric_limits<ValueType>::max;
    auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel);
    auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel);
    for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) {
        auto merge_result = group_merge_step<group_size>(a_cur, b_cur, group);
        auto valid = c_begin + lane < c_size;
        auto cont = merge_fn(merge_result.a_idx + a_begin, merge_result.a_val,
                             merge_result.b_idx + b_begin, merge_result.b_val,
                             c_begin + lane, valid);
        if (!group.any(cont && valid)) {
            break;
        }
        auto a_advance = merge_result.a_advance;
        auto b_advance = merge_result.b_advance;
        a_begin += a_advance;
        b_begin += b_advance;

        // shuffle the unmerged elements to the front
        a_cur = group.shfl_down(a_cur, a_advance);
        b_cur = group.shfl_down(b_cur, b_advance);
        /*
         * To optimize memory access, we load the new elements for `a` and `b`
         * with a single load instruction:
         * the lower part of the group loads new elements for `a`
         * the upper part of the group loads new elements for `b`
         * `load_lane` is the part-local lane idx
         * The elements for `a` have to be shuffled up afterwards.
         */
        auto load_a = lane < a_advance;
        auto load_lane = load_a ? lane : lane - a_advance;
        auto load_source = load_a ? a : b;
        auto load_begin = load_a ? a_begin + b_advance : b_begin + a_advance;
        auto load_size = load_a ? a_size : b_size;

        auto load_idx = load_begin + load_lane;
        auto loaded = checked_load(load_source, load_idx, load_size, sentinel);
        // shuffle the `a` values to the end of the warp
        auto lower_loaded = group.shfl_up(loaded, b_advance);
        a_cur = lane < b_advance ? a_cur : lower_loaded;
        b_cur = lane < a_advance ? b_cur : loaded;
    }
}


/**
 * @internal
 * Warp-parallel merge algorithm that reports matching elements from two
 * sorted ranges of arbitrary size. `match_fn` will be called for each pair of
 * matching elements.
 *
 * @param a        the first range
 * @param a_size   the size of the first range
 * @param b        the second range
 * @param b_size   the size of the second range
 * @param group    the group that executes the merge
 * @param match_fn the callback that is being called for each matching pair.
 *                 It takes five parameters:
 *                 `ValueType val, IndexType a_idx, IndexType b_idx,
 *                 lane_mask_type match_mask, bool valid`.
 *                 `val` is the matching element, `*_idx` are the indices of
 *                 the matching values from a and b, match_mask is a lane mask
 *                 that is 1 for every subwarp lane that found a match.
 *                 `valid` is true iff there is actually a match
 *                 (necessary for warp-synchronous operations)
 */
template <int group_size, typename ValueType, typename IndexType,
          typename Group, typename Callback>
__dpct_inline__ void group_match(const ValueType* __restrict__ a,
                                 IndexType a_size,
                                 const ValueType* __restrict__ b,
                                 IndexType b_size, Group group,
                                 Callback match_fn)
{
    group_merge<group_size>(
        a, a_size, b, b_size, group,
        [&](IndexType a_idx, ValueType a_val, IndexType b_idx, ValueType b_val,
            IndexType, bool valid) {
            auto matchmask = group.ballot(a_val == b_val && valid);
            match_fn(a_val, a_idx, b_idx, matchmask, a_val == b_val && valid);
            return a_idx < a_size && b_idx < b_size;
        });
}


/**
 * @internal
 * Sequential merge algorithm that merges two sorted ranges of arbitrary
 * size. `merge_fn` will be called for each merged element.
 *
 * @param a        the first range
 * @param a_size   the size of the first range
 * @param b        the second range
 * @param b_size   the size of the second range
 * @param merge_fn the callback that will be called for each merge step.
 *                 It takes five parameters:
 *                 `IndexType a_idx, ValueType a_val,
 *                 IndexType b_idx, ValueType b_val, IndexType c_idx`.
 *                 `*_val` and `*_idx` are the value and the index,
 *                 respectively, of the element from a/b being compared in
 *                 step `c_idx`.
 *                 It must return `false` iff the merge should stop.
 */
template <typename ValueType, typename IndexType, typename Callback>
__dpct_inline__ void sequential_merge(const ValueType* __restrict__ a,
                                      IndexType a_size,
                                      const ValueType* __restrict__ b,
                                      IndexType b_size, Callback merge_fn)
{
    auto c_size = a_size + b_size;
    IndexType a_begin{};
    IndexType b_begin{};
    auto sentinel = device_numeric_limits<ValueType>::max;
    auto a_cur = checked_load(a, a_begin, a_size, sentinel);
    auto b_cur = checked_load(b, b_begin, b_size, sentinel);
    for (IndexType c_begin{}; c_begin < c_size; c_begin++) {
        auto cont = merge_fn(a_begin, a_cur, b_begin, b_cur, c_begin);
        if (!cont) {
            break;
        }
        auto a_advance = a_cur < b_cur;
        auto b_advance = !a_advance;
        a_begin += a_advance;
        b_begin += b_advance;

        auto load = a_advance ? a : b;
        auto load_size = a_advance ? a_size : b_size;
        auto load_idx = a_advance ? a_begin : b_begin;
        auto loaded = checked_load(load, load_idx, load_size, sentinel);
        a_cur = a_advance ? loaded : a_cur;
        b_cur = b_advance ? loaded : b_cur;
    }
}


/**
 * @internal
 * Sequential algorithm that finds matching elements in two sorted ranges of
 * arbitrary size. `match_fn` will be called for each pair of matching
 * elements.
 *
 * @param a        the first range
 * @param a_size   the size of the first range
 * @param b        the second range
 * @param b_size   the size of the second range
 * @param match_fn the callback that is being called for each match.
 *                 It takes three parameters:
 *                 `ValueType val, IndexType a_idx, IndexType b_idx`.
 *                 `val` is the matching element, `*_idx` are the
 *                 indices of the matching values from a and b.
 */
template <typename ValueType, typename IndexType, typename Callback>
__dpct_inline__ void sequential_match(const ValueType* a, IndexType a_size,
                                      const ValueType* b, IndexType b_size,
                                      Callback match_fn)
{
    sequential_merge(a, a_size, b, b_size,
                     [&](IndexType a_idx, ValueType a_val, IndexType b_idx,
                         ValueType b_val, IndexType) {
                         if (a_val == b_val) {
                             match_fn(a_val, a_idx, b_idx);
                         }
                         return a_idx < a_size && b_idx < b_size;
                     });
}

}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko


#endif  // GKO_DPCPP_COMPONENTS_MERGING_DP_HPP_
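As a concrete illustration of the semantics shared by group_match and
sequential_match: merging a = {1, 3, 5} with b = {2, 3, 6} reports the single
match 3 at a_idx = 1, b_idx = 1. A simplified host-side analogue (hypothetical
reference code, not part of the patch; it advances both ranges on a match
instead of emulating the sentinel-based loop above):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    template <typename T, typename Fn>
    void sequential_match_ref(const std::vector<T>& a, const std::vector<T>& b,
                              Fn match_fn)
    {
        std::size_t i = 0;
        std::size_t j = 0;
        while (i < a.size() && j < b.size()) {
            if (a[i] == b[j]) {
                match_fn(a[i], i, j);
                ++i;
                ++j;
            } else if (a[i] < b[j]) {
                ++i;  // advance the range with the smaller front element
            } else {
                ++j;
            }
        }
    }

    int main()
    {
        sequential_match_ref<int>(
            {1, 3, 5}, {2, 3, 6}, [](int val, std::size_t i, std::size_t j) {
                // prints: match 3 at a[1], b[1]
                std::printf("match %d at a[%zu], b[%zu]\n", val, i, j);
            });
    }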
diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp
index b1eb86a1992..74cbb66c451 100644
--- a/dpcpp/components/reduction.dp.hpp
+++ b/dpcpp/components/reduction.dp.hpp
@@ -114,8 +114,8 @@ __dpct_inline__ int choose_pivot(const Group& group, ValueType local_data,
 {
     using real = remove_complex<ValueType>;
     real lmag = is_pivoted ? -one<real>() : abs(local_data);
-    const auto pivot =
-        reduce(group, group.thread_rank(), [&](int lidx, int ridx) {
+    const auto pivot = ::gko::kernels::dpcpp::reduce(
+        group, group.thread_rank(), [&](int lidx, int ridx) {
             const auto rmag = group.shfl(lmag, ridx);
             if (rmag > lmag) {
                 lmag = rmag;
diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp
new file mode 100644
index 00000000000..4db6f1f7073
--- /dev/null
+++ b/dpcpp/components/searching.dp.hpp
@@ -0,0 +1,260 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_COMPONENTS_SEARCHING_DP_HPP_
#define GKO_DPCPP_COMPONENTS_SEARCHING_DP_HPP_


#include <CL/sycl.hpp>


#include "dpcpp/base/config.hpp"
#include "dpcpp/base/dpct.hpp"
#include "dpcpp/components/intrinsics.dp.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {


/**
 * @internal
 * Generic binary search that finds the first index where a predicate is true.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 * The implementation is based on Stepanov & McJones, "Elements of
 * Programming".
 *
 * @param offset the starting index of the partitioned range
 * @param length the length of the partitioned range
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `IndexType` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <typename IndexType, typename Predicate>
__dpct_inline__ IndexType binary_search(IndexType offset, IndexType length,
                                        Predicate p)
{
    while (length > 0) {
        auto half_length = length / 2;
        auto mid = offset + half_length;
        auto pred = p(mid);
        length = pred ? half_length : length - (half_length + 1);
        offset = pred ? offset : mid + 1;
    }
    return offset;
}


/**
 * @internal
 * Generic implementation of a fixed-size binary search.
 * The implementation makes sure that the number of predicate evaluations only
 * depends on `length` and not on the actual position of the partition point.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 *
 * @tparam size the length of the partitioned range - must be a power of two
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `int` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <int size, typename Predicate>
__dpct_inline__ int synchronous_fixed_binary_search(Predicate p)
{
    if (size == 0) {
        return 0;
    }
    int begin{};
    static_assert(size > 0, "size must be positive");
    static_assert(!(size & (size - 1)), "size must be a power of two");
#pragma unroll
    for (auto cur_size = size; cur_size > 1; cur_size /= 2) {
        auto half_size = cur_size / 2;
        auto mid = begin + half_size;
        // invariant: [begin, begin + cur_size] contains partition point
        begin = p(mid) ? begin : mid;
    }
    // cur_size is now 1, so the partition point is either begin or begin + 1
    return p(begin) ? begin : begin + 1;
}


/**
 * @internal
 * Generic implementation of a synchronous binary search.
 * The implementation makes sure that the number of predicate evaluations only
 * depends on `length` and not on the actual position of the partition point.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 *
 * @param size the length of the partitioned range - must be a power of two
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `int` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <typename Predicate>
__dpct_inline__ int synchronous_binary_search(int size, Predicate p)
{
    if (size == 0) {
        return 0;
    }
    int begin{};
    for (auto cur_size = size; cur_size > 1; cur_size /= 2) {
        auto half_size = cur_size / 2;
        auto mid = begin + half_size;
        // invariant: [begin, begin + cur_size] contains partition point
        begin = p(mid) ? begin : mid;
    }
    // cur_size is now 1, so the partition point is either begin or begin + 1
    return p(begin) ? begin : begin + 1;
}


/**
 * @internal
 * Generic search that finds the first index where a predicate is true.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 *
 * It executes `log2(length / group.size())` coalescing calls to `p`.
 *
 * This implementation is based on the w-wide search mentioned in
 * Green et al., "GPU merge path: a GPU merging algorithm"
 *
 * @param offset the starting index of the partitioned range
 * @param length the length of the partitioned range
 * @param group the coalescing group executing the search
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `IndexType` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <typename IndexType, typename Group, typename Predicate>
__dpct_inline__ IndexType group_wide_search(IndexType offset, IndexType length,
                                            Group group, Predicate p)
{
    // binary search on the group-sized blocks
    IndexType num_blocks = (length + group.size() - 1) / group.size();
    auto group_pos = binary_search(IndexType{}, num_blocks, [&](IndexType i) {
        auto idx = i * group.size();
        return p(offset + idx);
    });
    // case 1: p is true everywhere: middle is at the beginning
    if (group_pos == 0) {
        return offset;
    }
    /*
     * case 2: p is false somewhere:
     *
     * p(group_pos * g.size()) is true, so either this is the partition point,
     * or the partition point is one of the g.size() - 1 previous indices.
     *     |block group_pos-1|
     * 0 | 0 * * * * * * * | 1
     *       ^           ^
     * we load this range, with the 1 acting as a sentinel for ffs(...)
     *
     * additionally, this means that we can't call p out-of-bounds
     */
    auto base_idx = (group_pos - 1) * group.size() + 1;
    auto idx = base_idx + group.thread_rank();
    auto pos = ffs(group.ballot(idx >= length || p(offset + idx))) - 1;
    return offset + base_idx + pos;
}


/**
 * @internal
 * Generic search that finds the first index where a predicate is true.
 * It assumes that the predicate partitions the range [offset, offset + length)
 * into two subranges [offset, middle), [middle, offset + length) such that
 * the predicate is `false` for all elements in the first range and `true` for
 * all elements in the second range. `middle` is called the partition point.
 * If the predicate is `false` everywhere, `middle` equals `offset + length`.
 *
 * It executes `log2(length) / log2(group.size())` calls to `p` that
 * effectively follow a random-access pattern.
 *
 * This implementation is based on the w-partition search mentioned in
 * Green et al., "GPU merge path: a GPU merging algorithm"
 *
 * @param offset the starting index of the partitioned range
 * @param length the length of the partitioned range
 * @param group the coalescing group executing the search
 * @param p the predicate to be evaluated on the range - it should not have
 *          side-effects and map from `IndexType` to `bool`
 * @returns the index of `middle`, i.e., the partition point
 */
template <typename IndexType, typename Group, typename Predicate>
__dpct_inline__ IndexType group_ary_search(IndexType offset, IndexType length,
                                           Group group, Predicate p)
{
    IndexType end = offset + length;
    // invariant: [offset, offset + length] contains middle
    while (length > group.size()) {
        auto stride = length / group.size();
        auto idx = offset + group.thread_rank() * stride;
        auto mask = group.ballot(p(idx));
        // if the mask is 0, the partition point is in the last block
        // if the mask is ~0, the partition point is in the first block
        // otherwise, we go to the last block that returned a 0.
        auto pos = mask == 0 ? group.size() - 1 : ffs(mask >> 1) - 1;
        auto last_length = length - stride * (group.size() - 1);
        length = pos == group.size() - 1 ? last_length : stride;
        offset += stride * pos;
    }
    auto idx = offset + group.thread_rank();
    // if the mask is 0, the partition point is at the end
    // otherwise it is the first set bit
    auto mask = group.ballot(idx >= end || p(idx));
    auto pos = mask == 0 ? group.size() : ffs(mask) - 1;
    return offset + pos;
}


}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko


#endif  // GKO_DPCPP_COMPONENTS_SEARCHING_DP_HPP_
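All of the searches above share one contract: the predicate partitions the
range and the routine returns the partition point. A host-side reference of
that contract (hypothetical code for illustration, mirroring binary_search):

    #include <cassert>

    template <typename IndexType, typename Predicate>
    IndexType binary_search_ref(IndexType offset, IndexType length,
                                Predicate p)
    {
        while (length > 0) {
            auto half_length = length / 2;
            auto mid = offset + half_length;
            auto pred = p(mid);
            length = pred ? half_length : length - (half_length + 1);
            offset = pred ? offset : mid + 1;
        }
        return offset;
    }

    int main()
    {
        const int data[] = {1, 2, 4, 7, 9};
        // first index whose element is >= 5: data[3] == 7
        assert(binary_search_ref(0, 5, [&](int i) { return data[i] >= 5; }) ==
               3);
        // predicate false everywhere -> offset + length
        assert(binary_search_ref(0, 5, [&](int i) { return data[i] >= 10; }) ==
               5);
    }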
diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp
new file mode 100644
index 00000000000..7754cfb260d
--- /dev/null
+++ b/dpcpp/components/warp_blas.dp.hpp
@@ -0,0 +1,474 @@
/*************************************************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

#ifndef GKO_DPCPP_COMPONENTS_WARP_BLAS_DP_HPP_
#define GKO_DPCPP_COMPONENTS_WARP_BLAS_DP_HPP_


#include <cassert>
// #include
#include <type_traits>


#include <CL/sycl.hpp>


#include <ginkgo/core/base/math.hpp>


#include "dpcpp/base/dpct.hpp"
#include "dpcpp/components/cooperative_groups.dp.hpp"
#include "dpcpp/components/reduction.dp.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {


/**
 * @internal
 *
 * Defines a postprocessing transformation that should be performed on the
 * result of a function call.
 *
 * @note This functionality should become useless once accessors and ranges are
 *       in place, as they will define the storage scheme.
 */
enum postprocess_transformation { and_return, and_transpose };


/**
 * @internal
 *
 * Applies a Gauss-Jordan transformation (single step of Gauss-Jordan
 * elimination) to a `max_problem_size`-by-`max_problem_size` matrix using the
 * thread group `group`. Each thread contributes one `row` of the matrix, and
 * the routine uses warp shuffles to exchange data between rows. The transform
 * is performed by using the `key_row`-th row and `key_col`-th column of the
 * matrix.
 */
template <
    int max_problem_size, typename Group, typename ValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void apply_gauss_jordan_transform(
    const Group& __restrict__ group, int32 key_row, int32 key_col,
    ValueType* __restrict__ row, bool& __restrict__ status)
{
    auto key_col_elem = group.shfl(row[key_col], key_row);
    if (key_col_elem == zero<ValueType>()) {
        // TODO: implement error handling for GPUs to be able to properly
        //       report it here
        status = false;
        return;
    }
    if (group.thread_rank() == key_row) {
        key_col_elem = one<ValueType>() / key_col_elem;
    } else {
        key_col_elem = -row[key_col] / key_col_elem;
    }
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        const auto key_row_elem = group.shfl(row[i], key_row);
        if (group.thread_rank() == key_row) {
            row[i] = zero<ValueType>();
        }
        row[i] += key_col_elem * key_row_elem;
    }
    row[key_col] = key_col_elem;
}


/**
 * @internal
 *
 * Applies a Gauss-Jordan transformation (single step of Gauss-Jordan
 * elimination) to a `max_problem_size`-by-`max_problem_size` matrix using the
 * thread group `group`. Each thread contributes one `row` of the matrix, and
 * the routine uses warp shuffles to exchange data between rows. The transform
 * is performed by using the `key_row`-th row and `key_col`-th column of the
 * matrix.
 * Works with one right hand side vector `rhs` which can be directly worked on
 * when solving Ax = rhs without the need of storing the inverse of A.
 */
template <
    int max_problem_size, typename Group, typename ValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void apply_gauss_jordan_transform_with_rhs(
    const Group& __restrict__ group, int32 key_row, int32 key_col,
    ValueType* __restrict__ row, ValueType* __restrict__ rhs,
    bool& __restrict__ status)
{
    auto key_col_elem = group.shfl(row[key_col], key_row);
    auto key_rhs_elem = group.shfl(rhs[0], key_row);
    if (key_col_elem == zero<ValueType>()) {
        // TODO: implement error handling for GPUs to be able to properly
        //       report it here
        status = false;
        return;
    }
    if (group.thread_rank() == key_row) {
        key_col_elem = one<ValueType>() / key_col_elem;
        rhs[0] = key_rhs_elem * key_col_elem;
    } else {
        key_col_elem = -row[key_col] / key_col_elem;
        rhs[0] += key_rhs_elem * key_col_elem;
    }
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        const auto key_row_elem = group.shfl(row[i], key_row);
        if (group.thread_rank() == key_row) {
            row[i] = zero<ValueType>();
        }
        // rhs[0] += key_rhs_elem * key_row_elem;
        row[i] += key_col_elem * key_row_elem;
    }
    row[key_col] = key_col_elem;
}
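
// As a sanity check on the transform-with-rhs semantics: applying the
// transformation once per pivot reduces A to (a permutation of) the identity
// while rhs turns into the solution of Ax = rhs. A sequential analogue
// (hypothetical host-side sketch, without the pivoting used below):
//
//     void gauss_jordan_solve(double a[2][2], double rhs[2])
//     {
//         for (int piv = 0; piv < 2; ++piv) {
//             const double d = a[piv][piv];
//             for (int row = 0; row < 2; ++row) {
//                 if (row == piv) continue;
//                 const double factor = a[row][piv] / d;
//                 for (int col = 0; col < 2; ++col) {
//                     a[row][col] -= factor * a[piv][col];
//                 }
//                 rhs[row] -= factor * rhs[piv];
//             }
//         }
//         for (int row = 0; row < 2; ++row) {
//             rhs[row] /= a[row][row];
//         }
//     }
//
// For a = [[2, 1], [1, 3]] and rhs = [5, 10] this yields rhs = [1, 3], the
// solution of the system.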


/**
 * @internal
 *
 * Inverts a matrix using Gauss-Jordan elimination. The inversion is
 * done in-place, so the original matrix will be overridden with the inverse.
 * The inversion routine uses implicit pivoting, so the returned matrix will be
 * a permuted inverse (from both sides). To obtain the correct inverse, the
 * rows of the result should be permuted with $P$, and the columns with
 * $ P^T $ (i.e. $ A^{-1} = P X P $, where $ X $ is the returned matrix). These
 * permutation matrices are returned compressed as vectors `perm` and
 * `trans_perm`, respectively. The `i`-th value of each of the vectors is
 * returned to the thread of the group with rank `i`.
 *
 * @tparam max_problem_size the maximum problem size that will be passed to the
 *                          inversion routine (a tighter bound results in
 *                          faster code)
 * @tparam Group type of the group of threads
 * @tparam ValueType type of values stored in the matrix
 *
 * @param group the group of threads which participate in the inversion
 * @param problem_size the actual size of the matrix (cannot be larger than
 *                     max_problem_size)
 * @param row a pointer to the matrix row (i-th thread in the group should
 *            pass the pointer to the i-th row), has to have at least
 *            max_problem_size elements
 * @param perm a value to hold an element of permutation matrix $ P $
 * @param trans_perm a value to hold an element of permutation matrix $ P^T $
 *
 * @return true if the inversion succeeded, false otherwise
 */
template <
    int max_problem_size, typename Group, typename ValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ bool invert_block(const Group& __restrict__ group,
                                  uint32 problem_size,
                                  ValueType* __restrict__ row,
                                  uint32& __restrict__ perm,
                                  uint32& __restrict__ trans_perm)
{
    GKO_ASSERT(problem_size <= max_problem_size);
    // prevent rows after problem_size to become pivots
    auto pivoted = group.thread_rank() >= problem_size;
    auto status = true;
#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
#pragma unroll
#else
#pragma unroll 1
#endif
    for (int32 i = 0; i < max_problem_size; ++i) {
        if (i < problem_size) {
            const auto piv = choose_pivot(group, row[i], pivoted);
            if (group.thread_rank() == piv) {
                perm = i;
                pivoted = true;
            }
            if (group.thread_rank() == i) {
                trans_perm = piv;
            }
            apply_gauss_jordan_transform<max_problem_size>(group, piv, i, row,
                                                           status);
        }
    }
    return status;
}


/**
 * @internal
 *
 * Performs the correct index calculation for the given postprocess operation.
 */
template <postprocess_transformation mod, typename T1, typename T2,
          typename T3>
__dpct_inline__ auto get_row_major_index(T1 row, T2 col, T3 stride) ->
    typename std::enable_if<
        mod != and_transpose,
        typename std::decay<decltype(row * stride + col)>::type>::type
{
    return row * stride + col;
}


template <postprocess_transformation mod, typename T1, typename T2,
          typename T3>
__dpct_inline__ auto get_row_major_index(T1 row, T2 col, T3 stride) ->
    typename std::enable_if<
        mod == and_transpose,
        typename std::decay<decltype(col * stride + row)>::type>::type
{
    return col * stride + row;
}


/**
 * @internal
 *
 * Copies a matrix stored as a collection of rows in different threads of the
 * warp in a block of memory accessible by all threads in row-major order.
 * Optionally permutes rows and columns of the matrix in the process.
 *
 * @tparam max_problem_size maximum problem size passed to the routine
 * @tparam mod the transformation to perform on the return data
 * @tparam Group type of the group of threads
 * @tparam SourceValueType type of values stored in the source matrix
 * @tparam ResultValueType type of values stored in the result matrix
 *
 * @param group group of threads participating in the copy
 * @param problem_size actual size of the matrix
 *                     (`problem_size <= max_problem_size`)
 * @param source_row pointer to memory used to store a row of the source
 *                   matrix, the `i`-th thread of the sub-warp should pass in
 *                   the `i`-th row of the matrix
 * @param increment offset between two consecutive elements of the row
 * @param row_perm permutation vector to apply on the rows of the matrix
 *                 (thread `i` supplies the `i`-th value of the vector)
 * @param col_perm permutation vector to apply on the columns of the matrix
 *                 (thread `i` supplies the `i`-th value of the vector)
 * @param destination pointer to memory where the result will be stored
 *                    (all threads supply the same value)
 * @param stride offset between two consecutive rows of the matrix
 */
template <
    int max_problem_size, postprocess_transformation mod = and_return,
    typename Group, typename SourceValueType, typename ResultValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void copy_matrix(const Group& __restrict__ group,
                                 uint32 problem_size,
                                 const SourceValueType* __restrict__ source_row,
                                 uint32 increment, uint32 row_perm,
                                 uint32 col_perm,
                                 ResultValueType* __restrict__ destination,
                                 size_type stride)
{
    GKO_ASSERT(problem_size <= max_problem_size);
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        if (i < problem_size) {
            const auto idx = group.shfl(col_perm, i);
            if (group.thread_rank() < problem_size) {
                // Need to assign a variable for the source_row, or hip
                // will use a lot of VGPRs in unroll. This might lead to
                // problems.
                const auto val = source_row[i * increment];
                destination[get_row_major_index<mod>(idx, row_perm, stride)] =
                    static_cast<ResultValueType>(val);
            }
        }
    }
}


/**
 * @internal
 *
 * Multiplies a transposed vector and a matrix stored in column-major order.
 *
 * In mathematical terms, performs the operation $ res^T = vec^T \cdot mtx$.
 *
 * @tparam max_problem_size maximum problem size passed to the routine
 * @tparam Group type of the group of threads
 * @tparam MatrixValueType type of values stored in the matrix
 * @tparam VectorValueType type of values stored in the vectors
 *
 * @param group group of threads participating in the operation
 * @param problem_size actual size of the matrix
 *                     (`problem_size <= max_problem_size`)
 * @param vec input vector to multiply (thread `i` supplies the `i`-th value of
 *            the vector)
 * @param mtx_row pointer to memory used to store a row of the input matrix,
 *                `i`-th thread of the sub-warp should pass in the
 *                `i`-th row of the matrix
 * @param mtx_increment offset between two consecutive elements of the row
 * @param res pointer to a block of memory where the result will be written
 *            (only thread 0 of the group has to supply a valid value)
 * @param res_increment offset between two consecutive elements of the result
 */
template <
    int max_problem_size, typename Group, typename MatrixValueType,
    typename VectorValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void multiply_transposed_vec(
    const Group& __restrict__ group, uint32 problem_size,
    const VectorValueType& __restrict__ vec,
    const MatrixValueType* __restrict__ mtx_row, uint32 mtx_increment,
    VectorValueType* __restrict__ res, uint32 res_increment)
{
    GKO_ASSERT(problem_size <= max_problem_size);
    auto mtx_elem = zero<VectorValueType>();
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        if (i < problem_size) {
            if (group.thread_rank() < problem_size) {
                mtx_elem =
                    static_cast<VectorValueType>(mtx_row[i * mtx_increment]);
            }
            const auto out = ::gko::kernels::dpcpp::reduce(
                group, mtx_elem * vec,
                [](VectorValueType x, VectorValueType y) { return x + y; });
            if (group.thread_rank() == 0) {
                res[i * res_increment] = out;
            }
        }
    }
}


/**
 * @internal
 *
 * Multiplies a matrix and a vector stored in column-major order.
 *
 * In mathematical terms, performs the operation $res = mtx \cdot vec$.
 *
 * @tparam max_problem_size maximum problem size passed to the routine
 * @tparam Group type of the group of threads
 * @tparam MatrixValueType type of values stored in the matrix
 * @tparam VectorValueType type of values stored in the vectors
 * @tparam Closure type of the function used to write the result
 *
 * @param group group of threads participating in the operation
 * @param problem_size actual size of the matrix
 *                     (`problem_size <= max_problem_size`)
 * @param vec input vector to multiply (thread `i` supplies the `i`-th value of
 *            the vector)
 * @param mtx_row pointer to memory used to store a row of the input matrix,
 *                `i`-th thread of the sub-warp should pass in the
 *                `i`-th row of the matrix
 * @param mtx_increment offset between two consecutive elements of the row
 * @param res pointer to a block of memory where the result will be written
 *            (only thread 0 of the group has to supply a valid value)
 * @param res_increment offset between two consecutive elements of the result
 * @param closure_op Operation that is performed when writing to
 *                   `res[group.thread_rank() * res_increment]` as
 *                   `closure_op(res[group.thread_rank() * res_increment], out)`
 *                   where `out` is the result of $mtx \cdot vec$.
 */
template <
    int max_problem_size, typename Group, typename MatrixValueType,
    typename VectorValueType, typename Closure,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ void multiply_vec(const Group& __restrict__ group,
                                  uint32 problem_size,
                                  const VectorValueType& __restrict__ vec,
                                  const MatrixValueType* __restrict__ mtx_row,
                                  uint32 mtx_increment,
                                  VectorValueType* __restrict__ res,
                                  uint32 res_increment, Closure closure_op)
{
    GKO_ASSERT(problem_size <= max_problem_size);
    auto mtx_elem = zero<VectorValueType>();
    auto out = zero<VectorValueType>();
#pragma unroll
    for (int32 i = 0; i < max_problem_size; ++i) {
        if (i < problem_size) {
            if (group.thread_rank() < problem_size) {
                mtx_elem =
                    static_cast<VectorValueType>(mtx_row[i * mtx_increment]);
            }
            out += mtx_elem * group.shfl(vec, i);
        }
    }
    if (group.thread_rank() < problem_size) {
        closure_op(res[group.thread_rank() * res_increment], out);
    }
}


/**
 * @internal
 *
 * Computes the infinity norm of a matrix. Each thread in the group supplies
 * one row of the matrix.
 *
 * @tparam max_problem_size maximum problem size passed to the routine
 * @tparam Group type of the group of threads
 * @tparam ValueType type of values stored in the matrix
 *
 * @param group group of threads participating in the operation
 * @param num_rows number of rows of the matrix
 *                 (`num_rows <= max_problem_size`)
 * @param num_cols number of columns of the matrix
 * @param row pointer to memory used to store a row of the input matrix,
 *            `i`-th thread of the group should pass in the `i`-th row of the
 *            matrix
 *
 * @return the infinity norm of the matrix
 */
template <
    int max_problem_size, typename Group, typename ValueType,
    typename = std::enable_if_t<group::is_communicator_group<Group>::value>>
__dpct_inline__ remove_complex<ValueType> compute_infinity_norm(
    const Group& group, uint32 num_rows, uint32 num_cols, const ValueType* row)
{
    using result_type = remove_complex<ValueType>;
    auto sum = zero<result_type>();
    if (group.thread_rank() < num_rows) {
#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS
#pragma unroll
#else
#pragma unroll 1
#endif
        for (uint32 i = 0; i < max_problem_size; ++i) {
            if (i < num_cols) {
                sum += abs(row[i]);
            }
        }
    }
    return ::gko::kernels::dpcpp::reduce(
        group, sum, [](result_type x, result_type y) { return max(x, y); });
}


}  // namespace dpcpp
}  // namespace kernels
}  // namespace gko


#endif  // GKO_DPCPP_COMPONENTS_WARP_BLAS_DP_HPP_
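compute_infinity_norm reduces the per-row absolute sums with a max, i.e.
||A||_inf = max_i sum_j |a_ij|. A host-side reference (hypothetical code for
illustration only):

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    // infinity norm = maximum absolute row sum, row-major storage
    double infinity_norm_ref(const double* mtx, int num_rows, int num_cols)
    {
        double result = 0.0;
        for (int row = 0; row < num_rows; ++row) {
            double sum = 0.0;
            for (int col = 0; col < num_cols; ++col) {
                sum += std::abs(mtx[row * num_cols + col]);
            }
            result = std::max(result, sum);
        }
        return result;
    }

    int main()
    {
        const double mtx[] = {1.0, -2.0, 3.0, 0.5};   // [[1, -2], [3, 0.5]]
        assert(infinity_norm_ref(mtx, 2, 2) == 3.5);  // max(3.0, 3.5)
    }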
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index 63812f8ba2b..ef4a64c59e2 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -33,21 +33,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/preconditioner/isai_kernels.hpp"
 
 
-#include 
-#include 
-
-
 #include 
-#include 
 #include 
-#include 
+#include 
 #include 
 
 
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/merging.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/components/uninitialized_array.hpp"
+#include "dpcpp/components/warp_blas.dp.hpp"
 
 
 namespace gko {
@@ -55,32 +59,593 @@ namespace kernels {
 namespace dpcpp {
 /**
  * @brief The Isai preconditioner namespace.
- *
+ * @ref Isai
  * @ingroup isai
  */
 namespace isai {
 
 
-template <typename IndexType, typename Callback>
-void forall_matching(const IndexType* fst, IndexType fst_size,
-                     const IndexType* snd, IndexType snd_size,
-                     Callback cb) GKO_NOT_IMPLEMENTED;
+constexpr int subwarp_size{row_size_limit};
+constexpr int subwarps_per_block{2};
+constexpr int default_block_size{subwarps_per_block * subwarp_size};
+
+
+namespace kernel {
 
 
-template <typename ValueType, typename IndexType, typename Callable>
-void generic_generate(std::shared_ptr<const DpcppExecutor> exec,
-                      const matrix::Csr<ValueType, IndexType>* mtx,
-                      matrix::Csr<ValueType, IndexType>* inverse_mtx,
-                      IndexType* excess_rhs_ptrs, IndexType* excess_nz_ptrs,
-                      Callable trs_solve) GKO_NOT_IMPLEMENTED;
+/**
+ * @internal
+ *
+ * This kernel supports at most `subwarp_size` (< `warp_size`) elements per
+ * row. If there are more elements, they are simply ignored. Only the first
+ * `subwarp_size` elements are considered both for the values and for the
+ * sparsity pattern.
+ */
template <int subwarp_size, typename ValueType, typename IndexType,
          typename Callable>
__dpct_inline__ void generic_generate(
    IndexType num_rows, const IndexType* __restrict__ m_row_ptrs,
    const IndexType* __restrict__ m_col_idxs,
    const ValueType* __restrict__ m_values,
    const IndexType* __restrict__ i_row_ptrs,
    const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values,
    IndexType* __restrict__ excess_rhs_sizes,
    IndexType* __restrict__ excess_nnz, Callable direct_solve,
    sycl::nd_item<3> item_ct1,
    UninitializedArray<ValueType, subwarp_size * subwarp_size *
                                      subwarps_per_block>* storage)
{
    static_assert(subwarp_size >= row_size_limit, "incompatible subwarp_size");
    const auto row =
        thread::get_subwarp_id_flat<subwarp_size, IndexType>(item_ct1);

    if (row >= num_rows) {
        return;
    }

    const auto i_row_begin = i_row_ptrs[row];
    const auto i_row_size = i_row_ptrs[row + 1] - i_row_begin;

    auto subwarp = group::tiled_partition<subwarp_size>(
        group::this_thread_block(item_ct1));
    const int local_id = subwarp.thread_rank();

    if (i_row_size > subwarp_size) {
        // defer long rows: store their nnz and number of matches
        IndexType count{};
        for (IndexType nz = 0; nz < i_row_size; ++nz) {
            auto col = i_col_idxs[i_row_begin + nz];
            auto m_row_begin = m_row_ptrs[col];
            auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
            // extract the sparse submatrix consisting of the entries whose
            // columns/rows match column indices from this row
            group_match<subwarp_size>(
                m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
                i_row_size, subwarp,
                [&](IndexType, IndexType, IndexType,
                    config::lane_mask_type matchmask,
                    bool) { count += popcnt(matchmask); });
        }
        // store the dim and nnz of this sparse block
        if (local_id == 0) {
            excess_rhs_sizes[row] = i_row_size;
            excess_nnz[row] = count;
        }
    } else {
        // handle short rows directly: no excess
        if (local_id == 0) {
            excess_rhs_sizes[row] = 0;
            excess_nnz[row] = 0;
        }

        // subwarp_size^2 storage per subwarp
        auto dense_system_ptr =
            *storage + (item_ct1.get_local_id(2) / subwarp_size) *
                           subwarp_size * subwarp_size;
        // row-major accessor
        auto dense_system = [&](IndexType row, IndexType col) -> ValueType& {
            return dense_system_ptr[row * subwarp_size + col];
        };

#pragma unroll
        for (int i = 0; i < subwarp_size; ++i) {
            dense_system(i, local_id) = zero<ValueType>();
        }

        subwarp.sync();

        IndexType rhs_one_idx{};
        for (IndexType nz = 0; nz < i_row_size; ++nz) {
            auto col = i_col_idxs[i_row_begin + nz];
            auto m_row_begin = m_row_ptrs[col];
            auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
            // extract the dense submatrix consisting of the entries whose
            // columns/rows match column indices from this row
            group_match<subwarp_size>(
                m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
                i_row_size, subwarp,
                [&](IndexType, IndexType m_idx, IndexType i_idx,
                    config::lane_mask_type, bool valid) {
                    rhs_one_idx += popcnt(subwarp.ballot(
                        valid && m_col_idxs[m_row_begin + m_idx] < row &&
                        col == row));
                    if (valid) {
                        dense_system(nz, i_idx) = m_values[m_row_begin + m_idx];
                    }
                });
        }

        subwarp.sync();

        // Now, read a full col of `dense_system` into local registers, which
        // will be row elements after this (implicit) transpose
        ValueType local_row[subwarp_size];
#pragma unroll
        for (int i = 0; i < subwarp_size; ++i) {
            local_row[i] = dense_system(i, local_id);
        }

        const auto rhs =
            direct_solve(i_row_size, local_row, subwarp, rhs_one_idx);

        // Write back:
        if (local_id < i_row_size) {
            const auto idx = i_row_begin + local_id;
            if (is_finite(rhs)) {
                i_values[idx] = rhs;
            } else {
                i_values[idx] = i_col_idxs[idx] == row ? one<ValueType>()
                                                       : zero<ValueType>();
            }
        }
    }
}


template <int subwarp_size, typename ValueType, typename IndexType>
void generate_l_inverse(
    IndexType num_rows, const IndexType* __restrict__ m_row_ptrs,
    const IndexType* __restrict__ m_col_idxs,
    const ValueType* __restrict__ m_values,
    const IndexType* __restrict__ i_row_ptrs,
    const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values,
    IndexType* __restrict__ excess_rhs_sizes,
    IndexType* __restrict__ excess_nnz, sycl::nd_item<3> item_ct1,
    UninitializedArray<ValueType, subwarp_size * subwarp_size *
                                      subwarps_per_block>* storage)
{
    auto trs_solve =
        [](IndexType num_elems, const ValueType* __restrict__ local_row,
           group::thread_block_tile<subwarp_size>& subwarp, size_type) {
            const int local_id = subwarp.thread_rank();
            ValueType rhs = local_id == num_elems - 1 ? one<ValueType>()
                                                      : zero<ValueType>();
            // Solve Triangular system
            for (int d_col = num_elems - 1; d_col >= 0; --d_col) {
                const auto elem = local_row[d_col];
                if (d_col == local_id) {
                    rhs /= elem;
                }

                const ValueType bot = subwarp.shfl(rhs, d_col);
                if (local_id < d_col) {
                    rhs -= bot * elem;
                }
            }

            return rhs;
        };

    generic_generate<subwarp_size>(
        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
        i_values, excess_rhs_sizes, excess_nnz, trs_solve, item_ct1, storage);
}

template <int subwarp_size, typename ValueType, typename IndexType>
void generate_l_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                        sycl::queue* queue, IndexType num_rows,
                        const IndexType* m_row_ptrs,
                        const IndexType* m_col_idxs, const ValueType* m_values,
                        const IndexType* i_row_ptrs,
                        const IndexType* i_col_idxs, ValueType* i_values,
                        IndexType* excess_rhs_sizes, IndexType* excess_nnz)
{
    queue->submit([&](sycl::handler& cgh) {
        sycl::accessor<
            UninitializedArray<ValueType, subwarp_size * subwarp_size *
                                              subwarps_per_block>,
            0, sycl::access_mode::read_write, sycl::access::target::local>
            storage_acc_ct1(cgh);

        cgh.parallel_for(
            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
                generate_l_inverse<subwarp_size>(
                    num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs,
                    i_col_idxs, i_values, excess_rhs_sizes, excess_nnz,
                    item_ct1, storage_acc_ct1.get_pointer().get());
            });
    });
}
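
// The trs_solve lambda above performs a subwarp-cooperative substitution:
// thread t holds column t of the dense block D (after the implicit transpose)
// and computes the t-th entry of the solution of x^T D = e_n^T, column by
// column from the back. A sequential analogue (hypothetical host-side sketch):
//
//     // mat is n x n, row-major, lower triangular
//     void lower_trs_ref(int n, const double* mat, double* x)
//     {
//         for (int i = 0; i < n; ++i) {
//             x[i] = (i == n - 1) ? 1.0 : 0.0;  // last unit vector as rhs
//         }
//         for (int col = n - 1; col >= 0; --col) {
//             x[col] /= mat[col * n + col];
//             for (int row = 0; row < col; ++row) {
//                 x[row] -= x[col] * mat[col * n + row];  // transposed access
//             }
//         }
//     }
//
// For the 2x2 block [[2, 0], [1, 4]] this yields x = [-0.125, 0.25], which
// satisfies x^T * L = [0, 1].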
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_u_inverse(
+    IndexType num_rows, const IndexType* __restrict__ m_row_ptrs,
+    const IndexType* __restrict__ m_col_idxs,
+    const ValueType* __restrict__ m_values,
+    const IndexType* __restrict__ i_row_ptrs,
+    const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values,
+    IndexType* __restrict__ excess_rhs_sizes,
+    IndexType* __restrict__ excess_nnz, sycl::nd_item<3> item_ct1,
+    UninitializedArray<ValueType, subwarp_size * subwarp_size>* storage)
+{
+    auto trs_solve = [](IndexType num_elems,
+                        const ValueType* __restrict__ local_row,
+                        group::thread_block_tile<subwarp_size>& subwarp,
+                        size_type) {
+        const int local_id = subwarp.thread_rank();
+        ValueType rhs =
+            local_id == 0 ? one<ValueType>() : zero<ValueType>();
+        // Solve Triangular system
+        for (int d_col = 0; d_col < num_elems; ++d_col) {
+            const auto elem = local_row[d_col];
+            if (d_col == local_id) {
+                rhs /= elem;
+            }
+
+            const ValueType top = subwarp.shfl(rhs, d_col);
+            if (d_col < local_id) {
+                rhs -= top * elem;
+            }
+        }
+
+        return rhs;
+    };
+
+    generic_generate<subwarp_size>(
+        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
+        i_values, excess_rhs_sizes, excess_nnz, trs_solve, item_ct1, storage);
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_u_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory,
+                        sycl::queue* queue, IndexType num_rows,
+                        const IndexType* m_row_ptrs,
+                        const IndexType* m_col_idxs, const ValueType* m_values,
+                        const IndexType* i_row_ptrs,
+                        const IndexType* i_col_idxs, ValueType* i_values,
+                        IndexType* excess_rhs_sizes, IndexType* excess_nnz)
+{
+    queue->submit([&](sycl::handler& cgh) {
+        sycl::accessor<
+            UninitializedArray<ValueType, subwarp_size * subwarp_size>,
+            0, sycl::access_mode::read_write, sycl::access::target::local>
+            storage_acc_ct1(cgh);
+
+        cgh.parallel_for(
+            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                generate_u_inverse<subwarp_size>(
+                    num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs,
+                    i_col_idxs, i_values, excess_rhs_sizes, excess_nnz,
+                    item_ct1, storage_acc_ct1.get_pointer().get());
+            });
+    });
+}
+
+
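These device-side launchers are sized by the host wrappers near the end of this file, one subwarp per row; schematically (a sketch using the ceildiv, default_block_size and subwarp_size constants defined earlier in the file, and already including the empty-grid guard that the second commit in this series introduces):

    // Host-side launch sizing (sketch): one subwarp of subwarp_size threads
    // per matrix row, subwarps_per_block of them per work-group.
    const auto block = default_block_size;
    const auto grid = ceildiv(num_rows, block / subwarp_size);
    if (grid > 0) {  // skip the launch entirely for empty matrices
        kernel::generate_u_inverse<subwarp_size>(
            grid, block, 0, exec->get_queue(), num_rows,
            /* CSR arrays of the input matrix and its inverse ... */);
    }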
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_general_inverse(
+    IndexType num_rows, const IndexType* __restrict__ m_row_ptrs,
+    const IndexType* __restrict__ m_col_idxs,
+    const ValueType* __restrict__ m_values,
+    const IndexType* __restrict__ i_row_ptrs,
+    const IndexType* __restrict__ i_col_idxs, ValueType* __restrict__ i_values,
+    IndexType* __restrict__ excess_rhs_sizes,
+    IndexType* __restrict__ excess_nnz, bool spd, sycl::nd_item<3> item_ct1,
+    UninitializedArray<ValueType, subwarp_size * subwarp_size>* storage)
+{
+    auto general_solve = [spd](IndexType num_elems,
+                               ValueType* __restrict__ local_row,
+                               group::thread_block_tile<subwarp_size>& subwarp,
+                               size_type rhs_one_idx) {
+        const int local_id = subwarp.thread_rank();
+        ValueType rhs =
+            local_id == rhs_one_idx ? one<ValueType>() : zero<ValueType>();
+        size_type perm = local_id;
+        auto pivoted = subwarp.thread_rank() >= num_elems;
+        auto status = true;
+        for (size_type i = 0; i < num_elems; i++) {
+            const auto piv = choose_pivot(subwarp, local_row[i], pivoted);
+            if (local_id == piv) {
+                pivoted = true;
+            }
+            if (local_id == i) {
+                perm = piv;
+            }
+
+            apply_gauss_jordan_transform_with_rhs<subwarp_size>(
+                subwarp, piv, i, local_row, &rhs, status);
+        }
+
+        ValueType sol = subwarp.shfl(rhs, perm);
+
+        if (spd) {
+            auto diag = subwarp.shfl(sol, num_elems - 1);
+            sol /= std::sqrt(diag);
+        }
+
+        return sol;
+    };
+
+    generic_generate<subwarp_size>(
+        num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs,
+        i_values, excess_rhs_sizes, excess_nnz, general_solve, item_ct1,
+        storage);
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_general_inverse(
+    dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue,
+    IndexType num_rows, const IndexType* m_row_ptrs,
+    const IndexType* m_col_idxs, const ValueType* m_values,
+    const IndexType* i_row_ptrs, const IndexType* i_col_idxs,
+    ValueType* i_values, IndexType* excess_rhs_sizes, IndexType* excess_nnz,
+    bool spd)
+{
+    queue->submit([&](sycl::handler& cgh) {
+        sycl::accessor<
+            UninitializedArray<ValueType, subwarp_size * subwarp_size>,
+            0, sycl::access_mode::read_write, sycl::access::target::local>
+            storage_acc_ct1(cgh);
+
+        cgh.parallel_for(
+            sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+                generate_general_inverse<subwarp_size>(
+                    num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs,
+                    i_col_idxs, i_values, excess_rhs_sizes, excess_nnz, spd,
+                    item_ct1, storage_acc_ct1.get_pointer().get());
+            });
+    });
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_excess_system(IndexType num_rows,
+                            const IndexType* __restrict__ m_row_ptrs,
+                            const IndexType* __restrict__ m_col_idxs,
+                            const ValueType* __restrict__ m_values,
+                            const IndexType* __restrict__ i_row_ptrs,
+                            const IndexType* __restrict__ i_col_idxs,
+                            const IndexType* __restrict__ excess_rhs_ptrs,
+                            const IndexType* __restrict__ excess_nz_ptrs,
+                            IndexType* __restrict__ excess_row_ptrs,
+                            IndexType* __restrict__ excess_col_idxs,
+                            ValueType* __restrict__ excess_values,
+                            ValueType* __restrict__ excess_rhs,
+                            size_type e_start, size_type e_end,
+                            sycl::nd_item<3> item_ct1)
+{
+    const auto row =
+        thread::get_subwarp_id_flat<subwarp_size, IndexType>(item_ct1) +
+        e_start;
+
+    if (row >= e_end) {
+        return;
+    }
+
+    const auto i_row_begin = i_row_ptrs[row];
+    const auto i_row_size = i_row_ptrs[row + 1] - i_row_begin;
+
+    auto subwarp = group::tiled_partition<subwarp_size>(
+        group::this_thread_block(item_ct1));
+    const int local_id = subwarp.thread_rank();
+    const auto prefix_mask = (config::lane_mask_type{1} << local_id) - 1;
+
+    if (row == e_start && local_id == 0) {
+        excess_row_ptrs[0] = 0;
+    }
+
+    if (i_row_size <= subwarp_size) {
+        return;
+    }
+
+    auto excess_rhs_begin = excess_rhs_ptrs[row];
+    auto excess_nz_begin = excess_nz_ptrs[row];
+
+    auto out_nz_begin = excess_nz_begin - excess_nz_ptrs[e_start];
+    auto out_ptrs_begin = excess_rhs_begin - excess_rhs_ptrs[e_start];
+
+    // defer long rows: store their nnz and number of matches
+    for (IndexType nz = 0; nz < i_row_size; ++nz) {
+        auto col = i_col_idxs[i_row_begin + nz];
+        auto m_row_begin = m_row_ptrs[col];
+        auto m_row_size = m_row_ptrs[col + 1] - m_row_begin;
+        // extract the sparse submatrix consisting of the entries whose
+        // columns/rows match column indices from this row
+        group_match<subwarp_size>(
+            m_col_idxs + m_row_begin, m_row_size, i_col_idxs + i_row_begin,
+            i_row_size, subwarp,
+            [&](IndexType col, IndexType m_idx, IndexType i_idx,
+                config::lane_mask_type mask, bool valid) {
+                // dense_system(nz, i_idx) = m_values[m_row_begin + m_idx]
+                // only in sparse :)
+                if (valid) {
+                    auto nz = out_nz_begin + popcnt(mask & prefix_mask);
+                    excess_col_idxs[nz] = out_ptrs_begin + i_idx;
+                    excess_values[nz] = m_values[m_row_begin + m_idx];
+                }
+                out_nz_begin += popcnt(mask);
+            });
+        if (local_id == 0) {
+            // build right-hand side: 1 for diagonal entry, 0 else
+            excess_rhs[out_ptrs_begin + nz] =
+                row == col ? one<ValueType>() : zero<ValueType>();
+            // store row pointers
+            excess_row_ptrs[out_ptrs_begin + nz + 1] = out_nz_begin;
+        }
+    }
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void generate_excess_system(
+    dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue,
+    IndexType num_rows, const IndexType* m_row_ptrs,
+    const IndexType* m_col_idxs, const ValueType* m_values,
+    const IndexType* i_row_ptrs, const IndexType* i_col_idxs,
+    const IndexType* excess_rhs_ptrs, const IndexType* excess_nz_ptrs,
+    IndexType* excess_row_ptrs, IndexType* excess_col_idxs,
+    ValueType* excess_values, ValueType* excess_rhs, size_type e_start,
+    size_type e_end)
+{
+    queue->parallel_for(
+        sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+            generate_excess_system<subwarp_size>(
+                num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs,
+                i_col_idxs, excess_rhs_ptrs, excess_nz_ptrs, excess_row_ptrs,
+                excess_col_idxs, excess_values, excess_rhs, e_start, e_end,
+                item_ct1);
+        });
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void scale_excess_solution(const IndexType* __restrict__ excess_block_ptrs,
+                           ValueType* __restrict__ excess_solution,
+                           size_type e_start, size_type e_end,
+                           sycl::nd_item<3> item_ct1)
+{
+    const auto warp_id =
+        thread::get_subwarp_id_flat<subwarp_size, IndexType>(item_ct1);
+    auto subwarp = group::tiled_partition<subwarp_size>(
+        group::this_thread_block(item_ct1));
+    const int local_id = subwarp.thread_rank();
+    const auto row = warp_id + e_start;
+
+    if (row >= e_end) {
+        return;
+    }
+
+    const IndexType offset = excess_block_ptrs[e_start];
+    const IndexType block_begin = excess_block_ptrs[row] - offset;
+    const IndexType block_end = excess_block_ptrs[row + 1] - offset;
+    if (block_end == block_begin) {
+        return;
+    }
+    const auto diag = excess_solution[block_end - 1];
+    const ValueType scal = one<ValueType>() / std::sqrt(diag);
+
+    for (size_type i = block_begin + local_id; i < block_end;
+         i += subwarp_size) {
+        excess_solution[i] *= scal;
+    }
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void scale_excess_solution(dim3 grid, dim3 block,
+                           size_type dynamic_shared_memory, sycl::queue* queue,
+                           const IndexType* excess_block_ptrs,
+                           ValueType* excess_solution, size_type e_start,
+                           size_type e_end)
+{
+    queue->parallel_for(
+        sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
+            scale_excess_solution<subwarp_size>(
+                excess_block_ptrs, excess_solution, e_start, e_end, item_ct1);
+        });
+}
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void copy_excess_solution(IndexType num_rows,
+                          const IndexType* __restrict__ i_row_ptrs,
+                          const IndexType* __restrict__ excess_rhs_ptrs,
+                          const ValueType* __restrict__ excess_solution,
+                          ValueType* __restrict__ i_values, size_type e_start,
+                          size_type e_end, sycl::nd_item<3> item_ct1)
+{
+    const auto excess_row =
+        thread::get_subwarp_id_flat<subwarp_size, IndexType>(item_ct1);
+    const auto row = excess_row + e_start;
+
+    if (row >= e_end) {
+        return;
+    }
+
+    auto local_id = item_ct1.get_local_id(2) % subwarp_size;
+
+    const auto i_row_begin = i_row_ptrs[row];
+
+    const auto excess_begin = excess_rhs_ptrs[row];
+    const auto excess_size = excess_rhs_ptrs[row + 1] - excess_begin;
+
+    // if it was handled separately:
+    if (excess_size > 0) {
+        // copy the values for this row
+        for (IndexType nz = local_id; nz < excess_size; nz += subwarp_size) {
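+            // lanes copy this row's entries in a subwarp-strided fashion;
+            // subtracting excess_rhs_ptrs[e_start] rebases the solution
+            // offset from the global numbering to this batch's block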
i_values[nz + i_row_begin] = + excess_solution[nz + excess_begin - excess_rhs_ptrs[e_start]]; + } + } +} + +template +void copy_excess_solution(dim3 grid, dim3 block, + size_type dynamic_shared_memory, sycl::queue* queue, + IndexType num_rows, const IndexType* i_row_ptrs, + const IndexType* excess_rhs_ptrs, + const ValueType* excess_solution, ValueType* i_values, + size_type e_start, size_type e_end) +{ + queue->parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + copy_excess_solution( + num_rows, i_row_ptrs, excess_rhs_ptrs, excess_solution, + i_values, e_start, e_end, item_ct1); + }); +} + + +} // namespace kernel template void generate_tri_inverse(std::shared_ptr exec, - const matrix::Csr* mtx, - matrix::Csr* inverse_mtx, + const matrix::Csr* input, + matrix::Csr* inverse, IndexType* excess_rhs_ptrs, IndexType* excess_nz_ptrs, - bool lower) GKO_NOT_IMPLEMENTED; + bool lower) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + if (lower) { + kernel::generate_l_inverse( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_row_ptrs(), + inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, + excess_nz_ptrs); + } else { + kernel::generate_u_inverse( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_row_ptrs(), + inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, + excess_nz_ptrs); + } + components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); + components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_GENERATE_TRI_INVERSE_KERNEL); @@ -91,46 +656,87 @@ void generate_general_inverse(std::shared_ptr exec, const matrix::Csr* input, matrix::Csr* inverse, IndexType* excess_rhs_ptrs, - IndexType* excess_nz_ptrs, - bool spd) GKO_NOT_IMPLEMENTED; + IndexType* excess_nz_ptrs, bool spd) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); + kernel::generate_general_inverse( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_row_ptrs(), + inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, + excess_nz_ptrs, spd); + components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); + components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_GENERATE_GENERAL_INVERSE_KERNEL); template -void generate_excess_system(std::shared_ptr, +void generate_excess_system(std::shared_ptr exec, const matrix::Csr* input, const matrix::Csr* inverse, const IndexType* excess_rhs_ptrs, const IndexType* excess_nz_ptrs, matrix::Csr* excess_system, matrix::Dense* excess_rhs, - size_type e_start, - size_type e_end) GKO_NOT_IMPLEMENTED; + size_type e_start, size_type e_end) +{ + const auto num_rows = input->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + kernel::generate_excess_system( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), 
input->get_const_col_idxs(), + input->get_const_values(), inverse->get_const_row_ptrs(), + inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs, + excess_system->get_row_ptrs(), excess_system->get_col_idxs(), + excess_system->get_values(), excess_rhs->get_values(), e_start, e_end); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_GENERATE_EXCESS_SYSTEM_KERNEL); template -void scale_excess_solution(std::shared_ptr, +void scale_excess_solution(std::shared_ptr exec, const IndexType* excess_block_ptrs, matrix::Dense* excess_solution, - size_type e_start, - size_type e_end) GKO_NOT_IMPLEMENTED; + size_type e_start, size_type e_end) +{ + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + kernel::scale_excess_solution( + grid, block, 0, exec->get_queue(), excess_block_ptrs, + excess_solution->get_values(), e_start, e_end); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_SCALE_EXCESS_SOLUTION_KERNEL); template -void scatter_excess_solution(std::shared_ptr, - const IndexType* excess_block_ptrs, +void scatter_excess_solution(std::shared_ptr exec, + const IndexType* excess_rhs_ptrs, const matrix::Dense* excess_solution, matrix::Csr* inverse, - size_type e_start, - size_type e_end) GKO_NOT_IMPLEMENTED; + size_type e_start, size_type e_end) +{ + const auto num_rows = inverse->get_size()[0]; + + const dim3 block(default_block_size, 1, 1); + const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); + kernel::copy_excess_solution( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + inverse->get_const_row_ptrs(), excess_rhs_ptrs, + excess_solution->get_const_values(), inverse->get_values(), e_start, + e_end); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_ISAI_SCATTER_EXCESS_SOLUTION_KERNEL); diff --git a/dpcpp/test/CMakeLists.txt b/dpcpp/test/CMakeLists.txt index 8ccd05b0518..7487a520218 100644 --- a/dpcpp/test/CMakeLists.txt +++ b/dpcpp/test/CMakeLists.txt @@ -4,5 +4,6 @@ set(GINKGO_COMPILING_DPCPP_TEST ON) add_subdirectory(base) add_subdirectory(components) add_subdirectory(matrix) +add_subdirectory(preconditioner) add_subdirectory(solver) add_subdirectory(stop) diff --git a/dpcpp/test/preconditioner/CMakeLists.txt b/dpcpp/test/preconditioner/CMakeLists.txt new file mode 100644 index 00000000000..5f5c09e3129 --- /dev/null +++ b/dpcpp/test/preconditioner/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(isai_kernels) \ No newline at end of file diff --git a/dpcpp/test/preconditioner/isai_kernels.cpp b/dpcpp/test/preconditioner/isai_kernels.cpp new file mode 100644 index 00000000000..73664274efe --- /dev/null +++ b/dpcpp/test/preconditioner/isai_kernels.cpp @@ -0,0 +1,683 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include "core/preconditioner/isai_kernels.hpp" +#include "core/test/utils.hpp" +#include "dpcpp/base/config.hpp" +#include "matrices/config.hpp" + + +namespace { + + +enum struct matrix_type { lower, upper, general, spd }; + + +class Isai : public ::testing::Test { +protected: +#if GINKGO_DPCPP_SINGLE_MODE + using value_type = float; +#else + using value_type = double; +#endif + using index_type = gko::int32; + using Csr = gko::matrix::Csr; + using Dense = gko::matrix::Dense; + Isai() : rand_engine(42) {} + + void SetUp() + { + ASSERT_GT(gko::DpcppExecutor::get_num_devices("all"), 0); + ref = gko::ReferenceExecutor::create(); + dpcpp = gko::DpcppExecutor::create(0, ref); + } + + std::unique_ptr clone_allocations(const Csr* csr_mtx) + { + if (csr_mtx->get_executor() != ref) { + return {nullptr}; + } + const auto num_elems = csr_mtx->get_num_stored_elements(); + auto sparsity = csr_mtx->clone(); + + // values are now filled with invalid data to catch potential errors + auto begin_values = sparsity->get_values(); + auto end_values = begin_values + num_elems; + std::fill(begin_values, end_values, -gko::one()); + return sparsity; + } + + void initialize_data(matrix_type type, gko::size_type n, + gko::size_type row_limit) + { + const bool for_lower_tm = type == matrix_type::lower; + auto nz_dist = std::uniform_int_distribution(1, row_limit); + auto val_dist = std::uniform_real_distribution(-1., 1.); + mtx = Csr::create(ref); + if (type == matrix_type::general) { + auto dense_mtx = gko::test::generate_random_matrix( + n, n, nz_dist, val_dist, rand_engine, ref, gko::dim<2>{n, n}); + ensure_diagonal(dense_mtx.get()); + mtx->copy_from(dense_mtx.get()); + } else if (type == matrix_type::spd) { + auto dense_mtx = gko::test::generate_random_band_matrix( + n, row_limit / 4, row_limit / 4, val_dist, rand_engine, ref, + gko::dim<2>{n, n}); + auto transp = gko::as(dense_mtx->transpose()); + auto spd_mtx = Dense::create(ref, gko::dim<2>{n, n}); + dense_mtx->apply(transp.get(), spd_mtx.get()); + mtx->copy_from(spd_mtx.get()); + } else { + mtx = gko::test::generate_random_triangular_matrix( + n, n, true, for_lower_tm, nz_dist, val_dist, rand_engine, ref, + gko::dim<2>{n, n}); + } + inverse = clone_allocations(mtx.get()); + + d_mtx = gko::clone(dpcpp, mtx); + d_inverse = gko::clone(dpcpp, inverse); + } + + template + std::unique_ptr read(const char* name) + { + std::ifstream 
mtxstream{std::string{gko::matrices::location_isai_mtxs} + + name}; + auto result = gko::read(mtxstream, ref); + // to avoid removing 0s, the matrices store 12345 instead + for (gko::size_type i = 0; i < result->get_num_stored_elements(); ++i) { + auto& val = result->get_values()[i]; + if (val == static_cast(12345.0)) { + val = 0; + } + } + return std::move(result); + } + + void ensure_diagonal(Dense* mtx) + { + for (int i = 0; i < mtx->get_size()[0]; ++i) { + mtx->at(i, i) = gko::one(); + } + } + + + std::shared_ptr ref; + std::shared_ptr dpcpp; + + std::default_random_engine rand_engine; + + std::unique_ptr mtx; + std::unique_ptr inverse; + + std::unique_ptr d_mtx; + std::unique_ptr d_inverse; +}; + + +TEST_F(Isai, DpcppIsaiGenerateLinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 536, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::dpcpp::isai::generate_tri_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateUinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 615, 31); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_tri_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateAinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 615, 15); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_general_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateSpdinverseShortIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 15); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::dpcpp::isai::generate_general_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, 
d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 15 * r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_EQ(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 554, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::kernels::dpcpp::isai::generate_tri_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + true); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 695, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_tri_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateAinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 695, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_general_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 10 * r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateSpdinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::Array da1(dpcpp, num_rows + 1); + auto da2 = da1; + + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::kernels::dpcpp::isai::generate_general_inverse( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_data(), da2.get_data(), + false); + + GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse); + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 10 * r::value); + GKO_ASSERT_ARRAY_EQ(a1, da1); + GKO_ASSERT_ARRAY_EQ(a2, da2); + ASSERT_GT(a1.get_const_data()[num_rows], 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateExcessLinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 518, 40); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + 
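+    // after the reference run, a1/a2 hold the prefix-summed excess RHS sizes
+    // and excess nnz counts, so their last entries give e_dim and e_nnz below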
gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateExcessUinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 673, 51); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateExcessAinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 100, 51); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiGenerateExcessSpdinverseLongIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 
64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[num_rows]; + auto e_nnz = a2.get_data()[num_rows]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiGeneratePartialExcessIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + gko::Array da2(dpcpp, a2); + auto e_dim = a1.get_data()[10] - a1.get_data()[5]; + auto e_nnz = a2.get_data()[10] - a2.get_data()[5]; + auto excess = Csr::create(ref, gko::dim<2>(e_dim, e_dim), e_nnz); + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + auto dexcess = Csr::create(dpcpp, gko::dim<2>(e_dim, e_dim), e_nnz); + auto de_rhs = Dense::create(dpcpp, gko::dim<2>(e_dim, 1)); + + gko::kernels::reference::isai::generate_excess_system( + ref, mtx.get(), inverse.get(), a1.get_const_data(), a2.get_const_data(), + excess.get(), e_rhs.get(), 5u, 10u); + gko::kernels::dpcpp::isai::generate_excess_system( + dpcpp, d_mtx.get(), d_inverse.get(), da1.get_const_data(), + da2.get_const_data(), dexcess.get(), de_rhs.get(), 5u, 10u); + + GKO_ASSERT_MTX_EQ_SPARSITY(excess, dexcess); + GKO_ASSERT_MTX_NEAR(excess, dexcess, 0); + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiScaleExcessSolutionIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scale_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scale_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), 0, num_rows); + + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); +} + + +TEST_F(Isai, DpcppIsaiScalePartialExcessSolutionIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + 
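+    // the reference kernel fills a1 with the (prefix-summed) excess block
+    // pointers that scale_excess_solution consumes below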
gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[10] - a1.get_data()[5]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + + gko::kernels::reference::isai::scale_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), 5u, 10u); + gko::kernels::dpcpp::isai::scale_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), 5u, 10u); + + GKO_ASSERT_MTX_NEAR(e_rhs, de_rhs, 0); +} + + +TEST_F(Isai, DpcppIsaiScatterExcessSolutionLIsEquivalentToRef) +{ + initialize_data(matrix_type::lower, 572, 52); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, + num_rows); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiScatterExcessSolutionUIsEquivalentToRef) +{ + initialize_data(matrix_type::upper, 702, 45); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_tri_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, + num_rows); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiScatterExcessSolutionAIsEquivalentToRef) +{ + initialize_data(matrix_type::general, 702, 45); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, + num_rows); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, 
DpcppIsaiScatterExcessSolutionSpdIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), false); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[num_rows]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 0, num_rows); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 0, + num_rows); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +TEST_F(Isai, DpcppIsaiScatterPartialExcessSolutionIsEquivalentToRef) +{ + initialize_data(matrix_type::spd, 100, 64); + const auto num_rows = mtx->get_size()[0]; + gko::Array a1(ref, num_rows + 1); + auto a2 = a1; + gko::kernels::reference::isai::generate_general_inverse( + ref, mtx.get(), inverse.get(), a1.get_data(), a2.get_data(), true); + gko::Array da1(dpcpp, a1); + auto e_dim = a1.get_data()[10] - a1.get_data()[5]; + auto e_rhs = Dense::create(ref, gko::dim<2>(e_dim, 1)); + std::fill_n(e_rhs->get_values(), e_dim, 123456); + auto de_rhs = gko::clone(dpcpp, e_rhs); + // overwrite -1 values with inverse + d_inverse->copy_from(lend(inverse)); + + gko::kernels::reference::isai::scatter_excess_solution( + ref, a1.get_const_data(), e_rhs.get(), inverse.get(), 5u, 10u); + gko::kernels::dpcpp::isai::scatter_excess_solution( + dpcpp, da1.get_const_data(), de_rhs.get(), d_inverse.get(), 5u, 10u); + + GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 0); + ASSERT_GT(e_dim, 0); +} + + +} // namespace diff --git a/hip/preconditioner/isai_kernels.hip.cpp b/hip/preconditioner/isai_kernels.hip.cpp index 77090304654..31d7140692d 100644 --- a/hip/preconditioner/isai_kernels.hip.cpp +++ b/hip/preconditioner/isai_kernels.hip.cpp @@ -46,11 +46,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/base/config.hip.hpp" #include "hip/base/math.hip.hpp" #include "hip/base/types.hip.hpp" +#include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" namespace gko { @@ -69,8 +71,6 @@ constexpr int subwarps_per_block{2}; constexpr int default_block_size{subwarps_per_block * subwarp_size}; -#include "common/cuda_hip/components/atomic.hpp.inc" -#include "common/cuda_hip/components/warp_blas.hpp.inc" #include "common/cuda_hip/preconditioner/isai_kernels.hpp.inc" From 26e391f3cb56bc036ff25784627d76d75c11a258 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Wed, 9 Feb 2022 15:31:34 +0100 Subject: [PATCH 2/6] use std for limit, fix ffs, kernel guard --- dev_tools/oneapi/convert_source.sh | 2 + dpcpp/base/math.hpp | 61 -------------- dpcpp/components/intrinsics.dp.hpp | 12 ++- dpcpp/components/merging.dp.hpp | 5 +- dpcpp/preconditioner/isai_kernels.dp.cpp | 101 +++++++++++++---------- 5 files changed, 70 insertions(+), 111 deletions(-) delete mode 100644 dpcpp/base/math.hpp diff --git a/dev_tools/oneapi/convert_source.sh b/dev_tools/oneapi/convert_source.sh index 209b9d7e8e6..fe55181a52f 100755 --- a/dev_tools/oneapi/convert_source.sh +++ b/dev_tools/oneapi/convert_source.sh @@ -301,6 +301,8 @@ replace_regex="${replace_regex};s|trick/thread_ids.hpp|dpcpp/components/thread_i replace_regex="${replace_regex};s|trick/cooperative_groups\.hpp|dpcpp/components/cooperative_groups.dp.hpp|g" replace_regex="${replace_regex};s|trick/sorting\.hpp|dpcpp/components/sorting.dp.hpp|g" replace_regex="${replace_regex};s|trick/reduction\.hpp|dpcpp/components/reduction.dp.hpp|g" +replace_regex="${replace_regex};s|cuda/base/math\.hpp|limits|g" +replace_regex="${replace_regex};s|device_numeric_limits(<.*>)::(.*)(,|;)|std::numeric_limits\1::\2()\3|g" replace_regex="${replace_regex};s/#define GET_QUEUE 0//g" replace_regex="${replace_regex};s/GET_QUEUE/exec->get_queue()/g" replace_regex="${replace_regex};s/cuda/dpcpp/g" diff --git a/dpcpp/base/math.hpp b/dpcpp/base/math.hpp deleted file mode 100644 index a9fa1a38a39..00000000000 --- a/dpcpp/base/math.hpp +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2021, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
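To make the new conversion rule above concrete: the added regex rewrites uses of the trait deleted here into the standard-library form, for example (the pair below is taken from the merging header updated later in this commit):

    auto sentinel = device_numeric_limits<IndexType>::max;    // before
    auto sentinel = std::numeric_limits<IndexType>::max();    // after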
-*************************************************************/ - -#ifndef GKO_DPCPP_BASE_MATH_HPP_ -#define GKO_DPCPP_BASE_MATH_HPP_ - - -#include - - -#include -#include - - -namespace gko { -namespace kernels { -namespace dpcpp { - - -template -struct device_numeric_limits { - static constexpr auto inf = std::numeric_limits::infinity(); - static constexpr auto max = std::numeric_limits::max(); - static constexpr auto min = std::numeric_limits::min(); -}; - - -} // namespace dpcpp -} // namespace kernels -} // namespace gko - -#endif // GKO_DPCPP_BASE_MATH_HPP_ diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp index c0e5a445120..2466666b37d 100644 --- a/dpcpp/components/intrinsics.dp.hpp +++ b/dpcpp/components/intrinsics.dp.hpp @@ -62,11 +62,19 @@ __dpct_inline__ int popcnt(uint64 mask) { return sycl::popcount(mask); } * @internal * Returns the (1-based!) index of the first set bit in the given mask, * starting from the least significant bit. + * + * @note can not use length - clz because subgroup size is not always 32 or 64 */ -__dpct_inline__ int ffs(uint32 mask) { return 32 - sycl::clz(mask); } +__dpct_inline__ int ffs(uint32 mask) +{ + return (mask == 0) ? 0 : (sycl::ext::intel::ctz(mask) + 1); +} /** @copydoc ffs */ -__dpct_inline__ int ffs(uint64 mask) { return 64 - sycl::clz(mask); } +__dpct_inline__ int ffs(uint64 mask) +{ + return (mask == 0) ? 0 : (sycl::ext::intel::ctz(mask) + 1); +} /** diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp index 67ee87f1b60..5303b11ea0f 100644 --- a/dpcpp/components/merging.dp.hpp +++ b/dpcpp/components/merging.dp.hpp @@ -42,7 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/utils.hpp" #include "dpcpp/base/dpct.hpp" -#include "dpcpp/base/math.hpp" #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/searching.dp.hpp" @@ -165,7 +164,7 @@ __dpct_inline__ void group_merge(const ValueType* __restrict__ a, IndexType a_begin{}; IndexType b_begin{}; auto lane = static_cast(group.thread_rank()); - auto sentinel = device_numeric_limits::max; + auto sentinel = std::numeric_limits::max(); auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel); auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel); for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) { @@ -275,7 +274,7 @@ __dpct_inline__ void sequential_merge(const ValueType* __restrict__ a, auto c_size = a_size + b_size; IndexType a_begin{}; IndexType b_begin{}; - auto sentinel = device_numeric_limits::max; + auto sentinel = std::numeric_limits::max(); auto a_cur = checked_load(a, a_begin, a_size, sentinel); auto b_cur = checked_load(b, b_begin, b_size, sentinel); for (IndexType c_begin{}; c_begin < c_size; c_begin++) { diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp index ef4a64c59e2..a6548d77aab 100644 --- a/dpcpp/preconditioner/isai_kernels.dp.cpp +++ b/dpcpp/preconditioner/isai_kernels.dp.cpp @@ -626,22 +626,24 @@ void generate_tri_inverse(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); - if (lower) { - kernel::generate_l_inverse( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - input->get_const_values(), inverse->get_row_ptrs(), - inverse->get_col_idxs(), inverse->get_values(), 
excess_rhs_ptrs, - excess_nz_ptrs); - } else { - kernel::generate_u_inverse( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - input->get_const_values(), inverse->get_row_ptrs(), - inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, - excess_nz_ptrs); + const auto block = default_block_size; + const auto grid = ceildiv(num_rows, block / subwarp_size); + if (grid > 0) { + if (lower) { + kernel::generate_l_inverse( + grid, block, 0, exec->get_queue(), + static_cast(num_rows), input->get_const_row_ptrs(), + input->get_const_col_idxs(), input->get_const_values(), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + inverse->get_values(), excess_rhs_ptrs, excess_nz_ptrs); + } else { + kernel::generate_u_inverse( + grid, block, 0, exec->get_queue(), + static_cast(num_rows), input->get_const_row_ptrs(), + input->get_const_col_idxs(), input->get_const_values(), + inverse->get_row_ptrs(), inverse->get_col_idxs(), + inverse->get_values(), excess_rhs_ptrs, excess_nz_ptrs); + } } components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); @@ -660,14 +662,16 @@ void generate_general_inverse(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(num_rows, block.x / subwarp_size), 1, 1); - kernel::generate_general_inverse( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - input->get_const_values(), inverse->get_row_ptrs(), - inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, - excess_nz_ptrs, spd); + const auto block = default_block_size; + const auto grid = ceildiv(num_rows, block / subwarp_size); + if (grid > 0) { + kernel::generate_general_inverse( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_row_ptrs(), + inverse->get_col_idxs(), inverse->get_values(), excess_rhs_ptrs, + excess_nz_ptrs, spd); + } components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1); components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1); } @@ -688,15 +692,18 @@ void generate_excess_system(std::shared_ptr exec, { const auto num_rows = input->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::generate_excess_system( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - input->get_const_row_ptrs(), input->get_const_col_idxs(), - input->get_const_values(), inverse->get_const_row_ptrs(), - inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs, - excess_system->get_row_ptrs(), excess_system->get_col_idxs(), - excess_system->get_values(), excess_rhs->get_values(), e_start, e_end); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { + kernel::generate_excess_system( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + input->get_const_row_ptrs(), input->get_const_col_idxs(), + input->get_const_values(), inverse->get_const_row_ptrs(), + inverse->get_const_col_idxs(), excess_rhs_ptrs, excess_nz_ptrs, + excess_system->get_row_ptrs(), excess_system->get_col_idxs(), + excess_system->get_values(), excess_rhs->get_values(), e_start, + e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -709,11 +716,13 @@ 
void scale_excess_solution(std::shared_ptr exec, matrix::Dense* excess_solution, size_type e_start, size_type e_end) { - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::scale_excess_solution( - grid, block, 0, exec->get_queue(), excess_block_ptrs, - excess_solution->get_values(), e_start, e_end); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { + kernel::scale_excess_solution( + grid, block, 0, exec->get_queue(), excess_block_ptrs, + excess_solution->get_values(), e_start, e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -729,13 +738,15 @@ void scatter_excess_solution(std::shared_ptr exec, { const auto num_rows = inverse->get_size()[0]; - const dim3 block(default_block_size, 1, 1); - const dim3 grid(ceildiv(e_end - e_start, block.x / subwarp_size), 1, 1); - kernel::copy_excess_solution( - grid, block, 0, exec->get_queue(), static_cast(num_rows), - inverse->get_const_row_ptrs(), excess_rhs_ptrs, - excess_solution->get_const_values(), inverse->get_values(), e_start, - e_end); + const auto block = default_block_size; + const auto grid = ceildiv(e_end - e_start, block / subwarp_size); + if (grid > 0) { + kernel::copy_excess_solution( + grid, block, 0, exec->get_queue(), static_cast(num_rows), + inverse->get_const_row_ptrs(), excess_rhs_ptrs, + excess_solution->get_const_values(), inverse->get_values(), e_start, + e_end); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( From 69ec659d6ee54f15d6847083ab715b4ac62e8808 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 7 Mar 2022 14:55:21 +0100 Subject: [PATCH 3/6] sycl 2020 - reqd subgroup in parallel_for --- dpcpp/preconditioner/isai_kernels.dp.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp index a6548d77aab..1b6c33e4500 100644 --- a/dpcpp/preconditioner/isai_kernels.dp.cpp +++ b/dpcpp/preconditioner/isai_kernels.dp.cpp @@ -262,7 +262,9 @@ void generate_l_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory, storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { generate_l_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs, i_values, excess_rhs_sizes, excess_nnz, @@ -330,7 +332,9 @@ void generate_u_inverse(dim3 grid, dim3 block, size_type dynamic_shared_memory, storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { generate_u_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs, i_values, excess_rhs_sizes, excess_nnz, @@ -410,7 +414,9 @@ void generate_general_inverse( storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subwarp_size)]] { generate_general_inverse( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs, i_values, excess_rhs_sizes, excess_nnz, spd, @@ -509,7 +515,8 @@ void generate_excess_system( size_type e_end) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> 
item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { generate_excess_system( num_rows, m_row_ptrs, m_col_idxs, m_values, i_row_ptrs, i_col_idxs, excess_rhs_ptrs, excess_nz_ptrs, excess_row_ptrs, @@ -558,7 +565,8 @@ void scale_excess_solution(dim3 grid, dim3 block, size_type e_end) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { scale_excess_solution( excess_block_ptrs, excess_solution, e_start, e_end, item_ct1); }); @@ -606,7 +614,8 @@ void copy_excess_solution(dim3 grid, dim3 block, size_type e_start, size_type e_end) { queue->parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(subwarp_size)]] { copy_excess_solution( num_rows, i_row_ptrs, excess_rhs_ptrs, excess_solution, i_values, e_start, e_end, item_ct1); From c20b44d38c2d321b20c1eccd6a70f4bdeebc7b03 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 7 Mar 2022 14:59:14 +0100 Subject: [PATCH 4/6] update documentation Co-authored-by: Fritz Goebel --- common/cuda_hip/components/searching.hpp.inc | 24 ++++++++++---------- dpcpp/components/searching.dp.hpp | 24 ++++++++++---------- dpcpp/preconditioner/isai_kernels.dp.cpp | 2 -- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/common/cuda_hip/components/searching.hpp.inc b/common/cuda_hip/components/searching.hpp.inc index 37c41c8aab6..acc06a5d5a4 100644 --- a/common/cuda_hip/components/searching.hpp.inc +++ b/common/cuda_hip/components/searching.hpp.inc @@ -66,12 +66,12 @@ __forceinline__ __device__ IndexType binary_search(IndexType offset, * @internal * Generic implementation of a fixed-size binary search. * The implementation makes sure that the number of predicate evaluations only - * depends on `length` and not on the actual position of the partition point. - * It assumes that the predicate partitions the range [offset, offset + length) - * into two subranges [offset, middle), [middle, offset + length) such that - * the predicate is `false` for all elements in the first range and `true` for - * all elements in the second range. `middle` is called the partition point. - * If the predicate is `false` everywhere, `middle` equals `offset + length`. + * depends on `size` and not on the actual position of the partition point. + * It assumes that the predicate partitions the range [0, size) into two + * subranges [0, middle), [middle, size) such that the predicate is `false` for + * all elements in the first range and `true` for all elements in the second + * range. `middle` is called the partition point. + * If the predicate is `false` everywhere, `middle` equals `size`. * * @tparam size the length of the partitioned range - must be a power of two * @param p the predicate to be evaluated on the range - it should not have @@ -103,12 +103,12 @@ __forceinline__ __device__ int synchronous_fixed_binary_search(Predicate p) * @internal * Generic implementation of a synchronous binary search. * The implementation makes sure that the number of predicate evaluations only - * depends on `length` and not on the actual position of the partition point. 
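For illustration, one concrete instance of the contract described here: with size = 8, data = {1, 2, 3, 5, 7, 8, 9, 9} and the predicate p(i) = (data[i] >= 5), p is false on [0, 3) and true on [3, 8), so the search returns the partition point middle = 3; if p were false everywhere, it would return 8 (= size).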

From c20b44d38c2d321b20c1eccd6a70f4bdeebc7b03 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai"
Date: Mon, 7 Mar 2022 14:59:14 +0100
Subject: [PATCH 4/6] update documentation

Co-authored-by: Fritz Goebel
---
 common/cuda_hip/components/searching.hpp.inc | 24 ++++++++++----------
 dpcpp/components/searching.dp.hpp            | 24 ++++++++++----------
 dpcpp/preconditioner/isai_kernels.dp.cpp     |  2 --
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/common/cuda_hip/components/searching.hpp.inc b/common/cuda_hip/components/searching.hpp.inc
index 37c41c8aab6..acc06a5d5a4 100644
--- a/common/cuda_hip/components/searching.hpp.inc
+++ b/common/cuda_hip/components/searching.hpp.inc
@@ -66,12 +66,12 @@ __forceinline__ __device__ IndexType binary_search(IndexType offset,
  * @internal
  * Generic implementation of a fixed-size binary search.
  * The implementation makes sure that the number of predicate evaluations only
- * depends on `length` and not on the actual position of the partition point.
- * It assumes that the predicate partitions the range [offset, offset + length)
- * into two subranges [offset, middle), [middle, offset + length) such that
- * the predicate is `false` for all elements in the first range and `true` for
- * all elements in the second range. `middle` is called the partition point.
- * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false` for
+ * all elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
  *
  * @tparam size the length of the partitioned range - must be a power of two
  * @param p the predicate to be evaluated on the range - it should not have
@@ -103,12 +103,12 @@ __forceinline__ __device__ int synchronous_fixed_binary_search(Predicate p)
  * @internal
  * Generic implementation of a synchronous binary search.
  * The implementation makes sure that the number of predicate evaluations only
- * depends on `length` and not on the actual position of the partition point.
- * It assumes that the predicate partitions the range [offset, offset + length)
- * into two subranges [offset, middle), [middle, offset + length) such that
- * the predicate is `false` for all elements in the first range and `true` for
- * all elements in the second range. `middle` is called the partition point.
- * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false` for
+ * all elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
  *
  * @param size the length of the partitioned range - must be a power of two
  * @param p the predicate to be evaluated on the range - it should not have
diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp
index 4db6f1f7073..8a28579f6c5 100644
--- a/dpcpp/components/searching.dp.hpp
+++ b/dpcpp/components/searching.dp.hpp
@@ -82,12 +82,12 @@ __dpct_inline__ IndexType binary_search(IndexType offset, IndexType length,
  * @internal
  * Generic implementation of a fixed-size binary search.
  * The implementation makes sure that the number of predicate evaluations only
- * depends on `length` and not on the actual position of the partition point.
- * It assumes that the predicate partitions the range [offset, offset + length)
- * into two subranges [offset, middle), [middle, offset + length) such that
- * the predicate is `false` for all elements in the first range and `true` for
- * all elements in the second range. `middle` is called the partition point.
- * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false` for
+ * all elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
  *
  * @tparam size the length of the partitioned range - must be a power of two
  * @param p the predicate to be evaluated on the range - it should not have
@@ -119,12 +119,12 @@ __dpct_inline__ int synchronous_fixed_binary_search(Predicate p)
  * @internal
  * Generic implementation of a synchronous binary search.
  * The implementation makes sure that the number of predicate evaluations only
- * depends on `length` and not on the actual position of the partition point.
- * It assumes that the predicate partitions the range [offset, offset + length)
- * into two subranges [offset, middle), [middle, offset + length) such that
- * the predicate is `false` for all elements in the first range and `true` for
- * all elements in the second range. `middle` is called the partition point.
- * If the predicate is `false` everywhere, `middle` equals `offset + length`.
+ * depends on `size` and not on the actual position of the partition point.
+ * It assumes that the predicate partitions the range [0, size) into two
+ * subranges [0, middle), [middle, size) such that the predicate is `false` for
+ * all elements in the first range and `true` for all elements in the second
+ * range. `middle` is called the partition point.
+ * If the predicate is `false` everywhere, `middle` equals `size`.
  *
  * @param size the length of the partitioned range - must be a power of two
  * @param p the predicate to be evaluated on the range - it should not have
diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp
index 1b6c33e4500..be16e3f7070 100644
--- a/dpcpp/preconditioner/isai_kernels.dp.cpp
+++ b/dpcpp/preconditioner/isai_kernels.dp.cpp
@@ -139,8 +139,6 @@ __dpct_inline__ void generic_generate(
     }
 
     // subwarp_size^2 storage per subwarp
-
-
     auto dense_system_ptr =
         *storage + (item_ct1.get_local_id(2) / subwarp_size) * subwarp_size *
                        subwarp_size;
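
The rewritten comments describe the standard partition-point contract: the
predicate is `false` on `[0, middle)` and `true` on `[middle, size)`, and the
search returns `middle`, or `size` if the predicate is never `true`. Below is
a host-side sketch of that contract, not the warp-synchronous device
implementation; like the device code, it evaluates the predicate
`log2(size) + 1` times regardless of where the partition point lies, so `size`
must be a power of two.

    #include <cassert>

    // Returns the partition point of [0, size): the first index at which the
    // predicate is true, or size if the predicate is false everywhere.
    template <typename Predicate>
    int partition_point(int size, Predicate p)
    {
        int pos = 0;
        for (int step = size / 2; step > 0; step /= 2) {
            if (!p(pos + step)) {
                pos += step;  // the partition point lies beyond pos + step
            }
        }
        // pos is the last index of the false prefix, unless there is none
        return p(pos) ? pos : pos + 1;
    }

    int main()
    {
        const int data[] = {1, 3, 5, 8, 13, 21, 34, 55};
        // false for data[0..3], true for data[4..7] -> middle == 4
        assert(partition_point(8, [&](int i) { return data[i] >= 10; }) == 4);
        // predicate false everywhere -> middle == size
        assert(partition_point(8, [&](int i) { return data[i] > 100; }) == 8);
        return 0;
    }
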

From 3cd455066d3b0a2d99562c56dba27d5c790218b0 Mon Sep 17 00:00:00 2001
From: "Yuhsiang M. Tsai"
Date: Mon, 7 Mar 2022 17:15:04 +0100
Subject: [PATCH 5/6] cygwin -j2 for out of memory allocation issue

---
 .github/workflows/windows-cygwin.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/windows-cygwin.yml b/.github/workflows/windows-cygwin.yml
index 43b87414cf3..16c48aaf425 100644
--- a/.github/workflows/windows-cygwin.yml
+++ b/.github/workflows/windows-cygwin.yml
@@ -53,7 +53,7 @@ jobs:
         mkdir build
         cd build
         bash -c "cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_COMPILER_FLAGS=${{ matrix.config.cflags }} .."
-        bash -c "make -j4"
+        bash -c "make -j2"
         bash -c "ctest . --output-on-failure"
       shell: cmd
 

From a5b28d7732529cb107f2c7194ff8b7854f9e5716 Mon Sep 17 00:00:00 2001
From: ginkgo-bot
Date: Tue, 8 Mar 2022 09:05:09 +0000
Subject: [PATCH 6/6] Format files

Co-authored-by: Yuhsiang M. Tsai
---
 dpcpp/components/intrinsics.dp.hpp         | 2 +-
 dpcpp/components/merging.dp.hpp            | 2 +-
 dpcpp/components/searching.dp.hpp          | 2 +-
 dpcpp/components/warp_blas.dp.hpp          | 2 +-
 dpcpp/test/preconditioner/isai_kernels.cpp | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dpcpp/components/intrinsics.dp.hpp b/dpcpp/components/intrinsics.dp.hpp
index 2466666b37d..61a26e7555b 100644
--- a/dpcpp/components/intrinsics.dp.hpp
+++ b/dpcpp/components/intrinsics.dp.hpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/dpcpp/components/merging.dp.hpp b/dpcpp/components/merging.dp.hpp
index 5303b11ea0f..3b20b6fd5c6 100644
--- a/dpcpp/components/merging.dp.hpp
+++ b/dpcpp/components/merging.dp.hpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/dpcpp/components/searching.dp.hpp b/dpcpp/components/searching.dp.hpp
index 8a28579f6c5..df65f8ae93c 100644
--- a/dpcpp/components/searching.dp.hpp
+++ b/dpcpp/components/searching.dp.hpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/dpcpp/components/warp_blas.dp.hpp b/dpcpp/components/warp_blas.dp.hpp
index 7754cfb260d..b31fd0ef4fd 100644
--- a/dpcpp/components/warp_blas.dp.hpp
+++ b/dpcpp/components/warp_blas.dp.hpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/dpcpp/test/preconditioner/isai_kernels.cpp b/dpcpp/test/preconditioner/isai_kernels.cpp
index 73664274efe..fe166c13f59 100644
--- a/dpcpp/test/preconditioner/isai_kernels.cpp
+++ b/dpcpp/test/preconditioner/isai_kernels.cpp
@@ -1,5 +1,5 @@
 /*************************************************************
-Copyright (c) 2017-2021, the Ginkgo authors
+Copyright (c) 2017-2022, the Ginkgo authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without