From 74e2b3111406867ea60bf7c71195303d5147747d Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 31 Mar 2022 12:59:46 +0200
Subject: [PATCH 1/5] unify HIP/CUDA common kernels

---
 common/cuda_hip/base/kernel_launch.hpp.inc    |  82 +++
 .../base/kernel_launch_reduction.hpp.inc      | 507 ++++++++++++++++++
 .../base/kernel_launch_solver.hpp.inc         |  61 +++
 cuda/base/kernel_launch.cuh                   |  51 +-
 cuda/base/kernel_launch_reduction.cuh         | 477 +---------------
 cuda/base/kernel_launch_solver.cuh            |  30 +-
 hip/base/kernel_launch.hip.hpp                |  53 +-
 hip/base/kernel_launch_reduction.hip.hpp      | 488 +----------------
 hip/base/kernel_launch_solver.hip.hpp         |  32 +-
 9 files changed, 656 insertions(+), 1125 deletions(-)
 create mode 100644 common/cuda_hip/base/kernel_launch.hpp.inc
 create mode 100644 common/cuda_hip/base/kernel_launch_reduction.hpp.inc
 create mode 100644 common/cuda_hip/base/kernel_launch_solver.hpp.inc

diff --git a/common/cuda_hip/base/kernel_launch.hpp.inc b/common/cuda_hip/base/kernel_launch.hpp.inc
new file mode 100644
index 00000000000..e4c74769fcf
--- /dev/null
+++ b/common/cuda_hip/base/kernel_launch.hpp.inc
@@ -0,0 +1,82 @@
/*************************************************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

template <typename KernelFunction, typename... KernelArgs>
__global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
    int64 size, KernelFunction fn, KernelArgs... args)
{
    auto tidx = thread::get_thread_id_flat<int64>();
    if (tidx >= size) {
        return;
    }
    fn(tidx, args...);
}


template <typename KernelFunction, typename... KernelArgs>
__global__ __launch_bounds__(default_block_size) void generic_kernel_2d(
    int64 rows, int64 cols, KernelFunction fn, KernelArgs... args)
{
    auto tidx = thread::get_thread_id_flat<int64>();
    auto col = tidx % cols;
    auto row = tidx / cols;
    if (row >= rows) {
        return;
    }
    fn(row, col, args...);
}


template <typename KernelFunction, typename... KernelArgs>
void run_kernel(std::shared_ptr<const DefaultExecutor> exec, KernelFunction fn,
                size_type size, KernelArgs&&... args)
{
    if (size > 0) {
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size, block_size);
        generic_kernel_1d<<<num_blocks, block_size>>>(
            static_cast<int64>(size), fn, map_to_device(args)...);
    }
}
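
// A minimal usage sketch for the run_kernel helper above (illustrative only,
// not part of the patched sources): an AXPY-like update y += alpha * x over
// n elements. The GKO_KERNEL annotation for the device lambda is assumed to
// come from Ginkgo's unified kernel headers; alpha, x and y stand for raw
// device pointers, which map_to_device forwards unchanged.
//
//     run_kernel(
//         exec,
//         [] GKO_KERNEL(auto i, auto alpha, auto x, auto y) {
//             y[i] += alpha[0] * x[i];
//         },
//         n, alpha, x, y);
//
// Each thread handles one flat index i; generic_kernel_1d returns early for
// out-of-range threads, so the grid size can be rounded up to whole blocks.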

template <typename KernelFunction, typename... KernelArgs>
void run_kernel(std::shared_ptr<const DefaultExecutor> exec, KernelFunction fn,
                dim<2> size, KernelArgs&&... args)
{
    if (size[0] > 0 && size[1] > 0) {
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size[0] * size[1], block_size);
        generic_kernel_2d<<<num_blocks, block_size>>>(
            static_cast<int64>(size[0]), static_cast<int64>(size[1]), fn,
            map_to_device(args)...);
    }
}

diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
new file mode 100644
index 00000000000..28c9d8ba4fd
--- /dev/null
+++ b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
@@ -0,0 +1,507 @@
/*************************************************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

template <typename ValueType, typename KernelFunction, typename ReductionOp,
          typename FinalizeOp, typename... KernelArgs>
__global__ __launch_bounds__(
    default_block_size) void generic_kernel_reduction_1d(int64 size,
                                                         KernelFunction fn,
                                                         ReductionOp op,
                                                         FinalizeOp finalize,
                                                         ValueType identity,
                                                         ValueType* storage,
                                                         KernelArgs... args)
{
    __shared__
        UninitializedArray<ValueType, default_block_size / config::warp_size>
            warp_partial;
    static_assert(default_block_size / config::warp_size <= config::warp_size,
                  "needs third reduction level");
    auto tidx = thread::get_thread_id_flat<int64>();
    auto grid_size = thread::get_thread_num_flat<int64>();
    auto warp =
        group::tiled_partition<config::warp_size>(group::this_thread_block());
    auto partial = identity;
    for (int64 i = tidx; i < size; i += grid_size) {
        partial = op(partial, fn(i, args...));
    }
    partial = reduce(warp, partial, op);
    if (warp.thread_rank() == 0) {
        warp_partial[threadIdx.x / config::warp_size] = partial;
    }
    __syncthreads();
    if (threadIdx.x < config::warp_size) {
        partial = reduce(warp,
                         threadIdx.x < default_block_size / config::warp_size
                             ? warp_partial[threadIdx.x]
                             : identity,
                         op);
        if (threadIdx.x == 0) {
            storage[blockIdx.x] = finalize(partial);
        }
    }
}
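
// Shape of the reduction above (and of its 2D counterpart below), as a
// step-by-step sketch:
// 1. thread level: each thread folds its grid-strided share of the input
//    into `partial` via `op`;
// 2. warp level: reduce(warp, partial, op) combines the partials of a warp,
//    and lane 0 stores the warp result to the shared array `warp_partial`;
// 3. block level: after __syncthreads(), the first warp reduces
//    `warp_partial`, and thread 0 writes finalize(partial) to
//    storage[blockIdx.x], leaving one value per block.
// The static_assert above guarantees that the per-block warp count fits
// into a single warp, so these two in-block levels are sufficient.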


template <typename ValueType, typename KernelFunction, typename ReductionOp,
          typename FinalizeOp, typename... KernelArgs>
__global__ __launch_bounds__(
    default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols,
                                                         KernelFunction fn,
                                                         ReductionOp op,
                                                         FinalizeOp finalize,
                                                         ValueType identity,
                                                         ValueType* storage,
                                                         KernelArgs... args)
{
    __shared__
        UninitializedArray<ValueType, default_block_size / config::warp_size>
            warp_partial;
    static_assert(default_block_size / config::warp_size <= config::warp_size,
                  "needs third reduction level");
    auto tidx = thread::get_thread_id_flat<int64>();
    auto grid_size = thread::get_thread_num_flat<int64>();
    auto warp =
        group::tiled_partition<config::warp_size>(group::this_thread_block());
    auto partial = identity;
    for (int64 i = tidx; i < rows * cols; i += grid_size) {
        const auto row = i / cols;
        const auto col = i % cols;
        partial = op(partial, fn(row, col, args...));
    }
    partial = reduce(warp, partial, op);
    if (warp.thread_rank() == 0) {
        warp_partial[threadIdx.x / config::warp_size] = partial;
    }
    __syncthreads();
    if (threadIdx.x < config::warp_size) {
        partial = reduce(warp,
                         threadIdx.x < default_block_size / config::warp_size
                             ? warp_partial[threadIdx.x]
                             : identity,
                         op);
        if (threadIdx.x == 0) {
            storage[blockIdx.x] = finalize(partial);
        }
    }
}


template <typename ValueType, typename KernelFunction, typename ReductionOp,
          typename FinalizeOp, typename... KernelArgs>
void run_kernel_reduction(std::shared_ptr<const DefaultExecutor> exec,
                          KernelFunction fn, ReductionOp op,
                          FinalizeOp finalize, ValueType identity,
                          ValueType* result, size_type size,
                          KernelArgs&&... args)
{
    constexpr int oversubscription = 16;
    constexpr auto block_size = default_block_size;
    const auto num_blocks = std::min<int64>(
        ceildiv(size, block_size), exec->get_num_warps() * oversubscription);
    if (num_blocks > 1) {
        Array<ValueType> partial{exec, static_cast<size_type>(num_blocks)};
        generic_kernel_reduction_1d<<<num_blocks, block_size>>>(
            static_cast<int64>(size), fn, op,
            [] __device__(auto v) { return v; }, as_device_type(identity),
            as_device_type(partial.get_data()), map_to_device(args)...);
        generic_kernel_reduction_1d<<<1, block_size>>>(
            static_cast<int64>(num_blocks),
            [] __device__(auto i, auto v) { return v[i]; }, op, finalize,
            as_device_type(identity), as_device_type(result),
            as_device_type(partial.get_const_data()));
    } else {
        generic_kernel_reduction_1d<<<1, block_size>>>(
            static_cast<int64>(size), fn, op, finalize,
            as_device_type(identity), as_device_type(result),
            map_to_device(args)...);
    }
}


template <typename ValueType, typename KernelFunction, typename ReductionOp,
          typename FinalizeOp, typename... KernelArgs>
void run_kernel_reduction(std::shared_ptr<const DefaultExecutor> exec,
                          KernelFunction fn, ReductionOp op,
                          FinalizeOp finalize, ValueType identity,
                          ValueType* result, dim<2> size, KernelArgs&&... 
args) +{ + constexpr int oversubscription = 16; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = + std::min(ceildiv(rows * cols, block_size), + exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + generic_kernel_reduction_2d<<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_device_type(identity), as_device_type(partial.get_data()), + map_to_device(args)...); + generic_kernel_reduction_1d<<<1, block_size>>>( + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_device_type(identity), as_device_type(result), + as_device_type(partial.get_const_data())); + } else { + generic_kernel_reduction_2d<<<1, block_size>>>( + rows, cols, fn, op, finalize, as_device_type(identity), + as_device_type(result), map_to_device(args)...); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, KernelArgs... args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_block = idx / rows; + if (col_block >= col_blocks) { + return; + } + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = identity; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) +{ + const auto idx = thread::get_thread_id_flat(); + if (idx >= num_results) { + return; + } + auto partial = identity; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); + } + result[idx * result_stride] = finalize(partial); +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, + KernelArgs... args) +{ + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); + if (num_blocks > 0) { + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, finalize, + as_device_type(identity), as_device_type(result), result_stride, + args...); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction); + + +template +void run_generic_col_reduction_small( + syn::value_list, int64 max_blocks, + std::shared_ptr exec, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, + dim<2> size, MappedKernelArgs... 
args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + generic_kernel_col_reduction_2d_small + <<<1, default_block_size>>>(rows, cols, fn, op, finalize, + as_device_type(identity), + as_device_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + generic_kernel_col_reduction_2d_small + <<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_device_type(identity), + as_device_type(tmp_storage.get_data()), args...); + if (cols > 0) { + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, num_blocks, op, finalize, as_device_type(identity), + as_device_type(tmp_storage.get_const_data()), 1, + as_device_type(result)); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + // no need to guard this kernel, as rows * cols > resources + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, + [] __device__(auto v) { return v; }, as_device_type(identity), + as_device_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + generic_kernel_reduction_finalize_2d<<>>( + rows, col_blocks, op, finalize, as_device_type(identity), + as_device_type(partial.get_const_data()), + static_cast(result_stride), as_device_type(result)); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [cols](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, identity, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( + subwarp_sizes(), + [cols](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); + } else { + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + generic_kernel_col_reduction_2d_blocked<<>>( + rows, cols, fn, op, finalize, as_device_type(identity), + as_device_type(result), map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + // no need to guard this kernel, as cols > warp_size, row_blocks > 1 + generic_kernel_col_reduction_2d_blocked<<< + dim3(row_blocks, col_blocks), default_block_size>>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_device_type(identity), + as_device_type(tmp_storage.get_data()), map_to_device(args)...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, row_blocks, op, finalize, as_device_type(identity), + as_device_type(tmp_storage.get_const_data()), 1, + as_device_type(result)); + } + } +} diff --git a/common/cuda_hip/base/kernel_launch_solver.hpp.inc b/common/cuda_hip/base/kernel_launch_solver.hpp.inc new file mode 100644 index 00000000000..0d7a0cbce5d --- /dev/null +++ b/common/cuda_hip/base/kernel_launch_solver.hpp.inc @@ -0,0 +1,61 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +template +__global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( + int64 rows, int64 cols, int64 default_stride, KernelFunction fn, + KernelArgs... args) +{ + auto tidx = thread::get_thread_id_flat(); + auto col = tidx % cols; + auto row = tidx / cols; + if (row >= rows) { + return; + } + fn(row, col, + device_unpack_solver_impl::unpack(args, default_stride)...); +} + + +template +void run_kernel_solver(std::shared_ptr exec, + KernelFunction fn, dim<2> size, size_type default_stride, + KernelArgs&&... args) +{ + if (size[0] > 0 && size[1] > 0) { + constexpr auto block_size = default_block_size; + auto num_blocks = ceildiv(size[0] * size[1], block_size); + generic_kernel_2d_solver<<>>( + static_cast(size[0]), static_cast(size[1]), + static_cast(default_stride), fn, map_to_device(args)...); + } +} diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index 749a9016282..257948b3d4d 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -54,56 +54,7 @@ namespace device_std = thrust; constexpr int default_block_size = 512; -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_1d( - int64 size, KernelFunction fn, KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - if (tidx >= size) { - return; - } - fn(tidx, args...); -} - - -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_2d( - int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - auto col = tidx % cols; - auto row = tidx / cols; - if (row >= rows) { - return; - } - fn(row, col, args...); -} - - -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - size_type size, KernelArgs&&... args) -{ - if (size > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size, block_size); - generic_kernel_1d<<>>( - static_cast(size), fn, map_to_device(args)...); - } -} - -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - dim<2> size, KernelArgs&&... args) -{ - if (size[0] > 0 && size[1] > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size[0] * size[1], block_size); - generic_kernel_2d<<>>( - static_cast(size[0]), static_cast(size[1]), fn, - map_to_device(args)...); - } -} +#include "common/cuda_hip/base/kernel_launch.hpp.inc" } // namespace cuda diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index db7040b07f5..e6da25a1706 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -48,482 +48,7 @@ namespace kernels { namespace cuda { -template -__global__ __launch_bounds__( - default_block_size) void generic_kernel_reduction_1d(int64 size, - KernelFunction fn, - ReductionOp op, - FinalizeOp finalize, - ValueType identity, - ValueType* storage, - KernelArgs... 
args) -{ - __shared__ - UninitializedArray - warp_partial; - static_assert(default_block_size / config::warp_size <= config::warp_size, - "needs third reduction level"); - auto tidx = thread::get_thread_id_flat(); - auto grid_size = thread::get_thread_num_flat(); - auto warp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (int64 i = tidx; i < size; i += grid_size) { - partial = op(partial, fn(i, args...)); - } - partial = reduce(warp, partial, op); - if (warp.thread_rank() == 0) { - warp_partial[threadIdx.x / config::warp_size] = partial; - } - __syncthreads(); - if (threadIdx.x < config::warp_size) { - partial = reduce(warp, - threadIdx.x < default_block_size / config::warp_size - ? warp_partial[threadIdx.x] - : identity, - op); - if (threadIdx.x == 0) { - storage[blockIdx.x] = finalize(partial); - } - } -} - - -template -__global__ __launch_bounds__( - default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, - KernelFunction fn, - ReductionOp op, - FinalizeOp finalize, - ValueType identity, - ValueType* storage, - KernelArgs... args) -{ - __shared__ - UninitializedArray - warp_partial; - static_assert(default_block_size / config::warp_size <= config::warp_size, - "needs third reduction level"); - auto tidx = thread::get_thread_id_flat(); - auto grid_size = thread::get_thread_num_flat(); - auto warp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (int64 i = tidx; i < rows * cols; i += grid_size) { - const auto row = i / cols; - const auto col = i % cols; - partial = op(partial, fn(row, col, args...)); - } - partial = reduce(warp, partial, op); - if (warp.thread_rank() == 0) { - warp_partial[threadIdx.x / config::warp_size] = partial; - } - __syncthreads(); - if (threadIdx.x < config::warp_size) { - partial = reduce(warp, - threadIdx.x < default_block_size / config::warp_size - ? warp_partial[threadIdx.x] - : identity, - op); - if (threadIdx.x == 0) { - storage[blockIdx.x] = finalize(partial); - } - } -} - - -template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type size, - KernelArgs&&... args) -{ - constexpr int oversubscription = 16; - constexpr auto block_size = default_block_size; - const auto num_blocks = std::min( - ceildiv(size, block_size), exec->get_num_warps() * oversubscription); - if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; - generic_kernel_reduction_1d<<>>( - static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(identity), - as_cuda_type(partial.get_data()), map_to_device(args)...); - generic_kernel_reduction_1d<<<1, block_size>>>( - static_cast(num_blocks), - [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_cuda_type(identity), as_cuda_type(result), - as_cuda_type(partial.get_const_data())); - } else { - generic_kernel_reduction_1d<<<1, block_size>>>( - static_cast(size), fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), map_to_device(args)...); - } -} - - -template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... 
args) -{ - constexpr int oversubscription = 16; - constexpr auto block_size = default_block_size; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto num_blocks = - std::min(ceildiv(rows * cols, block_size), - exec->get_num_warps() * oversubscription); - if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; - generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(partial.get_data()), - map_to_device(args)...); - generic_kernel_reduction_1d<<<1, block_size>>>( - static_cast(num_blocks), - [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_cuda_type(identity), as_cuda_type(result), - as_cuda_type(partial.get_const_data())); - } else { - generic_kernel_reduction_2d<<<1, block_size>>>( - rows, cols, fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), map_to_device(args)...); - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, KernelArgs... args) -{ - const auto idx = thread::get_subwarp_id_flat(); - const auto row = idx % rows; - const auto col_block = idx / rows; - if (col_block >= col_blocks) { - return; - } - const auto cols_per_part = - ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; - const auto begin = cols_per_part * col_block; - const auto end = min(begin + cols_per_part, cols); - auto subwarp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (auto col = begin + subwarp.thread_rank(); col < end; - col += subwarp_size) { - partial = op(partial, fn(row, col, args...)); - } - partial = reduce(subwarp, partial, op); - if (subwarp.thread_rank() == 0) { - result[(row + col_block * rows) * result_stride] = finalize(partial); - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... 
args) -{ - constexpr auto warp_size = config::warp_size; - constexpr auto warps_per_block = default_block_size / warp_size; - // stores the subwarp_size partial sums from each warp, grouped by warp - constexpr auto shared_storage = warps_per_block * subwarp_size; - __shared__ UninitializedArray block_partial; - const auto subwarp_id = thread::get_subwarp_id_flat(); - const auto local_warp_id = threadIdx.x / warp_size; - const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; - const auto subwarp_num = - thread::get_subwarp_num_flat(); - const auto block = group::this_thread_block(); - const auto warp = group::tiled_partition(block); - const auto warp_rank = warp.thread_rank(); - const auto subwarp_rank = warp_rank % subwarp_size; - const auto col = static_cast(subwarp_rank); - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = subwarp_id; row < rows; row += subwarp_num) { - partial = op(partial, fn(row, col, args...)); - } - } - // accumulate between all subwarps in the warp -#pragma unroll - for (unsigned i = subwarp_size; i < warp_size; i *= 2) { - partial = op(partial, warp.shfl_xor(partial, i)); - } // store the result to shared memory - if (local_subwarp_id == 0) { - block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; - } - block.sync(); - // in a single thread: accumulate the results - if (local_warp_id == 0) { - partial = identity; - // accumulate the partial results within a thread - if (shared_storage >= warp_size) { -#pragma unroll - for (int i = 0; i < shared_storage; i += warp_size) { - partial = op(partial, block_partial[i + warp_rank]); - } - } else if (warp_rank < shared_storage) { - partial = op(partial, block_partial[warp_rank]); - } - // accumulate between all subwarps in the warp -#pragma unroll - for (unsigned i = subwarp_size; i < warp_size; i *= 2) { - partial = op(partial, warp.shfl_xor(partial, i)); - } - if (warp_rank < cols) { - result[warp_rank + blockIdx.x * cols] = finalize(partial); - } - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... 
args) -{ - constexpr auto warp_size = config::warp_size; - __shared__ UninitializedArray block_partial; - const auto warp_id = thread::get_subwarp_id_flat(); - const auto warp_num = thread::get_subwarp_num_flat(); - const auto block = group::this_thread_block(); - const auto warp = group::tiled_partition(block); - const auto warp_rank = warp.thread_rank(); - const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = warp_id; row < rows; row += warp_num) { - partial = op(partial, fn(row, col, args...)); - } - } - block_partial[threadIdx.x] = partial; - block.sync(); - // in a single warp: accumulate the results - if (threadIdx.x < warp_size) { - partial = identity; - // accumulate the partial results within a thread -#pragma unroll - for (int i = 0; i < default_block_size; i += warp_size) { - partial = op(partial, block_partial[i + warp_rank]); - } - if (col < cols) { - result[col + blockIdx.x * cols] = finalize(partial); - } - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType identity, const ValueType* input, - int64 result_stride, ValueType* result) -{ - const auto idx = thread::get_thread_id_flat(); - if (idx >= num_results) { - return; - } - auto partial = identity; - for (int64 block = 0; block < num_blocks; block++) { - partial = op(partial, input[idx + block * num_results]); - } - result[idx * result_stride] = finalize(partial); -} - - -namespace { - - -template -void run_generic_kernel_row_reduction(syn::value_list, - int64 rows, int64 cols, int64 col_blocks, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, - KernelArgs... args) -{ - const auto num_blocks = - ceildiv(rows * col_blocks * subwarp_size, default_block_size); - if (num_blocks > 0) { - generic_kernel_row_reduction_2d - <<>>( - rows, cols, col_blocks, fn, op, finalize, - as_cuda_type(identity), as_cuda_type(result), result_stride, - args...); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, - run_generic_kernel_row_reduction); - - -template -void run_generic_col_reduction_small(syn::value_list, - int64 max_blocks, - std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - MappedKernelArgs... 
args) -{ - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto num_blocks = std::min( - ceildiv(rows * subwarp_size, default_block_size), max_blocks); - if (num_blocks <= 1) { - generic_kernel_col_reduction_2d_small - <<<1, default_block_size>>>(rows, cols, fn, op, finalize, - as_cuda_type(identity), - as_cuda_type(result), args...); - } else { - Array tmp_storage{exec, - static_cast(num_blocks * cols)}; - generic_kernel_col_reduction_2d_small - <<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), - args...); - if (cols > 0) { - generic_kernel_reduction_finalize_2d<<< - ceildiv(cols, default_block_size), default_block_size>>>( - cols, num_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(tmp_storage.get_const_data()), 1, - as_cuda_type(result)); - } - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, - run_generic_col_reduction_small); - - -} // namespace - - -template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) -{ - using subwarp_sizes = - syn::value_list; - constexpr int oversubscription = 16; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto resources = - exec->get_num_warps() * config::warp_size * oversubscription; - if (rows * cols > resources && rows < cols) { - const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; - const auto num_blocks = - ceildiv(rows * col_blocks * config::warp_size, default_block_size); - // no need to guard this kernel, as rows * cols > resources - generic_kernel_row_reduction_2d - <<>>( - rows, cols, col_blocks, fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(identity), - as_cuda_type(partial.get_data()), 1, map_to_device(args)...); - const auto num_finalize_blocks = ceildiv(rows, default_block_size); - generic_kernel_reduction_finalize_2d<<>>( - rows, col_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(partial.get_const_data()), - static_cast(result_stride), as_cuda_type(result)); - } else { - select_run_generic_kernel_row_reduction( - subwarp_sizes(), - [cols](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), - map_to_device(args)...); - } -} - - -template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... 
args) -{ - using subwarp_sizes = - syn::value_list; - constexpr int oversubscription = 16; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto max_blocks = exec->get_num_warps() * config::warp_size * - oversubscription / default_block_size; - if (cols <= config::warp_size) { - select_generic_col_reduction_small( - subwarp_sizes(), - [cols](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); - } else { - const auto col_blocks = ceildiv(cols, config::warp_size); - const auto row_blocks = - ceildiv(std::min( - ceildiv(rows * config::warp_size, default_block_size), - max_blocks), - col_blocks); - if (row_blocks <= 1) { - generic_kernel_col_reduction_2d_blocked<<>>( - rows, cols, fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), map_to_device(args)...); - } else { - Array tmp_storage{ - exec, static_cast(row_blocks * cols)}; - // no need to guard this kernel, as cols > warp_size, row_blocks > 1 - generic_kernel_col_reduction_2d_blocked<<< - dim3(row_blocks, col_blocks), default_block_size>>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), - map_to_device(args)...); - generic_kernel_reduction_finalize_2d<<< - ceildiv(cols, default_block_size), default_block_size>>>( - cols, row_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(tmp_storage.get_const_data()), 1, - as_cuda_type(result)); - } - } -} +#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc" } // namespace cuda diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh index 38dc0afffff..117c47a1102 100644 --- a/cuda/base/kernel_launch_solver.cuh +++ b/cuda/base/kernel_launch_solver.cuh @@ -41,35 +41,7 @@ namespace kernels { namespace cuda { -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( - int64 rows, int64 cols, int64 default_stride, KernelFunction fn, - KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - auto col = tidx % cols; - auto row = tidx / cols; - if (row >= rows) { - return; - } - fn(row, col, - device_unpack_solver_impl::unpack(args, default_stride)...); -} - - -template -void run_kernel_solver(std::shared_ptr exec, - KernelFunction fn, dim<2> size, size_type default_stride, - KernelArgs&&... args) -{ - if (size[0] > 0 && size[1] > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size[0] * size[1], block_size); - generic_kernel_2d_solver<<>>( - static_cast(size[0]), static_cast(size[1]), - static_cast(default_stride), fn, map_to_device(args)...); - } -} +#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc" } // namespace cuda diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 07c11a98d7f..7d5c0a33303 100644 --- a/hip/base/kernel_launch.hip.hpp +++ b/hip/base/kernel_launch.hip.hpp @@ -55,58 +55,7 @@ namespace device_std = thrust; constexpr int default_block_size = 512; -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_1d( - int64 size, KernelFunction fn, KernelArgs... 
args) -{ - auto tidx = thread::get_thread_id_flat(); - if (tidx >= size) { - return; - } - fn(tidx, args...); -} - - -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_2d( - int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - auto col = tidx % cols; - auto row = tidx / cols; - if (row >= rows) { - return; - } - fn(row, col, args...); -} - - -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - size_type size, KernelArgs&&... args) -{ - if (size > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size, block_size); - hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0, - static_cast(size), fn, - map_to_device(args)...); - } -} - -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - dim<2> size, KernelArgs&&... args) -{ - if (size[0] > 0 && size[1] > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size[0] * size[1], block_size); - hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, - static_cast(size[0]), - static_cast(size[1]), fn, - map_to_device(args)...); - } -} +#include "common/cuda_hip/base/kernel_launch.hpp.inc" } // namespace hip diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index e93f811bfb1..575d772855b 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -48,493 +48,7 @@ namespace kernels { namespace hip { -template -__global__ __launch_bounds__( - default_block_size) void generic_kernel_reduction_1d(int64 size, - KernelFunction fn, - ReductionOp op, - FinalizeOp finalize, - ValueType identity, - ValueType* storage, - KernelArgs... args) -{ - __shared__ - UninitializedArray - warp_partial; - static_assert(default_block_size / config::warp_size <= config::warp_size, - "needs third reduction level"); - auto tidx = thread::get_thread_id_flat(); - auto grid_size = thread::get_thread_num_flat(); - auto warp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (int64 i = tidx; i < size; i += grid_size) { - partial = op(partial, fn(i, args...)); - } - partial = reduce(warp, partial, op); - if (warp.thread_rank() == 0) { - warp_partial[threadIdx.x / config::warp_size] = partial; - } - __syncthreads(); - if (threadIdx.x < config::warp_size) { - partial = reduce(warp, - threadIdx.x < default_block_size / config::warp_size - ? warp_partial[threadIdx.x] - : identity, - op); - if (threadIdx.x == 0) { - storage[blockIdx.x] = finalize(partial); - } - } -} - - -template -__global__ __launch_bounds__( - default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, - KernelFunction fn, - ReductionOp op, - FinalizeOp finalize, - ValueType identity, - ValueType* storage, - KernelArgs... 
args) -{ - __shared__ - UninitializedArray - warp_partial; - static_assert(default_block_size / config::warp_size <= config::warp_size, - "needs third reduction level"); - auto tidx = thread::get_thread_id_flat(); - auto grid_size = thread::get_thread_num_flat(); - auto warp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (int64 i = tidx; i < rows * cols; i += grid_size) { - const auto row = i / cols; - const auto col = i % cols; - partial = op(partial, fn(row, col, args...)); - } - partial = reduce(warp, partial, op); - if (warp.thread_rank() == 0) { - warp_partial[threadIdx.x / config::warp_size] = partial; - } - __syncthreads(); - if (threadIdx.x < config::warp_size) { - partial = reduce(warp, - threadIdx.x < default_block_size / config::warp_size - ? warp_partial[threadIdx.x] - : identity, - op); - if (threadIdx.x == 0) { - storage[blockIdx.x] = finalize(partial); - } - } -} - - -template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type size, - KernelArgs&&... args) -{ - constexpr int oversubscription = 16; - constexpr auto block_size = default_block_size; - const auto num_blocks = std::min( - ceildiv(size, block_size), exec->get_num_warps() * oversubscription); - if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; - hipLaunchKernelGGL( - generic_kernel_reduction_1d, num_blocks, block_size, 0, 0, - static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_hip_type(identity), - as_hip_type(partial.get_data()), map_to_device(args)...); - hipLaunchKernelGGL( - generic_kernel_reduction_1d, 1, block_size, 0, 0, - static_cast(num_blocks), - [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_hip_type(identity), as_hip_type(result), - as_hip_type(partial.get_const_data())); - } else { - hipLaunchKernelGGL(generic_kernel_reduction_1d, 1, block_size, 0, 0, - static_cast(size), fn, op, finalize, - as_hip_type(identity), as_hip_type(result), - map_to_device(args)...); - } -} - - -template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... 
args) -{ - constexpr int oversubscription = 16; - constexpr auto block_size = default_block_size; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto num_blocks = - std::min(ceildiv(rows * cols, block_size), - exec->get_num_warps() * oversubscription); - if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; - hipLaunchKernelGGL( - generic_kernel_reduction_2d, num_blocks, block_size, 0, 0, rows, - cols, fn, op, [] __device__(auto v) { return v; }, - as_hip_type(identity), as_hip_type(partial.get_data()), - map_to_device(args)...); - hipLaunchKernelGGL( - generic_kernel_reduction_1d, 1, block_size, 0, 0, - static_cast(num_blocks), - [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_hip_type(identity), as_hip_type(result), - as_hip_type(partial.get_const_data())); - } else { - hipLaunchKernelGGL(generic_kernel_reduction_2d, 1, block_size, 0, 0, - rows, cols, fn, op, finalize, as_hip_type(identity), - as_hip_type(result), map_to_device(args)...); - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, KernelArgs... args) -{ - const auto idx = thread::get_subwarp_id_flat(); - const auto row = idx % rows; - const auto col_block = idx / rows; - if (col_block >= col_blocks) { - return; - } - const auto cols_per_part = - ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; - const auto begin = cols_per_part * col_block; - const auto end = min(begin + cols_per_part, cols); - auto subwarp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (auto col = begin + subwarp.thread_rank(); col < end; - col += subwarp_size) { - partial = op(partial, fn(row, col, args...)); - } - partial = reduce(subwarp, partial, op); - if (subwarp.thread_rank() == 0) { - result[(row + col_block * rows) * result_stride] = finalize(partial); - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... 
args) -{ - constexpr auto warp_size = config::warp_size; - constexpr auto warps_per_block = default_block_size / warp_size; - // stores the subwarp_size partial sums from each warp, grouped by warp - constexpr auto shared_storage = warps_per_block * subwarp_size; - __shared__ UninitializedArray block_partial; - const auto subwarp_id = thread::get_subwarp_id_flat(); - const auto local_warp_id = threadIdx.x / warp_size; - const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; - const auto subwarp_num = - thread::get_subwarp_num_flat(); - const auto block = group::this_thread_block(); - const auto warp = group::tiled_partition(block); - const auto warp_rank = warp.thread_rank(); - const auto subwarp_rank = warp_rank % subwarp_size; - const auto col = static_cast(subwarp_rank); - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = subwarp_id; row < rows; row += subwarp_num) { - partial = op(partial, fn(row, col, args...)); - } - } - // accumulate between all subwarps in the warp -#pragma unroll - for (unsigned i = subwarp_size; i < warp_size; i *= 2) { - partial = op(partial, warp.shfl_xor(partial, i)); - } // store the result to shared memory - if (local_subwarp_id == 0) { - block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; - } - block.sync(); - // in a single thread: accumulate the results - if (local_warp_id == 0) { - partial = identity; - // accumulate the partial results within a thread - if (shared_storage >= warp_size) { -#pragma unroll - for (int i = 0; i < shared_storage; i += warp_size) { - partial = op(partial, block_partial[i + warp_rank]); - } - } else if (warp_rank < shared_storage) { - partial = op(partial, block_partial[warp_rank]); - } - // accumulate between all subwarps in the warp -#pragma unroll - for (unsigned i = subwarp_size; i < warp_size; i *= 2) { - partial = op(partial, warp.shfl_xor(partial, i)); - } - if (warp_rank < cols) { - result[warp_rank + blockIdx.x * cols] = finalize(partial); - } - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... 
args) -{ - constexpr auto warp_size = config::warp_size; - __shared__ UninitializedArray block_partial; - const auto warp_id = thread::get_subwarp_id_flat(); - const auto warp_num = thread::get_subwarp_num_flat(); - const auto block = group::this_thread_block(); - const auto warp = group::tiled_partition(block); - const auto warp_rank = warp.thread_rank(); - const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = warp_id; row < rows; row += warp_num) { - partial = op(partial, fn(row, col, args...)); - } - } - block_partial[threadIdx.x] = partial; - block.sync(); - // in a single warp: accumulate the results - if (threadIdx.x < warp_size) { - partial = identity; - // accumulate the partial results within a thread -#pragma unroll - for (int i = 0; i < default_block_size; i += warp_size) { - partial = op(partial, block_partial[i + warp_rank]); - } - if (col < cols) { - result[col + blockIdx.x * cols] = finalize(partial); - } - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType identity, const ValueType* input, - int64 result_stride, ValueType* result) -{ - const auto idx = thread::get_thread_id_flat(); - if (idx >= num_results) { - return; - } - auto partial = identity; - for (int64 block = 0; block < num_blocks; block++) { - partial = op(partial, input[idx + block * num_results]); - } - result[idx * result_stride] = finalize(partial); -} - - -namespace { - - -template -void run_generic_kernel_row_reduction(syn::value_list, - int64 rows, int64 cols, int64 col_blocks, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, - KernelArgs... args) -{ - const auto num_blocks = - ceildiv(rows * col_blocks * subwarp_size, default_block_size); - if (num_blocks > 0) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), - num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, - op, finalize, as_hip_type(identity), as_hip_type(result), - result_stride, args...); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, - run_generic_kernel_row_reduction); - - -template -void run_generic_col_reduction_small(syn::value_list, - int64 max_blocks, - std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - MappedKernelArgs... 
args) -{ - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto num_blocks = std::min( - ceildiv(rows * subwarp_size, default_block_size), max_blocks); - if (num_blocks <= 1) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - generic_kernel_col_reduction_2d_small), - 1, default_block_size, 0, 0, rows, cols, fn, op, finalize, - as_hip_type(identity), as_hip_type(result), args...); - } else { - Array tmp_storage{exec, - static_cast(num_blocks * cols)}; - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - generic_kernel_col_reduction_2d_small), - num_blocks, default_block_size, 0, 0, rows, cols, fn, op, - [] __device__(auto v) { return v; }, as_hip_type(identity), - as_hip_type(tmp_storage.get_data()), args...); - if (cols > 0) { - hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, - ceildiv(cols, default_block_size), - default_block_size, 0, 0, cols, num_blocks, op, - finalize, as_hip_type(identity), - as_hip_type(tmp_storage.get_const_data()), 1, - as_hip_type(result)); - } - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, - run_generic_col_reduction_small); - - -} // namespace - - -template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) -{ - using subwarp_sizes = - syn::value_list; - constexpr int oversubscription = 16; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto resources = - exec->get_num_warps() * config::warp_size * oversubscription; - if (rows * cols > resources && rows < cols) { - const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; - const auto num_blocks = - ceildiv(rows * col_blocks * config::warp_size, default_block_size); - // no need to guard this kernel, as rows * cols > resources - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), - num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, - op, [] __device__(auto v) { return v; }, as_hip_type(identity), - as_hip_type(partial.get_data()), 1, map_to_device(args)...); - const auto num_finalize_blocks = ceildiv(rows, default_block_size); - hipLaunchKernelGGL( - generic_kernel_reduction_finalize_2d, num_finalize_blocks, - default_block_size, 0, 0, rows, col_blocks, op, finalize, - as_hip_type(identity), as_hip_type(partial.get_const_data()), - static_cast(result_stride), as_hip_type(result)); - } else { - select_run_generic_kernel_row_reduction( - subwarp_sizes(), - [cols](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), - map_to_device(args)...); - } -} - - -template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... 
args) -{ - using subwarp_sizes = - syn::value_list; - constexpr int oversubscription = 16; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto max_blocks = exec->get_num_warps() * config::warp_size * - oversubscription / default_block_size; - if (cols <= config::warp_size) { - select_generic_col_reduction_small( - subwarp_sizes(), - [cols](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); - } else { - const auto col_blocks = ceildiv(cols, config::warp_size); - const auto row_blocks = - ceildiv(std::min( - ceildiv(rows * config::warp_size, default_block_size), - max_blocks), - col_blocks); - if (row_blocks <= 1) { - hipLaunchKernelGGL(generic_kernel_col_reduction_2d_blocked, - dim3(1, col_blocks), default_block_size, 0, 0, - rows, cols, fn, op, finalize, - as_hip_type(identity), as_hip_type(result), - map_to_device(args)...); - } else { - Array tmp_storage{ - exec, static_cast(row_blocks * cols)}; - // no need to guard this kernel, as cols > warp_size, row_blocks > 1 - hipLaunchKernelGGL( - generic_kernel_col_reduction_2d_blocked, - dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, - cols, fn, op, [] __device__(auto v) { return v; }, - as_hip_type(identity), as_hip_type(tmp_storage.get_data()), - map_to_device(args)...); - hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, - ceildiv(cols, default_block_size), - default_block_size, 0, 0, cols, row_blocks, op, - finalize, as_hip_type(identity), - as_hip_type(tmp_storage.get_const_data()), 1, - as_hip_type(result)); - } - } -} +#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc" } // namespace hip diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp index adbe5046818..250c0f83074 100644 --- a/hip/base/kernel_launch_solver.hip.hpp +++ b/hip/base/kernel_launch_solver.hip.hpp @@ -44,37 +44,7 @@ namespace kernels { namespace hip { -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( - int64 rows, int64 cols, int64 default_stride, KernelFunction fn, - KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - auto col = tidx % cols; - auto row = tidx / cols; - if (row >= rows) { - return; - } - fn(row, col, - device_unpack_solver_impl::unpack(args, default_stride)...); -} - - -template -void run_kernel_solver(std::shared_ptr exec, - KernelFunction fn, dim<2> size, size_type default_stride, - KernelArgs&&... 
args)
-{
-    if (size[0] > 0 && size[1] > 0) {
-        constexpr auto block_size = kernels::hip::default_block_size;
-        auto num_blocks = ceildiv(size[0] * size[1], block_size);
-        hipLaunchKernelGGL(kernels::hip::generic_kernel_2d_solver, num_blocks,
-                           block_size, 0, 0, static_cast<int64>(size[0]),
-                           static_cast<int64>(size[1]),
-                           static_cast<int64>(default_stride), fn,
-                           kernels::hip::map_to_device(args)...);
-    }
-}
+#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc"
 
 
 }  // namespace hip

From 73c67a4cee461014b60ad1f77b34386695973cf6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Mon, 21 Mar 2022 13:45:26 +0100
Subject: [PATCH 2/5] add Dense reduction kernels using tmp array

---
 .../base/kernel_launch_reduction.hpp.inc      | 109 +++---
 .../unified/base/kernel_launch_reduction.hpp  |  67 +++++
 common/unified/matrix/dense_kernels.cpp       |  26 +-
 core/device_hooks/common_kernels.inc.cpp      |   3 -
 core/matrix/dense.cpp                         | 145 +++++++++-
 core/matrix/dense_kernels.hpp                 |  45 +--
 core/synthesizer/implementation_selection.hpp |   8 +-
 cuda/matrix/dense_kernels.cu                  |  70 -----
 cuda/solver/cb_gmres_kernels.cu               |   4 +-
 cuda/solver/gmres_kernels.cu                  |   3 +-
 cuda/test/base/kernel_launch.cu               | 100 +++++--
 dpcpp/base/kernel_launch_reduction.dp.hpp     | 130 +++++----
 dpcpp/matrix/dense_kernels.dp.cpp             |  41 ---
 dpcpp/solver/cb_gmres_kernels.dp.cpp          |   3 +-
 dpcpp/solver/gmres_kernels.dp.cpp             |   3 +-
 dpcpp/test/base/kernel_launch.dp.cpp          | 104 +++++--
 hip/matrix/dense_kernels.hip.cpp              |  70 -----
 hip/solver/cb_gmres_kernels.hip.cpp           |   3 +-
 hip/solver/gmres_kernels.hip.cpp              |   3 +-
 hip/test/base/kernel_launch.hip.cpp           | 256 ++++++++++++------
 include/ginkgo/core/matrix/dense.hpp          | 134 ++++-----
 omp/base/kernel_launch_reduction.hpp          | 127 +++++----
 omp/matrix/dense_kernels.cpp                  |  39 ---
 omp/test/base/kernel_launch.cpp               | 112 +++++---
 reference/matrix/dense_kernels.cpp            |  48 +---
 test/matrix/dense_kernels.cpp                 | 121 ++++++++-
 26 files changed, 1051 insertions(+), 723 deletions(-)

diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
index 28c9d8ba4fd..4ecf9c9979d 100644
--- a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
+++ b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
@@ -118,27 +118,33 @@ __global__ __launch_bounds__(
 
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
           typename FinalizeOp, typename... KernelArgs>
-void run_kernel_reduction(std::shared_ptr<const DefaultExecutor> exec,
-                          KernelFunction fn, ReductionOp op,
-                          FinalizeOp finalize, ValueType identity,
-                          ValueType* result, size_type size,
-                          KernelArgs&&... args)
+void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
+                                 KernelFunction fn, ReductionOp op,
+                                 FinalizeOp finalize, ValueType identity,
+                                 ValueType* result, size_type size,
+                                 Array<char>& tmp, KernelArgs&&...
args) { constexpr int oversubscription = 16; constexpr auto block_size = default_block_size; const auto num_blocks = std::min( ceildiv(size, block_size), exec->get_num_warps() * oversubscription); if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; + const auto required_storage = sizeof(ValueType) * num_blocks; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } generic_kernel_reduction_1d<<>>( static_cast(size), fn, op, [] __device__(auto v) { return v; }, as_device_type(identity), - as_device_type(partial.get_data()), map_to_device(args)...); + as_device_type(reinterpret_cast(tmp.get_data())), + map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, as_device_type(identity), as_device_type(result), - as_device_type(partial.get_const_data())); + as_device_type( + reinterpret_cast(tmp.get_const_data()))); } else { generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(size), fn, op, finalize, @@ -150,10 +156,11 @@ void run_kernel_reduction(std::shared_ptr exec, template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... args) { constexpr int oversubscription = 16; constexpr auto block_size = default_block_size; @@ -163,16 +170,22 @@ void run_kernel_reduction(std::shared_ptr exec, std::min(ceildiv(rows * cols, block_size), exec->get_num_warps() * oversubscription); if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; + const auto required_storage = sizeof(ValueType) * num_blocks; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } generic_kernel_reduction_2d<<>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_device_type(identity), as_device_type(partial.get_data()), + as_device_type(identity), + as_device_type(reinterpret_cast(tmp.get_data())), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, as_device_type(identity), as_device_type(result), - as_device_type(partial.get_const_data())); + as_device_type( + reinterpret_cast(tmp.get_const_data()))); } else { generic_kernel_reduction_2d<<<1, block_size>>>( rows, cols, fn, op, finalize, as_device_type(identity), @@ -369,7 +382,7 @@ void run_generic_col_reduction_small( syn::value_list, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, - dim<2> size, MappedKernelArgs... args) + dim<2> size, Array& tmp, MappedKernelArgs... 
args) { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -381,19 +394,24 @@ void run_generic_col_reduction_small( as_device_type(identity), as_device_type(result), args...); } else { - Array tmp_storage{exec, - static_cast(num_blocks * cols)}; + const auto required_storage = sizeof(ValueType) * num_blocks * cols; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } generic_kernel_col_reduction_2d_small <<>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, as_device_type(identity), - as_device_type(tmp_storage.get_data()), args...); + as_device_type(reinterpret_cast(tmp.get_data())), + args...); if (cols > 0) { generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( cols, num_blocks, op, finalize, as_device_type(identity), - as_device_type(tmp_storage.get_const_data()), 1, - as_device_type(result)); + as_device_type( + reinterpret_cast(tmp.get_const_data())), + 1, as_device_type(result)); } } } @@ -407,11 +425,11 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) +void run_kernel_row_reduction_cached( + std::shared_ptr exec, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, + size_type result_stride, dim<2> size, Array& tmp, + KernelArgs&&... args) { using subwarp_sizes = syn::value_list; @@ -422,8 +440,11 @@ void run_kernel_row_reduction(std::shared_ptr exec, exec->get_num_warps() * config::warp_size * oversubscription; if (rows * cols > resources && rows < cols) { const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; + const auto required_storage = sizeof(ValueType) * col_blocks * rows; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } const auto num_blocks = ceildiv(rows * col_blocks * config::warp_size, default_block_size); // no need to guard this kernel, as rows * cols > resources @@ -431,12 +452,14 @@ void run_kernel_row_reduction(std::shared_ptr exec, <<>>( rows, cols, col_blocks, fn, op, [] __device__(auto v) { return v; }, as_device_type(identity), - as_device_type(partial.get_data()), 1, map_to_device(args)...); + as_device_type(reinterpret_cast(tmp.get_data())), 1, + map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, default_block_size); generic_kernel_reduction_finalize_2d<<>>( rows, col_blocks, op, finalize, as_device_type(identity), - as_device_type(partial.get_const_data()), + as_device_type( + reinterpret_cast(tmp.get_const_data())), static_cast(result_stride), as_device_type(result)); } else { select_run_generic_kernel_row_reduction( @@ -454,11 +477,10 @@ void run_kernel_row_reduction(std::shared_ptr exec, template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... args) +void run_kernel_col_reduction_cached( + std::shared_ptr exec, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, + dim<2> size, Array& tmp, KernelArgs&&... 
args) { using subwarp_sizes = syn::value_list; @@ -475,7 +497,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); + op, finalize, identity, result, size, tmp, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = @@ -489,19 +511,24 @@ void run_kernel_col_reduction(std::shared_ptr exec, rows, cols, fn, op, finalize, as_device_type(identity), as_device_type(result), map_to_device(args)...); } else { - Array tmp_storage{ - exec, static_cast(row_blocks * cols)}; + const auto required_storage = sizeof(ValueType) * row_blocks * cols; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } // no need to guard this kernel, as cols > warp_size, row_blocks > 1 generic_kernel_col_reduction_2d_blocked<<< dim3(row_blocks, col_blocks), default_block_size>>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, as_device_type(identity), - as_device_type(tmp_storage.get_data()), map_to_device(args)...); + as_device_type(reinterpret_cast(tmp.get_data())), + map_to_device(args)...); generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( cols, row_blocks, op, finalize, as_device_type(identity), - as_device_type(tmp_storage.get_const_data()), 1, - as_device_type(result)); + as_device_type( + reinterpret_cast(tmp.get_const_data())), + 1, as_device_type(result)); } } } diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp index 3e64ddc2819..f70845efb5e 100644 --- a/common/unified/base/kernel_launch_reduction.hpp +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -58,4 +58,71 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + Array cache{exec}; + run_kernel_reduction_cached(exec, fn, op, finalize, identity, result, size, + cache, std::forward(args)...); +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + Array cache{exec}; + run_kernel_reduction_cached(exec, fn, op, finalize, identity, result, size, + cache, std::forward(args)...); +} + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + Array cache{exec}; + run_kernel_row_reduction_cached(exec, fn, op, finalize, identity, result, + result_stride, size, cache, + std::forward(args)...); +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args)
+{
+    Array<char> cache{exec};
+    run_kernel_col_reduction_cached(exec, fn, op, finalize, identity, result,
+                                    size, cache,
+                                    std::forward<KernelArgs>(args)...);
+}
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
 #endif  // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp
index 51a411daf3b..a328c5b56ad 100644
--- a/common/unified/matrix/dense_kernels.cpp
+++ b/common/unified/matrix/dense_kernels.cpp
@@ -246,15 +246,15 @@ template <typename ValueType>
 void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
                  const matrix::Dense<ValueType>* x,
                  const matrix::Dense<ValueType>* y,
-                 matrix::Dense<ValueType>* result)
+                 matrix::Dense<ValueType>* result, Array<char>& tmp)
 {
-    run_kernel_col_reduction(
+    run_kernel_col_reduction_cached(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
             return x(i, j) * y(i, j);
         },
         GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(),
-        x, y);
+        tmp, x, y);
 }
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
@@ -264,15 +264,15 @@ template <typename ValueType>
 void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Dense<ValueType>* x,
                       const matrix::Dense<ValueType>* y,
-                      matrix::Dense<ValueType>* result)
+                      matrix::Dense<ValueType>* result, Array<char>& tmp)
 {
-    run_kernel_col_reduction(
+    run_kernel_col_reduction_cached(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
             return conj(x(i, j)) * y(i, j);
         },
         GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(),
-        x, y);
+        tmp, x, y);
 }
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
@@ -281,14 +281,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<ValueType>* x,
-                   matrix::Dense<remove_complex<ValueType>>* result)
+                   matrix::Dense<remove_complex<ValueType>>* result,
+                   Array<char>& tmp)
 {
-    run_kernel_col_reduction(
+    run_kernel_col_reduction_cached(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); },
         [] GKO_KERNEL(auto a, auto b) { return a + b; },
         [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex<ValueType>{},
-        result->get_values(), x->get_size(), x);
+        result->get_values(), x->get_size(), tmp, x);
 }
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
@@ -296,12 +297,13 @@ template <typename ValueType>
 void compute_norm1(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<ValueType>* x,
-                   matrix::Dense<remove_complex<ValueType>>* result)
+                   matrix::Dense<remove_complex<ValueType>>* result,
+                   Array<char>& tmp)
 {
-    run_kernel_col_reduction(
+    run_kernel_col_reduction_cached(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); },
         GKO_KERNEL_REDUCE_SUM(remove_complex<ValueType>), result->get_values(),
-        x->get_size(), x);
+        x->get_size(), tmp, x);
 }
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
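// Usage sketch for the *_cached interface above, assuming only the signatures
// introduced in this patch; `exec`, `x`, `result`, `size`, `num_iters` and the
// surrounding loop are hypothetical:
//
//     Array<char> tmp{exec};  // type-erased workspace, reused across calls
//     for (int iter = 0; iter < num_iters; ++iter) {
//         run_kernel_reduction_cached(
//             exec, [] GKO_KERNEL(auto i, auto x) { return abs(x[i]); },
//             [] GKO_KERNEL(auto a, auto b) { return a + b; },
//             [] GKO_KERNEL(auto a) { return a; }, double{}, result, size,
//             tmp, x);
//     }
//
// Because the workspace is an Array<char>, one buffer can back reductions of
// different value types; a call reallocates only when the required
// sizeof(ValueType) * num_blocks bytes exceed the buffer's current size.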
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index a89d1f3c140..13dae15583f 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -256,11 +256,8 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 49f128a126d..afa0de1f0e9 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -77,12 +77,8 @@ GKO_REGISTER_OPERATION(sub_scaled, dense::sub_scaled);
 GKO_REGISTER_OPERATION(add_scaled_diag, dense::add_scaled_diag);
 GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag);
 GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot);
-GKO_REGISTER_OPERATION(compute_dot_dispatch, dense::compute_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot);
-GKO_REGISTER_OPERATION(compute_conj_dot_dispatch,
-                       dense::compute_conj_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2);
-GKO_REGISTER_OPERATION(compute_norm2_dispatch, dense::compute_norm2_dispatch);
 GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1);
 GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row);
 GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs,
@@ -154,6 +150,75 @@ void Dense<ValueType>::fill(const ValueType value)
 }
 
 
+template <typename ValueType>
+void Dense<ValueType>::scale(const LinOp* alpha)
+{
+    auto exec = this->get_executor();
+    this->scale_impl(make_temporary_clone(exec, alpha).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::inv_scale(const LinOp* alpha)
+{
+    auto exec = this->get_executor();
+    this->inv_scale_impl(make_temporary_clone(exec, alpha).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::add_scaled(const LinOp* alpha, const LinOp* b)
+{
+    auto exec = this->get_executor();
+    this->add_scaled_impl(make_temporary_clone(exec, alpha).get(),
+                          make_temporary_clone(exec, b).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::sub_scaled(const LinOp* alpha, const LinOp* b)
+{
+    auto exec = this->get_executor();
+    this->sub_scaled_impl(make_temporary_clone(exec, alpha).get(),
+                          make_temporary_clone(exec, b).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_dot(const LinOp* b, LinOp* result) const
+{
+    auto exec = this->get_executor();
+    this->compute_dot_impl(make_temporary_clone(exec, b).get(),
+                           make_temporary_output_clone(exec, result).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_conj_dot(const LinOp* b, LinOp* result) const
+{
+    auto exec = this->get_executor();
+    this->compute_conj_dot_impl(
+        make_temporary_clone(exec, b).get(),
+        make_temporary_output_clone(exec, result).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_norm2(LinOp* result) const
+{
+    auto exec = this->get_executor();
+    this->compute_norm2_impl(make_temporary_output_clone(exec, result).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_norm1(LinOp* result) const
+{
+    auto exec = this->get_executor();
+    this->compute_norm1_impl(make_temporary_output_clone(exec, result).get());
+}
+
+
 template <typename ValueType>
 void Dense<ValueType>::inv_scale_impl(const LinOp* alpha)
 {
@@ -269,6 +334,22 @@ void Dense<ValueType>::sub_scaled_impl(const LinOp* alpha, const LinOp* b)
 }
 
 
+template <typename ValueType>
+void Dense<ValueType>::compute_dot(const LinOp* b, LinOp* result,
+                                   Array<char>& tmp) const
+{
+    GKO_ASSERT_EQUAL_DIMENSIONS(this, b);
+    GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1]));
+    auto exec = this->get_executor();
+    auto local_b = make_temporary_clone(exec, b);
+    auto local_res = make_temporary_clone(exec, result);
+    auto dense_b = make_temporary_conversion<ValueType>(local_b.get());
+    auto dense_res = make_temporary_conversion<ValueType>(local_res.get());
+    exec->run(
dense::make_compute_dot(this, dense_b.get(), dense_res.get(), tmp)); +} + + template void Dense::compute_dot_impl(const LinOp* b, LinOp* result) const { @@ -277,8 +358,25 @@ void Dense::compute_dot_impl(const LinOp* b, LinOp* result) const auto exec = this->get_executor(); auto dense_b = make_temporary_conversion(b); auto dense_res = make_temporary_conversion(result); + Array tmp{exec}; exec->run( - dense::make_compute_dot_dispatch(this, dense_b.get(), dense_res.get())); + dense::make_compute_dot(this, dense_b.get(), dense_res.get(), tmp)); +} + + +template +void Dense::compute_conj_dot(const LinOp* b, LinOp* result, + Array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(this, b); + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + auto local_b = make_temporary_clone(exec, b); + auto local_res = make_temporary_clone(exec, result); + auto dense_b = make_temporary_conversion(local_b.get()); + auto dense_res = make_temporary_conversion(local_res.get()); + exec->run(dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get(), + tmp)); } @@ -291,8 +389,21 @@ void Dense::compute_conj_dot_impl(const LinOp* b, auto exec = this->get_executor(); auto dense_b = make_temporary_conversion(b); auto dense_res = make_temporary_conversion(result); - exec->run(dense::make_compute_conj_dot_dispatch(this, dense_b.get(), - dense_res.get())); + Array tmp{exec}; + exec->run(dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get(), + tmp)); +} + + +template +void Dense::compute_norm2(LinOp* result, Array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + auto local_result = make_temporary_clone(exec, result); + auto dense_res = make_temporary_conversion>( + local_result.get()); + exec->run(dense::make_compute_norm2(this, dense_res.get(), tmp)); } @@ -303,9 +414,23 @@ void Dense::compute_norm2_impl(LinOp* result) const auto exec = this->get_executor(); auto dense_res = make_temporary_conversion>(result); - exec->run(dense::make_compute_norm2_dispatch(this, dense_res.get())); + Array tmp{exec}; + exec->run(dense::make_compute_norm2(this, dense_res.get(), tmp)); +} + + +template +void Dense::compute_norm1(LinOp* result, Array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + auto local_result = make_temporary_clone(exec, result); + auto dense_res = make_temporary_conversion>( + local_result.get()); + exec->run(dense::make_compute_norm1(this, dense_res.get(), tmp)); } + template void Dense::compute_norm1_impl(LinOp* result) const { @@ -313,9 +438,11 @@ void Dense::compute_norm1_impl(LinOp* result) const auto exec = this->get_executor(); auto dense_res = make_temporary_conversion>(result); - exec->run(dense::make_compute_norm1(this, dense_res.get())); + Array tmp{exec}; + exec->run(dense::make_compute_norm1(this, dense_res.get(), tmp)); } + template void Dense::convert_to(Dense* result) const { diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index b5eded2ceb4..1ef142eb4d7 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -104,44 +104,29 @@ namespace kernels { const matrix::Diagonal<_type>* x, \ matrix::Dense<_type>* y) -#define GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(_type) \ - void compute_dot_dispatch(std::shared_ptr exec, \ - const matrix::Dense<_type>* x, \ - const matrix::Dense<_type>* y, \ - matrix::Dense<_type>* result) - #define 
GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(_type) \ void compute_dot(std::shared_ptr exec, \ const matrix::Dense<_type>* x, \ const matrix::Dense<_type>* y, \ - matrix::Dense<_type>* result) - -#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(_type) \ - void compute_conj_dot_dispatch( \ - std::shared_ptr exec, \ - const matrix::Dense<_type>* x, const matrix::Dense<_type>* y, \ - matrix::Dense<_type>* result) + matrix::Dense<_type>* result, Array& tmp) #define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type) \ void compute_conj_dot(std::shared_ptr exec, \ const matrix::Dense<_type>* x, \ const matrix::Dense<_type>* y, \ - matrix::Dense<_type>* result) - -#define GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(_type) \ - void compute_norm2(std::shared_ptr exec, \ - const matrix::Dense<_type>* x, \ - matrix::Dense>* result) + matrix::Dense<_type>* result, Array& tmp) -#define GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(_type) \ - void compute_norm2_dispatch(std::shared_ptr exec, \ - const matrix::Dense<_type>* x, \ - matrix::Dense>* result) +#define GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(_type) \ + void compute_norm2(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + matrix::Dense>* result, \ + Array& tmp) -#define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type) \ - void compute_norm1(std::shared_ptr exec, \ - const matrix::Dense<_type>* x, \ - matrix::Dense>* result) +#define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type) \ + void compute_norm1(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + matrix::Dense>* result, \ + Array& tmp) #define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \ void fill_in_matrix_data(std::shared_ptr exec, \ @@ -325,16 +310,10 @@ namespace kernels { template \ GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType); \ template \ - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType); \ - template \ GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType); \ template \ - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType); \ - template \ GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType); \ template \ - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \ - template \ GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index a78ac391d2e..a2cf335840e 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -50,7 +50,7 @@ namespace syn { typename... InferredArgs> \ inline void _name(::gko::syn::value_list, Predicate, \ ::gko::syn::value_list, \ - ::gko::syn::type_list, InferredArgs...) \ + ::gko::syn::type_list, InferredArgs&&...) \ GKO_KERNEL_NOT_FOUND; \ \ template , Predicate is_eligible, \ ::gko::syn::value_list int_args, \ - ::gko::syn::type_list type_args, InferredArgs... args) \ + ::gko::syn::type_list type_args, InferredArgs&&... args) \ { \ if (is_eligible(K)) { \ _callable( \ @@ -81,7 +81,7 @@ namespace syn { ::gko::syn::value_list, \ ::gko::syn::value_list, \ ::gko::syn::value_list, \ - ::gko::syn::type_list, InferredArgs...) \ + ::gko::syn::type_list, InferredArgs&&...) \ GKO_KERNEL_NOT_FOUND; \ \ template bool_args, \ ::gko::syn::value_list int_args, \ ::gko::syn::value_list size_args, \ - ::gko::syn::type_list type_args, InferredArgs... args) \ + ::gko::syn::type_list type_args, InferredArgs&&... 
args) \ { \ if (is_eligible(K)) { \ _callable( \ diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 0d39cbb9fbc..063b0f069d3 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -73,76 +73,6 @@ constexpr int default_block_size = 512; #include "common/cuda_hip/matrix/dense_kernels.hpp.inc" -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - compute_dot(exec, x, y, result); - } - } else { - compute_dot(exec, x, y, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_conj_dot(exec, x, y, result); - } - } else { - compute_conj_dot(exec, x, y, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (x->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - compute_norm2(exec, x, result); - } - } else { - compute_norm2(exec, x, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index f494fa760b8..7bee0750390 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -137,12 +137,14 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); + Array tmp{exec}; + initialize_2_1_kernel<<>>( residual->get_size()[0], residual->get_size()[1], krylov_dim, acc::as_cuda_range(krylov_bases), as_cuda_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); - kernels::cuda::dense::compute_norm2(exec, residual, residual_norm); + kernels::cuda::dense::compute_norm2(exec, residual, residual_norm, tmp); if (use_scalar) { components::fill_array(exec, diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index 5e946868fdf..d228b47dd37 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -117,8 +117,9 @@ void initialize_2(std::shared_ptr exec, default_block_size); const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; + Array tmp{exec}; - kernels::cuda::dense::compute_norm2(exec, residual, residual_norm); + kernels::cuda::dense::compute_norm2(exec, residual, residual_norm, tmp); const auto grid_dim_2 = 
ceildiv(num_rows * num_rhs, default_block_size); initialize_2_2_kernel<<>>( diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 647d42120ad..af1a3d471f9 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -51,15 +51,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" -namespace { - - using gko::dim; using gko::int64; using gko::size_type; using std::is_same; +struct move_only_type { + move_only_type() {} + + move_only_type(move_only_type&&) {} + + move_only_type(const move_only_type&) = delete; +}; + + +move_only_type move_only_val{}; + + +namespace gko { +namespace kernels { +namespace cuda { + + +template <> +struct to_device_type_impl { + using type = int64; + + static type map_to_device(move_only_type&) { return 0; } +}; + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + class KernelLaunch : public ::testing::Test { protected: KernelLaunch() @@ -105,12 +132,13 @@ void run1d(std::shared_ptr exec, size_type dim, int* data) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d) { + [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i] = i; }, - dim, data); + dim, data, move_only_val); } TEST_F(KernelLaunch, Runs1D) @@ -125,17 +153,18 @@ void run1d(std::shared_ptr exec, gko::Array& data) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i] = i; } else { d[i] = 0; } }, - data.get_num_elems(), data, data.get_const_data()); + data.get_num_elems(), data, data.get_const_data(), move_only_val); } TEST_F(KernelLaunch, Runs1DArray) @@ -150,13 +179,14 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr; bool strides_correct = d.stride == 5 && d2.stride == 5; bool accessors_2d_correct = @@ -171,7 +201,7 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) } }, 16, m, static_cast*>(m), - m->get_const_values()); + m->get_const_values(), move_only_val); } TEST_F(KernelLaunch, Runs1DDense) @@ -186,13 +216,14 @@ void run2d(std::shared_ptr exec, int* data) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d) { + [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i + 4 * j] = 4 * i + j; }, - dim<2>{4, 4}, data); + dim<2>{4, 4}, data, move_only_val); } TEST_F(KernelLaunch, Runs2D) @@ -207,18 +238,19 @@ void run2d(std::shared_ptr exec, gko::Array& data) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); 
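            // `dummy` is the device-mapped form of `move_only_val`: the
            // to_device_type_impl specialization above maps move_only_type to
            // int64, which the decltype(dummy) assertion below verifies.
            // Because move_only_type has a deleted copy constructor, these
            // kernels also fail to compile if the launch path ever copies a
            // kernel argument on the host side.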
static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i + 4 * j] = 4 * i + j; } else { d[i + 4 * j] = 0; } }, - dim<2>{4, 4}, data, data.get_const_data()); + dim<2>{4, 4}, data, data.get_const_data(), move_only_val); } TEST_F(KernelLaunch, Runs2DArray) @@ -235,7 +267,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, gko::kernels::cuda::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, - auto d4, auto d2_ptr, auto d3_ptr) { + auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -246,6 +278,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr && d3.data == d2_ptr && d4 == d3_ptr; bool strides_correct = @@ -266,7 +299,8 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, dim<2>{4, 4}, m2->get_stride(), m1, static_cast*>(m1), m1->get_const_values(), gko::kernels::cuda::default_stride(m2), - gko::kernels::cuda::row_vector(m3), m2->get_values(), m3->get_values()); + gko::kernels::cuda::row_vector(m3), m2->get_values(), m3->get_values(), + move_only_val); } TEST_F(KernelLaunch, Runs2DDense) @@ -283,23 +317,25 @@ void run1d_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}, output); + size_type{100000}, output, move_only_val); // 2 * sum i=0...99999 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { @@ -311,7 +347,7 @@ void run1d_reduction(std::shared_ptr exec) static_assert(is_same::value, "value"); return j * 2; }, - int64{}, output.get_data(), size_type{100}, output); + int64{}, output.get_data(), size_type{100}, output, move_only_val); // 2 * sum i=0...99 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); @@ -326,10 +362,11 @@ void run2d_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -341,17 +378,19 @@ void run2d_reduction(std::shared_ptr exec) static_assert(is_same::value, "value"); return j * 4; }, - int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output, + move_only_val); // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) 
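    // explicitly: sum i=1..1000 of i is 500500, sum j=1..100 of j is 5050,
    // and 4 * 500500 * 5050 = 10110100000, matching the assertion below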
ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -363,7 +402,7 @@ void run2d_reduction(std::shared_ptr exec) static_assert(is_same::value, "value"); return j * 4; }, - int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + int64{}, output.get_data(), gko::dim<2>{10, 10}, output, move_only_val); // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); @@ -391,10 +430,12 @@ void run2d_row_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -409,7 +450,7 @@ void run2d_row_reduction(std::shared_ptr exec) int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -440,10 +481,12 @@ void run2d_col_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -458,7 +501,7 @@ void run2d_col_reduction(std::shared_ptr exec) int64{}, output.get_data(), gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -466,6 +509,3 @@ void run2d_col_reduction(std::shared_ptr exec) } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } - - -} // namespace diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 0df00f58c6e..330e5aa7f44 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -168,7 +168,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, - MappedKernelArgs... args) + Array& tmp, MappedKernelArgs... 
args) { constexpr int oversubscription = 4; constexpr auto wg_size = KCFG_1D::decode<0>(cfg); @@ -178,18 +178,23 @@ void run_kernel_reduction_impl(std::shared_ptr exec, exec->get_num_computing_units() * oversubscription); auto queue = exec->get_queue(); if (num_workgroups > 1) { - Array partial{exec, static_cast(num_workgroups)}; + const auto required_storage = sizeof(ValueType) * num_workgroups; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(size), num_workgroups, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), - args...); + [](auto v) { return v; }, identity, + reinterpret_cast(tmp.get_data()), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, identity, - result, partial.get_const_data()); + result, + reinterpret_cast(tmp.get_const_data())); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -207,7 +212,7 @@ template exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, + ValueType* result, dim<2> size, Array& tmp, MappedKernelArgs... args) { constexpr int oversubscription = 4; @@ -221,18 +226,23 @@ void run_kernel_reduction_impl(std::shared_ptr exec, exec->get_num_computing_units() * oversubscription); auto queue = exec->get_queue(); if (num_workgroups > 1) { - Array partial{exec, static_cast(num_workgroups)}; + const auto required_storage = sizeof(ValueType) * num_workgroups; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_2d( cgh, rows, cols, num_workgroups, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), - args...); + [](auto v) { return v; }, identity, + reinterpret_cast(tmp.get_data()), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, identity, - result, partial.get_const_data()); + result, + reinterpret_cast(tmp.get_const_data())); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -249,10 +259,11 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_run_kernel_reduction, template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... args) { const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { @@ -264,17 +275,17 @@ void run_kernel_reduction(std::shared_ptr exec, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type size, - KernelArgs&&... 
args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + Array& tmp, KernelArgs&&... args) { const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { @@ -286,7 +297,7 @@ void run_kernel_reduction(std::shared_ptr exec, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } @@ -501,7 +512,8 @@ void run_generic_col_reduction_small(syn::value_list, int64 max_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, - dim<2> size, MappedKernelArgs... args) + dim<2> size, Array& tmp, + MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -518,17 +530,22 @@ void run_generic_col_reduction_small(syn::value_list, args...); }); } else { - Array tmp_storage{exec, - static_cast(row_blocks * cols)}; + const auto required_storage = sizeof(ValueType) * row_blocks * cols; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_small( cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, - identity, tmp_storage.get_data(), args...); + identity, reinterpret_cast(tmp.get_data()), + args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, cols, row_blocks, op, finalize, identity, - tmp_storage.get_const_data(), 1, result); + reinterpret_cast(tmp.get_const_data()), 1, + result); }); } } @@ -544,7 +561,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, - dim<2> size, MappedKernelArgs... args) + dim<2> size, Array& tmp, + MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -559,17 +577,20 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, auto queue = exec->get_queue(); if (rows * cols > resources && rows < cols) { const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; + const auto required_storage = sizeof(ValueType) * col_blocks * rows; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } generic_kernel_row_reduction_2d( syn::value_list{}, exec, rows, cols, col_blocks, fn, - op, [](auto v) { return v; }, identity, partial.get_data(), 1, - args...); + op, [](auto v) { return v; }, identity, + reinterpret_cast(tmp.get_data()), 1, args...); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, rows, col_blocks, op, finalize, identity, - partial.get_const_data(), static_cast(result_stride), - result); + reinterpret_cast(tmp.get_const_data()), + static_cast(result_stride), result); }); } else { select_generic_kernel_row_reduction_2d( @@ -595,7 +616,7 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, - MappedKernelArgs... 
args) + Array& tmp, MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -615,7 +636,7 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, compiled_ssg_size == sg_size; }, syn::value_list(), syn::type_list<>(), exec, max_blocks, - fn, op, finalize, identity, result, size, args...); + fn, op, finalize, identity, result, size, tmp, args...); } else { const auto col_blocks = ceildiv(cols, sg_size); const auto row_blocks = ceildiv( @@ -629,18 +650,22 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, result, args...); }); } else { - Array tmp_storage{ - exec, static_cast(row_blocks * cols)}; + const auto required_storage = sizeof(ValueType) * row_blocks * cols; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( cgh, rows, cols, row_blocks, col_blocks, fn, op, - [](auto v) { return v; }, identity, tmp_storage.get_data(), - args...); + [](auto v) { return v; }, identity, + reinterpret_cast(tmp.get_data()), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, cols, row_blocks, op, finalize, identity, - tmp_storage.get_const_data(), 1, result); + reinterpret_cast(tmp.get_const_data()), 1, + result); }); } } @@ -655,11 +680,12 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_col_reduction_stage1, template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) +void run_kernel_row_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, Array& tmp, + KernelArgs&&... args) { const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { @@ -671,18 +697,18 @@ void run_kernel_row_reduction(std::shared_ptr exec, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, result_stride, size, + finalize, identity, result, result_stride, size, tmp, map_to_device(args)...); } template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... args) +void run_kernel_col_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... 
args) { const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { @@ -694,7 +720,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 97c01e0fac5..39c2b0c54e4 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -189,47 +189,6 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(conj_transpose_call, conj_transpose, } // namespace kernel -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - // TODO Add onemkl for single column ? - compute_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - // TODO Add onemkl for single column ? - compute_conj_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - // TODO Add onemkl for single column ? - compute_norm2(exec, x, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index dc94bbb92ff..5e29c7aa612 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -1006,13 +1006,14 @@ void initialize_2(std::shared_ptr exec, const dim3 block_dim(default_block_size, 1, 1); constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); + Array tmp{exec}; initialize_2_1_kernel( grid_dim_1, block_dim, 0, exec->get_queue(), residual->get_size()[0], residual->get_size()[1], krylov_dim, krylov_bases, residual_norm_collection->get_values(), residual_norm_collection->get_stride()); - kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm); + kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm, tmp); if (use_scalar) { components::fill_array(exec, diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index 2fe0d63bcbf..5c68e1580d5 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -470,8 +470,9 @@ void initialize_2(std::shared_ptr exec, 1, 1); const dim3 block_dim(default_block_size, 1, 1); constexpr auto block_size = default_block_size; + Array tmp{exec}; - kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm); + kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm, tmp); const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1, 1); diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index a0f2d58806b..6cad2c705f4 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -51,15 +51,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
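// Note: this DPC++ test mirrors the CUDA kernel-launch test changes above;
// its fixture additionally guards part of its setup behind
// GINKGO_DPCPP_SINGLE_MODE (see below), since double-precision support is
// optional for SYCL devices.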
#include "core/test/utils.hpp" -namespace { - - using gko::dim; using gko::int64; using gko::size_type; using std::is_same; +struct move_only_type { + move_only_type() {} + + move_only_type(move_only_type&&) {} + + move_only_type(const move_only_type&) = delete; +}; + + +move_only_type move_only_val{}; + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +template <> +struct to_device_type_impl { + using type = int64; + + static type map_to_device(move_only_type&) { return 0; } +}; + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + class KernelLaunch : public ::testing::Test { protected: #if GINKGO_DPCPP_SINGLE_MODE @@ -111,12 +138,13 @@ TEST_F(KernelLaunch, Runs1D) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d) { + [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i] = i; }, - zero_array.get_num_elems(), zero_array.get_data()); + zero_array.get_num_elems(), zero_array.get_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_array); } @@ -126,17 +154,19 @@ TEST_F(KernelLaunch, Runs1DArray) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i] = i; } else { d[i] = 0; } }, - zero_array.get_num_elems(), zero_array, zero_array.get_const_data()); + zero_array.get_num_elems(), zero_array, zero_array.get_const_data(), + move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_array); } @@ -146,7 +176,7 @@ TEST_F(KernelLaunch, Runs1DDense) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); @@ -154,6 +184,7 @@ TEST_F(KernelLaunch, Runs1DDense) "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr; bool strides_correct = d.stride == 5 && d2.stride == 5; bool accessors_2d_correct = @@ -168,7 +199,7 @@ TEST_F(KernelLaunch, Runs1DDense) } }, 16, zero_dense2.get(), static_cast(zero_dense2.get()), - zero_dense2->get_const_values()); + zero_dense2->get_const_values(), move_only_val); GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } @@ -178,13 +209,14 @@ TEST_F(KernelLaunch, Runs2D) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d) { + [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i + 4 * j] = 4 * i + j; }, - dim<2>{4, 4}, zero_array.get_data()); + dim<2>{4, 4}, zero_array.get_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_transp_array); } @@ -194,18 +226,19 @@ TEST_F(KernelLaunch, Runs2DArray) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) 
{ d[i + 4 * j] = 4 * i + j; } else { d[i + 4 * j] = 0; } }, - dim<2>{4, 4}, zero_array, zero_array.get_const_data()); + dim<2>{4, 4}, zero_array, zero_array.get_const_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_transp_array); } @@ -216,7 +249,7 @@ TEST_F(KernelLaunch, Runs2DDense) gko::kernels::dpcpp::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, - auto d4, auto d2_ptr, auto d3_ptr) { + auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); @@ -229,6 +262,7 @@ TEST_F(KernelLaunch, Runs2DDense) "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr && d3.data == d2_ptr && d4 == d3_ptr; bool strides_correct = @@ -251,7 +285,7 @@ TEST_F(KernelLaunch, Runs2DDense) zero_dense2->get_const_values(), gko::kernels::dpcpp::default_stride(zero_dense.get()), gko::kernels::dpcpp::row_vector(vec_dense.get()), - zero_dense->get_values(), vec_dense->get_values()); + zero_dense->get_values(), vec_dense->get_values(), move_only_val); GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } @@ -263,9 +297,10 @@ TEST_F(KernelLaunch, Reduction1D) gko::kernels::dpcpp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { @@ -277,16 +312,17 @@ TEST_F(KernelLaunch, Reduction1D) static_assert(is_same::value, "j"); return j * 2; }, - int64{}, output.get_data(), size_type{100000}, output); + int64{}, output.get_data(), size_type{100000}, output, move_only_val); // 2 * sum i=0...99999 (i+1) EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::dpcpp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { @@ -298,7 +334,7 @@ TEST_F(KernelLaunch, Reduction1D) static_assert(is_same::value, "j"); return j * 2; }, - int64{}, output.get_data(), size_type{100}, output); + int64{}, output.get_data(), size_type{100}, output, move_only_val); // 2 * sum i=0...99 (i+1) EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); @@ -311,9 +347,11 @@ TEST_F(KernelLaunch, Reduction2D) gko::kernels::dpcpp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -325,16 +363,19 @@ TEST_F(KernelLaunch, Reduction2D) static_assert(is_same::value, "j"); return j * 4; }, - int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output, + move_only_val); // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); gko::kernels::dpcpp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + 
static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -346,7 +387,7 @@ TEST_F(KernelLaunch, Reduction2D) static_assert(is_same::value, "j"); return j * 4; }, - int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + int64{}, output.get_data(), gko::dim<2>{10, 10}, output, move_only_val); // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); @@ -372,9 +413,12 @@ TEST_F(KernelLaunch, ReductionRow2D) gko::kernels::dpcpp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, @@ -382,7 +426,7 @@ TEST_F(KernelLaunch, ReductionRow2D) output.get_data(), 2, gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -409,9 +453,12 @@ TEST_F(KernelLaunch, ReductionCol2D) gko::kernels::dpcpp::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, @@ -419,12 +466,9 @@ TEST_F(KernelLaunch, ReductionCol2D) output.get_data(), gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } } } - - -} // namespace diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index d883f415770..d193af9a4fb 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -76,76 +76,6 @@ constexpr int default_block_size = 512; #include "common/cuda_hip/matrix/dense_kernels.hpp.inc" -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_dot(exec, x, y, result); - } - } else { - compute_dot(exec, x, y, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_conj_dot(exec, x, y, result); - } - } else { - compute_conj_dot(exec, x, y, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const 
matrix::Dense* x, - matrix::Dense>* result) -{ - if (x->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - compute_norm2(exec, x, result); - } - } else { - compute_norm2(exec, x, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 1ab7dc0f554..09f206d355b 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -139,13 +139,14 @@ void initialize_2(std::shared_ptr exec, const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); + Array tmp{exec}; hipLaunchKernelGGL(initialize_2_1_kernel, grid_dim_1, block_dim, 0, 0, residual->get_size()[0], residual->get_size()[1], krylov_dim, acc::as_hip_range(krylov_bases), as_hip_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); - kernels::hip::dense::compute_norm2(exec, residual, residual_norm); + kernels::hip::dense::compute_norm2(exec, residual, residual_norm, tmp); if (use_scalar) { components::fill_array(exec, diff --git a/hip/solver/gmres_kernels.hip.cpp b/hip/solver/gmres_kernels.hip.cpp index 5945385e732..6c4c405f0de 100644 --- a/hip/solver/gmres_kernels.hip.cpp +++ b/hip/solver/gmres_kernels.hip.cpp @@ -121,8 +121,9 @@ void initialize_2(std::shared_ptr exec, default_block_size); const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; + Array tmp{exec}; - kernels::hip::dense::compute_norm2(exec, residual, residual_norm); + kernels::hip::dense::compute_norm2(exec, residual, residual_norm, tmp); const auto grid_dim_2 = ceildiv(num_rows * num_rhs, default_block_size); hipLaunchKernelGGL( diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 1ebf38cd1ba..e757cb6ef18 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -51,15 +51,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
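// The tests in this file extend every device lambda with a trailing
// `dummy` parameter, matched by passing `move_only_val` as the last
// argument of each kernel launch. Every extra argument handed to
// run_kernel goes through map_to_device, which delegates to a
// to_device_type_impl specialization (specialized for move_only_type
// just below), so the fact that a move-only type compiles at all shows
// that the launch machinery forwards arguments by reference instead of
// copying them. A minimal caller-side sketch, assuming only the APIs
// exercised in this file (`exec`, `size` and `device_ptr` are
// illustrative placeholders, not part of the patch):
//
//     gko::kernels::hip::run_kernel(
//         exec,
//         [] GKO_KERNEL(auto i, auto d, auto dummy) {
//             // dummy arrives as the int64 returned by the
//             // to_device_type_impl specialization's map_to_device
//             d[i] = i;
//         },
//         size, device_ptr, move_only_val);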
#include "core/test/utils.hpp" -namespace { - - using gko::dim; using gko::int64; using gko::size_type; using std::is_same; +struct move_only_type { + move_only_type() {} + + move_only_type(move_only_type&&) {} + + move_only_type(const move_only_type&) = delete; +}; + + +move_only_type move_only_val{}; + + +namespace gko { +namespace kernels { +namespace hip { + + +template <> +struct to_device_type_impl { + using type = int64; + + static type map_to_device(move_only_type&) { return 0; } +}; + + +} // namespace hip +} // namespace kernels +} // namespace gko + + class KernelLaunch : public ::testing::Test { protected: KernelLaunch() @@ -104,12 +131,13 @@ void run1d(std::shared_ptr exec, size_type dim, int* data) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d) { + [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i] = i; }, - dim, data); + dim, data, move_only_val); } TEST_F(KernelLaunch, Runs1D) @@ -124,17 +152,18 @@ void run1d(std::shared_ptr exec, gko::Array& data) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i] = i; } else { d[i] = 0; } }, - data.get_num_elems(), data, data.get_const_data()); + data.get_num_elems(), data, data.get_const_data(), move_only_val); } TEST_F(KernelLaunch, Runs1DArray) @@ -149,13 +178,14 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr; bool strides_correct = d.stride == 5 && d2.stride == 5; bool accessors_2d_correct = @@ -170,7 +200,7 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) } }, 16, m, static_cast*>(m), - m->get_const_values()); + m->get_const_values(), move_only_val); } TEST_F(KernelLaunch, Runs1DDense) @@ -185,13 +215,14 @@ void run2d(std::shared_ptr exec, int* data) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d) { + [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i + 4 * j] = 4 * i + j; }, - dim<2>{4, 4}, data); + dim<2>{4, 4}, data, move_only_val); } TEST_F(KernelLaunch, Runs2D) @@ -206,18 +237,19 @@ void run2d(std::shared_ptr exec, gko::Array& data) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i + 4 * j] = 4 * i + j; } else { d[i + 4 * j] = 0; } }, - dim<2>{4, 4}, data, data.get_const_data()); + dim<2>{4, 4}, data, data.get_const_data(), move_only_val); } TEST_F(KernelLaunch, 
Runs2DArray) @@ -234,7 +266,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, gko::kernels::hip::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, - auto d4, auto d2_ptr, auto d3_ptr) { + auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -245,6 +277,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr && d3.data == d2_ptr && d4 == d3_ptr; bool strides_correct = @@ -265,7 +298,8 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, dim<2>{4, 4}, m2->get_stride(), m1, static_cast*>(m1), m1->get_const_values(), gko::kernels::hip::default_stride(m2), - gko::kernels::hip::row_vector(m3), m2->get_values(), m3->get_values()); + gko::kernels::hip::row_vector(m3), m2->get_values(), m3->get_values(), + move_only_val); } TEST_F(KernelLaunch, Runs2DDense) @@ -282,28 +316,37 @@ void run1d_reduction(std::shared_ptr exec) gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}, output); + size_type{100000}, output, move_only_val); // 2 * sum i=0...99999 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output, move_only_val); // 2 * sum i=0...99 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); @@ -318,28 +361,47 @@ void run2d_reduction(std::shared_ptr exec) gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output, + move_only_val); // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 
10110100000LL); gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, 10}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output, move_only_val); // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); @@ -350,32 +412,48 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_row_reduction(std::shared_ptr exec) { - int num_rows = 1000; - int num_cols = 100; - gko::Array host_ref{exec->get_master(), - static_cast(2 * num_rows)}; - std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); - gko::Array output{exec, host_ref}; - for (int i = 0; i < num_rows; i++) { - // we are computing 2 * sum {j=0, j::value, "index"); - static_assert(is_same::value, "value"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, - gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}, - output); + for (auto num_rows : {0, 100, 1000, 10000}) { + for (auto num_cols : {0, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int64 i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j(num_cols) * (num_cols + 1) * (i + 1); + } - GKO_ASSERT_ARRAY_EQ(host_ref, output); + gko::kernels::hip::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output, move_only_val); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } } TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } @@ -383,34 +461,50 @@ TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_col_reduction(std::shared_ptr exec) { - int num_rows = 1000; - int num_cols = 100; - gko::Array host_ref{exec->get_master(), - static_cast(num_cols)}; - gko::Array output{exec, static_cast(num_cols)}; - for (int i = 0; i < num_cols; i++) { - // we are computing 2 * sum {j=0, j::value, "index"); - static_assert(is_same::value, "value"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - gko::dim<2>{static_cast(num_rows), - 
static_cast(num_cols)}, - output); + // empty, most threads idle, most threads busy, multiple blocks + for (auto num_rows : {0, 10, 100, 1000, 10000}) { + // check different edge cases: subwarp sizes, blocked mode + for (auto num_cols : + {0, 1, 2, 3, 4, 5, 7, 8, 9, 16, 31, 32, 63, 127, 128, 129}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int64 i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j<num_rows} of (i+1)*(j+1), so + // host_ref[i] = num_rows * (num_rows + 1) * (i + 1) + host_ref.get_data()[i] = + static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1); + } - GKO_ASSERT_ARRAY_EQ(host_ref, output); + gko::kernels::hip::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output, move_only_val); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } - - -} // namespace diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 2b3d17fee13..210c67941a8 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -693,11 +693,7 @@ class Dense * element of alpha (the number of columns of alpha has to * match the number of columns of the matrix). */ - void scale(const LinOp* alpha) - { - auto exec = this->get_executor(); - this->scale_impl(make_temporary_clone(exec, alpha).get()); - } + void scale(const LinOp* alpha); /** * Scales the matrix with the inverse of a scalar. @@ -708,11 +704,7 @@ class Dense * of the i-th element of alpha (the number of columns of * alpha has to match the number of columns of the matrix). */ - void inv_scale(const LinOp* alpha) - { - auto exec = this->get_executor(); - this->inv_scale_impl(make_temporary_clone(exec, alpha).get()); - } + void inv_scale(const LinOp* alpha); /** * Adds `b` scaled by `alpha` to the matrix (aka: BLAS axpy). @@ -724,12 +716,7 @@ class Dense * match the number of columns of the matrix). * @param b a matrix of the same dimension as this */ - void add_scaled(const LinOp* alpha, const LinOp* b) - { - auto exec = this->get_executor(); - this->add_scaled_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get()); - } + void add_scaled(const LinOp* alpha, const LinOp* b); /** * Subtracts `b` scaled by `alpha` from the matrix (aka: BLAS axpy). @@ -741,12 +728,7 @@ class Dense * match the number of columns of the matrix). * @param b a matrix of the same dimension as this */ - void sub_scaled(const LinOp* alpha, const LinOp* b) - { - auto exec = this->get_executor(); - this->sub_scaled_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get()); - } + void sub_scaled(const LinOp* alpha, const LinOp* b); /** * Computes the column-wise dot product of this matrix and `b`.
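// The hunks below add a second overload of each Dense reduction entry
// point (compute_dot, compute_conj_dot, compute_norm2, compute_norm1)
// that takes a scratch buffer, so callers in tight loops reuse a single
// allocation instead of allocating per call. A hedged usage sketch,
// assuming the buffer is the Array<char> byte array used by the cached
// kernels (sizes and value type are illustrative only):
//
//     auto x = gko::matrix::Dense<double>::create(exec, gko::dim<2>{n, 1});
//     auto y = gko::matrix::Dense<double>::create(exec, gko::dim<2>{n, 1});
//     auto dot = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1, 1});
//     gko::Array<char> tmp{exec};  // grown on first use, then reused
//     for (int it = 0; it < num_iterations; it++) {
//         x->compute_dot(y.get(), dot.get(), tmp);
//     }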
@@ -756,12 +738,17 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_dot(const LinOp* b, LinOp* result) const - { - auto exec = this->get_executor(); - this->compute_dot_impl(make_temporary_clone(exec, b).get(), - make_temporary_output_clone(exec, result).get()); - } + void compute_dot(const LinOp* b, LinOp* result) const; + + /** + * Computes the column-wise dot product of this matrix and `b`. + * + * @param b a Dense matrix of same dimension as this + * @param result a Dense row vector, used to store the dot product + * (the number of columns in the vector must match the number + * of columns of this) + */ + void compute_dot(const LinOp* b, LinOp* result, Array& tmp) const; /** * Computes the column-wise dot product of `conj(this matrix)` and `b`. @@ -771,13 +758,18 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_conj_dot(const LinOp* b, LinOp* result) const - { - auto exec = this->get_executor(); - this->compute_conj_dot_impl( - make_temporary_clone(exec, b).get(), - make_temporary_output_clone(exec, result).get()); - } + void compute_conj_dot(const LinOp* b, LinOp* result) const; + + /** + * Computes the column-wise dot product of `conj(this matrix)` and `b`. + * + * @param b a Dense matrix of same dimension as this + * @param result a Dense row vector, used to store the dot product + * (the number of columns in the vector must match the number + * of columns of this) + */ + void compute_conj_dot(const LinOp* b, LinOp* result, + Array& tmp) const; /** * Computes the column-wise Euclidean (L^2) norm of this matrix. @@ -786,12 +778,16 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_norm2(LinOp* result) const - { - auto exec = this->get_executor(); - this->compute_norm2_impl( - make_temporary_output_clone(exec, result).get()); - } + void compute_norm2(LinOp* result) const; + + /** + * Computes the column-wise Euclidean (L^2) norm of this matrix. + * + * @param result a Dense row vector, used to store the norm + * (the number of columns in the vector must match the + * number of columns of this) + */ + void compute_norm2(LinOp* result, Array& tmp) const; /** * Computes the column-wise (L^1) norm of this matrix. @@ -800,12 +796,16 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_norm1(LinOp* result) const - { - auto exec = this->get_executor(); - this->compute_norm1_impl( - make_temporary_output_clone(exec, result).get()); - } + void compute_norm1(LinOp* result) const; + + /** + * Computes the column-wise (L^1) norm of this matrix. + * + * @param result a Dense row vector, used to store the norm + * (the number of columns in the vector must match the + * number of columns of this) + */ + void compute_norm1(LinOp* result, Array& tmp) const; /** * Create a submatrix from the original matrix. @@ -1012,64 +1012,64 @@ class Dense /** * @copydoc scale(const LinOp *) * - * @note Other implementations of dense should override this function - * instead of scale(const LinOp *alpha). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void scale_impl(const LinOp* alpha); /** * @copydoc inv_scale(const LinOp *) * - * @note Other implementations of dense should override this function - * instead of inv_scale(const LinOp *alpha).
+ * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void inv_scale_impl(const LinOp* alpha); /** * @copydoc add_scaled(const LinOp *, const LinOp *) * - * @note Other implementations of dense should override this function - * instead of add_scale(const LinOp *alpha, const LinOp *b). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void add_scaled_impl(const LinOp* alpha, const LinOp* b); /** * @copydoc sub_scaled(const LinOp *, const LinOp *) * - * @note Other implementations of dense should override this function - * instead of sub_scale(const LinOp *alpha, const LinOp *b). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void sub_scaled_impl(const LinOp* alpha, const LinOp* b); /** - * @copydoc compute_dot(const LinOp *, LinOp *) const + * @copydoc compute_dot(const LinOp*, LinOp*) const * - * @note Other implementations of dense should override this function - * instead of compute_dot(const LinOp *b, LinOp *result). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void compute_dot_impl(const LinOp* b, LinOp* result) const; /** - * @copydoc compute_conj_dot(const LinOp *, LinOp *) const + * @copydoc compute_conj_dot(const LinOp*, LinOp*) const * - * @note Other implementations of dense should override this function - * instead of compute_conj_dot(const LinOp *b, LinOp *result). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void compute_conj_dot_impl(const LinOp* b, LinOp* result) const; /** - * @copydoc compute_norm2(LinOp *) const + * @copydoc compute_norm2(LinOp*) const * - * @note Other implementations of dense should override this function - * instead of compute_norm2(LinOp *result). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void compute_norm2_impl(LinOp* result) const; /** - * @copydoc compute_norm1(LinOp *) const + * @copydoc compute_norm1(LinOp*) const * - * @note Other implementations of dense should override this function - * instead of compute_norm1(LinOp *result). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void compute_norm1_impl(LinOp* result) const; diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index d4a489a258f..55046cae832 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -60,12 +60,17 @@ void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, - MappedKernelArgs... args) + Array& tmp, MappedKernelArgs... 
args) { const auto num_threads = static_cast(omp_get_max_threads()); const auto ssize = static_cast(size); const auto work_per_thread = ceildiv(ssize, num_threads); - Array partial{exec, static_cast(num_threads)}; + const auto required_storage = sizeof(ValueType) * num_threads; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } + const auto partial = reinterpret_cast(tmp.get_data()); #pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); @@ -76,11 +81,10 @@ void run_kernel_reduction_impl(std::shared_ptr exec, for (auto i = begin; i < end; i++) { local_partial = op(local_partial, fn(i, map_to_device(args)...)); } - partial.get_data()[thread_id] = local_partial; + partial[thread_id] = local_partial; } - *result = finalize(std::accumulate(partial.get_const_data(), - partial.get_const_data() + num_threads, - identity, op)); + *result = + finalize(std::accumulate(partial, partial + num_threads, identity, op)); } @@ -92,13 +96,18 @@ void run_kernel_reduction_sized_impl(syn::value_list, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, - MappedKernelArgs... args) + Array& tmp, MappedKernelArgs... args) { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto num_threads = static_cast(omp_get_max_threads()); const auto work_per_thread = ceildiv(rows, num_threads); - Array partial{exec, static_cast(num_threads)}; + const auto required_storage = sizeof(ValueType) * num_threads; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } + const auto partial = reinterpret_cast(tmp.get_data()); static_assert(remainder_cols < block_size, "remainder too large"); const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + remainder_cols == cols); @@ -138,11 +147,10 @@ void run_kernel_reduction_sized_impl(syn::value_list, } } } - partial.get_data()[thread_id] = local_partial; + partial[thread_id] = local_partial; } - *result = finalize(std::accumulate(partial.get_const_data(), - partial.get_const_data() + num_threads, - identity, op)); + *result = + finalize(std::accumulate(partial, partial + num_threads, identity, op)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -154,23 +162,24 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type size, - KernelArgs&&... args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + Array& tmp, KernelArgs&&... args) { run_kernel_reduction_impl(exec, fn, op, finalize, identity, result, size, - map_to_device(args)...); + tmp, map_to_device(args)...); } template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... 
args) { const auto cols = static_cast(size[1]); constexpr int block_size = 8; @@ -184,7 +193,7 @@ void run_kernel_reduction(std::shared_ptr exec, remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } @@ -197,7 +206,8 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, - dim<2> size, MappedKernelArgs... args) + dim<2> size, Array& tmp, + MappedKernelArgs... args) { constexpr int block_size = 8; const auto rows = static_cast(size[0]); @@ -222,8 +232,12 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, } else { // small number of rows and large reduction sizes: do partial sum first const auto work_per_thread = ceildiv(cols, num_threads); - Array partial{exec, - static_cast(rows * num_threads)}; + const auto required_storage = sizeof(ValueType) * rows * num_threads; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } + const auto partial = reinterpret_cast(tmp.get_data()); #pragma omp parallel num_threads(num_threads) { const auto thread_id = static_cast(omp_get_thread_num()); @@ -236,8 +250,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, return fn(row, col, args...); }()); } - partial.get_data()[row * num_threads + thread_id] = - local_partial; + partial[row * num_threads + thread_id] = local_partial; } } // then accumulate the partial sums and write to result @@ -247,10 +260,8 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, auto local_partial = identity; for (int64 thread_id = 0; thread_id < num_threads; thread_id++) { - local_partial = op( - local_partial, - partial - .get_const_data()[row * num_threads + thread_id]); + local_partial = op(local_partial, + partial[row * num_threads + thread_id]); } result[row * result_stride] = finalize(local_partial); }(); @@ -290,7 +301,7 @@ void run_kernel_col_reduction_sized_impl( syn::value_list, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, - MappedKernelArgs... args) + Array& tmp, MappedKernelArgs... 
args) { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -319,8 +330,12 @@ void run_kernel_col_reduction_sized_impl( const auto reduction_size = ceildiv(reduction_kernel_oversubscription * num_threads, cols); const auto rows_per_thread = ceildiv(rows, reduction_size); - Array partial{exec, - static_cast(reduction_size * cols)}; + const auto required_storage = sizeof(ValueType) * rows * reduction_size; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } + const auto partial = reinterpret_cast(tmp.get_data()); #pragma omp parallel for for (int64 i = 0; i < reduction_size * num_col_blocks; i++) { const auto col_block = i % num_col_blocks; @@ -331,14 +346,12 @@ void run_kernel_col_reduction_sized_impl( const auto identity_fn = [](auto i) { return i; }; if (base_col + block_size <= cols) { run_kernel_col_reduction_sized_block_impl( - fn, op, identity_fn, identity, - partial.get_data() + cols * row_block, begin, end, base_col, - args...); + fn, op, identity_fn, identity, partial + cols * row_block, + begin, end, base_col, args...); } else { run_kernel_col_reduction_sized_block_impl( - fn, op, identity_fn, identity, - partial.get_data() + cols * row_block, begin, end, base_col, - args...); + fn, op, identity_fn, identity, partial + cols * row_block, + begin, end, base_col, args...); } } #pragma omp parallel for @@ -347,9 +360,7 @@ void run_kernel_col_reduction_sized_impl( auto total = identity; for (int64 row_block = 0; row_block < reduction_size; row_block++) { - total = - op(total, - partial.get_const_data()[col + cols * row_block]); + total = op(total, partial[col + cols * row_block]); } result[col] = finalize(total); }(); @@ -365,25 +376,27 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized, template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, MappedKernelArgs... args) + typename FinalizeOp, typename... KernelArgs> +void run_kernel_row_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, Array& tmp, + KernelArgs&&... args) { run_kernel_row_reduction_impl(exec, fn, op, finalize, identity, result, - result_stride, size, map_to_device(args)...); + result_stride, size, tmp, + map_to_device(args)...); } template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... args) +void run_kernel_col_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... 
args) { constexpr auto block_size = 8; using remainders = syn::as_list>; @@ -396,7 +409,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 6a7e3fa7b01..dd3a290b342 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -68,45 +68,6 @@ namespace omp { namespace dense { -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - // OpenMP uses the unified kernel. - compute_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - compute_conj_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - compute_norm2(exec, x, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp index fbc14a1ec79..5666e5cfd3f 100644 --- a/omp/test/base/kernel_launch.cpp +++ b/omp/test/base/kernel_launch.cpp @@ -51,15 +51,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
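// The *_cached reduction kernels in omp/base/kernel_launch_reduction.hpp
// above all share one temporary-storage idiom: a caller-owned byte buffer
// is grown on demand and reinterpreted as the reduction's value type, so
// repeated reductions allocate at most once. A minimal sketch of that
// pattern, assuming only the Array<char> calls that appear in the hunks
// above (prepare_partial_storage is a hypothetical helper named here for
// illustration; the kernels inline the same steps):
//
//     template <typename ValueType>
//     ValueType* prepare_partial_storage(
//         std::shared_ptr<const gko::Executor> exec, gko::Array<char>& tmp,
//         gko::size_type num_entries)
//     {
//         const auto required_storage = sizeof(ValueType) * num_entries;
//         tmp.set_executor(exec);
//         if (tmp.get_num_elems() < required_storage) {
//             // resize_and_reset discards previous contents, which is
//             // fine for scratch space that is written before being read
//             tmp.resize_and_reset(required_storage);
//         }
//         return reinterpret_cast<ValueType*>(tmp.get_data());
//     }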
#include "core/test/utils.hpp" -namespace { - - using gko::dim; using gko::int64; using gko::size_type; using std::is_same; +struct move_only_type { + move_only_type() {} + + move_only_type(move_only_type&&) {} + + move_only_type(const move_only_type&) = delete; +}; + + +move_only_type move_only_val{}; + + +namespace gko { +namespace kernels { +namespace omp { + + +template <> +struct to_device_type_impl { + using type = int64; + + static type map_to_device(move_only_type&) { return 0; } +}; + + +} // namespace omp +} // namespace kernels +} // namespace gko + + class KernelLaunch : public ::testing::Test { protected: KernelLaunch() @@ -97,12 +124,13 @@ TEST_F(KernelLaunch, Runs1D) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d) { + [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i] = i; }, - zero_array.get_num_elems(), zero_array.get_data()); + zero_array.get_num_elems(), zero_array.get_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_array); } @@ -112,17 +140,19 @@ TEST_F(KernelLaunch, Runs1DArray) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i] = i; } else { d[i] = 0; } }, - zero_array.get_num_elems(), zero_array, zero_array.get_const_data()); + zero_array.get_num_elems(), zero_array, zero_array.get_const_data(), + move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_array); } @@ -132,13 +162,14 @@ TEST_F(KernelLaunch, Runs1DDense) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr; bool strides_correct = d.stride == 5 && d2.stride == 5; bool accessors_2d_correct = @@ -154,7 +185,7 @@ TEST_F(KernelLaunch, Runs1DDense) }, 16, zero_dense2.get(), static_cast*>(zero_dense2.get()), - zero_dense2->get_const_values()); + zero_dense2->get_const_values(), move_only_val); GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } @@ -164,13 +195,14 @@ TEST_F(KernelLaunch, Runs2D) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d) { + [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i + 4 * j] = 4 * i + j; }, - dim<2>{4, 4}, zero_array.get_data()); + dim<2>{4, 4}, zero_array.get_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_transp_array); } @@ -180,18 +212,19 @@ TEST_F(KernelLaunch, Runs2DArray) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i + 4 * j] = 4 * i + j; } else { d[i + 4 * j] 
= 0; } }, - dim<2>{4, 4}, zero_array, zero_array.get_const_data()); + dim<2>{4, 4}, zero_array, zero_array.get_const_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_transp_array); } @@ -202,7 +235,7 @@ TEST_F(KernelLaunch, Runs2DDense) gko::kernels::omp::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, - auto d4, auto d2_ptr, auto d3_ptr) { + auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -213,6 +246,7 @@ TEST_F(KernelLaunch, Runs2DDense) static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr && d3.data == d2_ptr && d4 == d3_ptr; bool strides_correct = @@ -235,7 +269,7 @@ TEST_F(KernelLaunch, Runs2DDense) zero_dense2->get_const_values(), gko::kernels::omp::default_stride(zero_dense.get()), gko::kernels::omp::row_vector(vec_dense.get()), - zero_dense->get_values(), vec_dense->get_values()); + zero_dense->get_values(), vec_dense->get_values(), move_only_val); GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } @@ -246,28 +280,30 @@ TEST_F(KernelLaunch, Reduction1D) gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}, output); + size_type{100000}, output, move_only_val); // 2 * sum i=0...99999 (i+1) ASSERT_EQ(*output.get_const_data(), 10000100000LL); gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{10}, output); + size_type{10}, output, move_only_val); // 2 * sum i=0...9 (i+1) ASSERT_EQ(*output.get_const_data(), 110LL); @@ -281,14 +317,16 @@ TEST_F(KernelLaunch, Reduction2DSmallRows) for (int cols = 0; cols < 17; cols++) { gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, cols}, output); + gko::dim<2>{10, cols}, output, move_only_val); // 4 * sum i=0...9 sum j=0...cols-1 of (i+1)*(j+1) ASSERT_EQ(*output.get_const_data(), 110LL * cols * (cols + 1)); @@ -303,14 +341,16 @@ TEST_F(KernelLaunch, Reduction2DLargeRows) for (int cols = 0; cols < 17; cols++) { gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 
1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, cols}, output); + gko::dim<2>{1000, cols}, output, move_only_val); // 4 * sum i=0...999 sum j=0...cols-1 of (i+1)*(j+1) ASSERT_EQ(*output.get_const_data(), 1001000LL * cols * (cols + 1)); @@ -324,14 +364,15 @@ TEST_F(KernelLaunch, Reduction2D) gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}, output); + gko::dim<2>{1000, 100}, output, move_only_val); // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) @@ -358,16 +399,18 @@ TEST_F(KernelLaunch, ReductionRow2DSmall) gko::kernels::omp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -390,16 +433,17 @@ TEST_F(KernelLaunch, ReductionRow2D) gko::kernels::omp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -422,9 +466,12 @@ TEST_F(KernelLaunch, ReductionCol2D) gko::kernels::omp::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, @@ -432,12 +479,9 @@ TEST_F(KernelLaunch, ReductionCol2D) output.get_data(), gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } } } - - -} // namespace diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 1bf28847374..648bc8a6e7e 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -282,7 +282,7 @@ template void compute_dot(std::shared_ptr exec, const matrix::Dense* x, const matrix::Dense* y, - matrix::Dense* result) + matrix::Dense* result, Array&) { for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero(); @@ -297,24 +297,11 @@ void compute_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); -template -void 
compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - compute_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense* x, const matrix::Dense* y, - matrix::Dense* result) + matrix::Dense* result, Array&) { for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero(); @@ -329,23 +316,11 @@ void compute_conj_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - compute_conj_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - template void compute_norm2(std::shared_ptr exec, const matrix::Dense* x, - matrix::Dense>* result) + matrix::Dense>* result, + Array&) { for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero>(); @@ -363,22 +338,11 @@ void compute_norm2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - compute_norm2(exec, x, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void compute_norm1(std::shared_ptr exec, const matrix::Dense* x, - matrix::Dense>* result) + matrix::Dense>* result, + Array&) { for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero>(); diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index 5a118feb36a..b843f3ec6a0 100644 --- a/test/matrix/dense_kernels.cpp +++ b/test/matrix/dense_kernels.cpp @@ -998,18 +998,135 @@ TEST_F(Dense, ExtractDiagonalOnShortFatIntoDenseCrossExecutor) } +TEST_F(Dense, ComputeDotIsEquivalentToRef) +{ + set_up_vector_data(2); + + auto dot_size = gko::dim<2>{1, x->get_size()[1]}; + auto dot_expected = Mtx::create(ref, dot_size); + auto ddot = Mtx::create(ref, dot_size); + + // all parameters are on ref to check cross-executor calls + x->compute_dot(y.get(), dot_expected.get()); + dx->compute_dot(y.get(), ddot.get()); + + GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r::value); +} + + +TEST_F(Dense, ComputeDotWithTmpIsEquivalentToRef) +{ + set_up_vector_data(40); + + auto dot_size = gko::dim<2>{1, x->get_size()[1]}; + auto dot_expected = Mtx::create(ref, dot_size); + auto ddot = Mtx::create(ref, dot_size); + gko::Array tmp{ref}; + + // all parameters are on ref to check cross-executor calls + x->compute_dot(y.get(), dot_expected.get(), tmp); + dx->compute_dot(y.get(), ddot.get(), tmp); + + GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r::value); +} + + +TEST_F(Dense, ComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(13); + + auto dot_size = gko::dim<2>{1, x->get_size()[1]}; + auto dot_expected = Mtx::create(ref, dot_size); + auto ddot = Mtx::create(ref, dot_size); + + // all parameters are on ref to check cross-executor calls + x->compute_conj_dot(y.get(), dot_expected.get()); + dx->compute_conj_dot(y.get(), ddot.get()); + + GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r::value); +} + + +TEST_F(Dense, ComputeConjDotWithTmpIsEquivalentToRef) +{ + set_up_vector_data(65); + + auto dot_size = gko::dim<2>{1, x->get_size()[1]}; + auto dot_expected = Mtx::create(ref, 
dot_size); + auto ddot = Mtx::create(ref, dot_size); + gko::Array tmp{ref}; + + // all parameters are on ref to check cross-executor calls + x->compute_conj_dot(y.get(), dot_expected.get(), tmp); + dx->compute_conj_dot(y.get(), ddot.get(), tmp); + + GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r::value); +} + + TEST_F(Dense, ComputeNorm1IsEquivalentToRef) { - set_up_apply_data(); + set_up_vector_data(2); auto norm_size = gko::dim<2>{1, x->get_size()[1]}; auto norm_expected = NormVector::create(ref, norm_size); auto dnorm = NormVector::create(ref, norm_size); + // all parameters are on ref to check cross-executor calls x->compute_norm1(norm_expected.get()); dx->compute_norm1(dnorm.get()); - GKO_ASSERT_MTX_NEAR(x, dx, r::value); + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); +} + + +TEST_F(Dense, ComputeNorm1WithTmpIsEquivalentToRef) +{ + set_up_vector_data(10); + + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(ref, norm_size); + auto dnorm = NormVector::create(ref, norm_size); + gko::Array tmp{ref}; + + // all parameters are on ref to check cross-executor calls + x->compute_norm1(norm_expected.get(), tmp); + dx->compute_norm1(dnorm.get(), tmp); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); +} + + +TEST_F(Dense, ComputeNorm2IsEquivalentToRef) +{ + set_up_vector_data(5); + + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(ref, norm_size); + auto dnorm = NormVector::create(ref, norm_size); + + // all parameters are on ref to check cross-executor calls + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); +} + + +TEST_F(Dense, ComputeNorm2WithTmpIsEquivalentToRef) +{ + set_up_vector_data(3); + + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(ref, norm_size); + auto dnorm = NormVector::create(ref, norm_size); + gko::Array tmp{ref}; + + // all parameters are on ref to check cross-executor calls + x->compute_norm2(norm_expected.get(), tmp); + dx->compute_norm2(dnorm.get(), tmp); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); } From ee00f4968a39a9383de4b0328c115a1b97c813d8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 29 Mar 2022 19:31:07 +0200 Subject: [PATCH 3/5] restore reduction dispatch kernels for vendor BLAS --- core/device_hooks/common_kernels.inc.cpp | 3 + core/matrix/dense_kernels.hpp | 24 ++++++++ cuda/matrix/dense_kernels.cu | 72 ++++++++++++++++++++++++ dpcpp/matrix/dense_kernels.dp.cpp | 43 ++++++++++++++ hip/matrix/dense_kernels.hip.cpp | 72 ++++++++++++++++++++++++ omp/matrix/dense_kernels.cpp | 41 ++++++++++++++ reference/matrix/dense_kernels.cpp | 40 +++++++++++++ 7 files changed, 295 insertions(+) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 13dae15583f..a89d1f3c140 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -256,8 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 13dae15583f..a89d1f3c140 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -256,8 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 1ef142eb4d7..3cf15052826 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -104,12 +104,24 @@ namespace kernels {
                            const matrix::Diagonal<_type>* x,               \
                            matrix::Dense<_type>* y)
 
+#define GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(_type)               \
+    void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
+                              const matrix::Dense<_type>* x,               \
+                              const matrix::Dense<_type>* y,               \
+                              matrix::Dense<_type>* result, Array<char>& tmp)
+
 #define GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(_type)                        \
     void compute_dot(std::shared_ptr<const DefaultExecutor> exec,          \
                      const matrix::Dense<_type>* x,                        \
                      const matrix::Dense<_type>* y,                        \
                      matrix::Dense<_type>* result, Array<char>& tmp)
 
+#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(_type)          \
+    void compute_conj_dot_dispatch(                                        \
+        std::shared_ptr<const DefaultExecutor> exec,                       \
+        const matrix::Dense<_type>* x, const matrix::Dense<_type>* y,      \
+        matrix::Dense<_type>* result, Array<char>& tmp)
+
 #define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type)                   \
     void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,     \
                           const matrix::Dense<_type>* x,                   \
@@ -122,6 +134,12 @@ namespace kernels {
                        matrix::Dense<remove_complex<_type>>* result,       \
                        Array<char>& tmp)
 
+#define GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(_type)             \
+    void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
+                                const matrix::Dense<_type>* x,             \
+                                matrix::Dense<remove_complex<_type>>* result, \
+                                Array<char>& tmp)
+
 #define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type)                      \
     void compute_norm1(std::shared_ptr<const DefaultExecutor> exec,        \
                        const matrix::Dense<_type>* x,                      \
@@ -310,10 +328,16 @@ namespace kernels {
     template <typename ValueType>                                          \
     GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType);                       \
     template <typename ValueType>                                          \
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType);              \
+    template <typename ValueType>                                          \
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType);                  \
     template <typename ValueType>                                          \
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType);         \
+    template <typename ValueType>                                          \
     GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType);                     \
     template <typename ValueType>                                          \
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType);            \
+    template <typename ValueType>                                          \
     GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType);                     \
     template <typename ValueType, typename IndexType>                      \
     GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType);    \
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 063b0f069d3..7eeebd82357 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -73,6 +73,78 @@ constexpr int default_block_size = 512;
 #include "common/cuda_hip/matrix/dense_kernels.hpp.inc"
 
 
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
+            cublas::dot(handle, x->get_size()[0], x->get_const_values(),
+                        x->get_stride(), y->get_const_values(),
+                        y->get_stride(), result->get_values());
+        } else {
+            compute_dot(exec, x, y, result, tmp);
+        }
+    } else {
+        compute_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result,
+                               Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
+            cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                             x->get_stride(), y->get_const_values(),
+                             y->get_stride(), result->get_values());
+        } else {
+            compute_conj_dot(exec, x, y, result, tmp);
+        }
+    } else {
+        compute_conj_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result,
+                            Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
+            cublas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                          x->get_stride(), result->get_values());
+        } else {
+            compute_norm2(exec, x, result, tmp);
+        }
+    } else {
+        compute_norm2(exec, x, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 39c2b0c54e4..0503b490e47 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -189,6 +189,49 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(conj_transpose_call, conj_transpose,
 }  // namespace kernel
 
 
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, Array<char>& tmp)
+{
+    // TODO Add onemkl for single column?
+    compute_dot(exec, x, y, result, tmp);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result,
+                               Array<char>& tmp)
+{
+    // TODO Add onemkl for single column?
+    compute_conj_dot(exec, x, y, result, tmp);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
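The `cublas::is_supported<ValueType>::value` guard in the CUDA (and, below, HIP) dispatch kernels is a compile-time whitelist: only value types with a matching vendor BLAS routine take the fast path, and everything else falls through to the unified reduction kernel. In spirit it looks like the following (illustrative stand-in, not the actual wrapper header):

    #include <complex>
    #include <type_traits>

    // Illustrative stand-in for the vendor BLAS support trait: false by
    // default, specialized to true for every type the vendor library covers.
    template <typename T>
    struct is_supported : std::false_type {};

    template <>
    struct is_supported<float> : std::true_type {};

    template <>
    struct is_supported<double> : std::true_type {};

    template <>
    struct is_supported<std::complex<float>> : std::true_type {};

    template <>
    struct is_supported<std::complex<double>> : std::true_type {};
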
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result,
+                            Array<char>& tmp)
+{
+    // TODO Add onemkl for single column?
+    compute_norm2(exec, x, result, tmp);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index d193af9a4fb..0f1d2c455da 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -76,6 +76,78 @@ constexpr int default_block_size = 512;
 #include "common/cuda_hip/matrix/dense_kernels.hpp.inc"
 
 
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
+            hipblas::dot(handle, x->get_size()[0], x->get_const_values(),
+                         x->get_stride(), y->get_const_values(),
+                         y->get_stride(), result->get_values());
+        } else {
+            compute_dot(exec, x, y, result, tmp);
+        }
+    } else {
+        compute_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result,
+                               Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
+            hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                              x->get_stride(), y->get_const_values(),
+                              y->get_stride(), result->get_values());
+        } else {
+            compute_conj_dot(exec, x, y, result, tmp);
+        }
+    } else {
+        compute_conj_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result,
+                            Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
+            hipblas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                           x->get_stride(), result->get_values());
+        } else {
+            compute_norm2(exec, x, result, tmp);
+        }
+    } else {
+        compute_norm2(exec, x, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index dd3a290b342..e09c54b2522 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -68,6 +68,47 @@ namespace omp {
 namespace dense {
 
 
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, Array<char>& tmp)
+{
+    // OpenMP uses the unified kernel.
+ compute_dot(exec, x, y, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); + + +template +void compute_conj_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, + Array& tmp) +{ + compute_conj_dot(exec, x, y, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); + + +template +void compute_norm2_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result, + Array& tmp) +{ + compute_norm2(exec, x, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); + + template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 648bc8a6e7e..317b3599ae3 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -297,6 +297,19 @@ void compute_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +template +void compute_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, Array& tmp) +{ + compute_dot(exec, x, y, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); + + template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense* x, @@ -316,6 +329,20 @@ void compute_conj_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); +template +void compute_conj_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, + Array& tmp) +{ + compute_conj_dot(exec, x, y, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); + + template void compute_norm2(std::shared_ptr exec, const matrix::Dense* x, @@ -338,6 +365,19 @@ void compute_norm2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); +template +void compute_norm2_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result, + Array& tmp) +{ + compute_norm2(exec, x, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); + + template void compute_norm1(std::shared_ptr exec, const matrix::Dense* x, From e71958ece0e574c043bed911c6f2229bb158930a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 31 Mar 2022 14:35:37 +0200 Subject: [PATCH 4/5] review updates * handle executor of tmp storage in core, not device * add missing documentation Co-authored-by: Pratik Nayak --- .../base/kernel_launch_reduction.hpp.inc | 5 -- core/matrix/dense.cpp | 16 ++++ dpcpp/base/kernel_launch_reduction.dp.hpp | 5 -- include/ginkgo/core/matrix/dense.hpp | 16 +++- omp/base/kernel_launch_reduction.hpp | 4 - test/matrix/dense_kernels.cpp | 80 +++++++++++++++++-- 6 files changed, 104 insertions(+), 22 deletions(-) diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc index 4ecf9c9979d..e32312e4eb4 100644 --- a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc +++ b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc @@ -130,7 +130,6 @@ void run_kernel_reduction_cached(std::shared_ptr exec, ceildiv(size, block_size), exec->get_num_warps() * oversubscription); if (num_blocks > 1) { const auto 
required_storage = sizeof(ValueType) * num_blocks; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -171,7 +170,6 @@ void run_kernel_reduction_cached(std::shared_ptr exec, exec->get_num_warps() * oversubscription); if (num_blocks > 1) { const auto required_storage = sizeof(ValueType) * num_blocks; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -395,7 +393,6 @@ void run_generic_col_reduction_small( as_device_type(result), args...); } else { const auto required_storage = sizeof(ValueType) * num_blocks * cols; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -441,7 +438,6 @@ void run_kernel_row_reduction_cached( if (rows * cols > resources && rows < cols) { const auto col_blocks = ceildiv(rows * cols, resources); const auto required_storage = sizeof(ValueType) * col_blocks * rows; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -512,7 +508,6 @@ void run_kernel_col_reduction_cached( as_device_type(result), map_to_device(args)...); } else { const auto required_storage = sizeof(ValueType) * row_blocks * cols; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index afa0de1f0e9..611925231cc 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -341,6 +341,10 @@ void Dense::compute_dot(const LinOp* b, LinOp* result, GKO_ASSERT_EQUAL_DIMENSIONS(this, b); GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } auto local_b = make_temporary_clone(exec, b); auto local_res = make_temporary_clone(exec, result); auto dense_b = make_temporary_conversion(local_b.get()); @@ -371,6 +375,10 @@ void Dense::compute_conj_dot(const LinOp* b, LinOp* result, GKO_ASSERT_EQUAL_DIMENSIONS(this, b); GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } auto local_b = make_temporary_clone(exec, b); auto local_res = make_temporary_clone(exec, result); auto dense_b = make_temporary_conversion(local_b.get()); @@ -400,6 +408,10 @@ void Dense::compute_norm2(LinOp* result, Array& tmp) const { GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } auto local_result = make_temporary_clone(exec, result); auto dense_res = make_temporary_conversion>( local_result.get()); @@ -424,6 +436,10 @@ void Dense::compute_norm1(LinOp* result, Array& tmp) const { GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } auto local_result = make_temporary_clone(exec, result); auto dense_res = make_temporary_conversion>( local_result.get()); diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 330e5aa7f44..601527c6cae 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -179,7 +179,6 @@ void run_kernel_reduction_impl(std::shared_ptr 
exec, auto queue = exec->get_queue(); if (num_workgroups > 1) { const auto required_storage = sizeof(ValueType) * num_workgroups; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -227,7 +226,6 @@ void run_kernel_reduction_impl(std::shared_ptr exec, auto queue = exec->get_queue(); if (num_workgroups > 1) { const auto required_storage = sizeof(ValueType) * num_workgroups; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -531,7 +529,6 @@ void run_generic_col_reduction_small(syn::value_list, }); } else { const auto required_storage = sizeof(ValueType) * row_blocks * cols; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -578,7 +575,6 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, if (rows * cols > resources && rows < cols) { const auto col_blocks = ceildiv(rows * cols, resources); const auto required_storage = sizeof(ValueType) * col_blocks * rows; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -651,7 +647,6 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, }); } else { const auto required_storage = sizeof(ValueType) * row_blocks * cols; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 210c67941a8..bcaba840fdf 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -747,6 +747,9 @@ class Dense * @param result a Dense row vector, used to store the dot product * (the number of column in the vector must match the number * of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. */ void compute_dot(const LinOp* b, LinOp* result, Array& tmp) const; @@ -767,6 +770,9 @@ class Dense * @param result a Dense row vector, used to store the dot product * (the number of column in the vector must match the number * of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. */ void compute_conj_dot(const LinOp* b, LinOp* result, Array& tmp) const; @@ -785,7 +791,10 @@ class Dense * * @param result a Dense row vector, used to store the norm * (the number of columns in the vector must match the - * number of columns of this) + * number of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. */ void compute_norm2(LinOp* result, Array& tmp) const; @@ -803,7 +812,10 @@ class Dense * * @param result a Dense row vector, used to store the norm * (the number of columns in the vector must match the - * number of columns of this) + * number of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. 
 */
 void compute_norm1(LinOp* result, Array<char>& tmp) const;
 
diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp
index 55046cae832..c95a9caa5de 100644
--- a/omp/base/kernel_launch_reduction.hpp
+++ b/omp/base/kernel_launch_reduction.hpp
@@ -66,7 +66,6 @@ void run_kernel_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
     const auto ssize = static_cast<int64>(size);
     const auto work_per_thread = ceildiv(ssize, num_threads);
     const auto required_storage = sizeof(ValueType) * num_threads;
-    tmp.set_executor(exec);
     if (tmp.get_num_elems() < required_storage) {
         tmp.resize_and_reset(required_storage);
     }
@@ -103,7 +102,6 @@ void run_kernel_reduction_sized_impl(syn::value_list<int, remainder_cols>,
     const auto num_threads = static_cast<int64>(omp_get_max_threads());
     const auto work_per_thread = ceildiv(rows, num_threads);
     const auto required_storage = sizeof(ValueType) * num_threads;
-    tmp.set_executor(exec);
     if (tmp.get_num_elems() < required_storage) {
         tmp.resize_and_reset(required_storage);
     }
@@ -233,7 +231,6 @@ void run_kernel_row_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
         // small number of rows and large reduction sizes: do partial sum first
         const auto work_per_thread = ceildiv(cols, num_threads);
         const auto required_storage = sizeof(ValueType) * rows * num_threads;
-        tmp.set_executor(exec);
         if (tmp.get_num_elems() < required_storage) {
             tmp.resize_and_reset(required_storage);
         }
@@ -331,7 +328,6 @@ void run_kernel_col_reduction_sized_impl(
         ceildiv(reduction_kernel_oversubscription * num_threads, cols);
     const auto rows_per_thread = ceildiv(rows, reduction_size);
     const auto required_storage = sizeof(ValueType) * rows * reduction_size;
-    tmp.set_executor(exec);
     if (tmp.get_num_elems() < required_storage) {
         tmp.resize_and_reset(required_storage);
     }
diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp
index b843f3ec6a0..5ea753c225a 100644
--- a/test/matrix/dense_kernels.cpp
+++ b/test/matrix/dense_kernels.cpp
@@ -1014,6 +1014,23 @@ TEST_F(Dense, ComputeDotIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ComputeDotWithPreallocatedTmpIsEquivalentToRef)
+{
+    set_up_vector_data(42);
+
+    auto dot_size = gko::dim<2>{1, x->get_size()[1]};
+    auto dot_expected = Mtx::create(ref, dot_size);
+    auto ddot = Mtx::create(ref, dot_size);
+    gko::Array<char> tmp{exec, 12345};
+
+    // all parameters are on ref to check cross-executor calls
+    x->compute_dot(y.get(), dot_expected.get());
+    dx->compute_dot(y.get(), ddot.get(), tmp);
+
+    GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r<vtype>::value);
+}
+
+
 TEST_F(Dense, ComputeDotWithTmpIsEquivalentToRef)
 {
     set_up_vector_data(40);
@@ -1021,10 +1038,10 @@ TEST_F(Dense, ComputeDotWithTmpIsEquivalentToRef)
     auto dot_size = gko::dim<2>{1, x->get_size()[1]};
     auto dot_expected = Mtx::create(ref, dot_size);
     auto ddot = Mtx::create(ref, dot_size);
-    gko::Array<char> tmp{ref};
+    gko::Array<char> tmp{exec};
 
     // all parameters are on ref to check cross-executor calls
-    x->compute_dot(y.get(), dot_expected.get(), tmp);
+    x->compute_dot(y.get(), dot_expected.get());
     dx->compute_dot(y.get(), ddot.get(), tmp);
 
     GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r<vtype>::value);
@@ -1047,6 +1064,23 @@ TEST_F(Dense, ComputeConjDotIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ComputeConjDotWithPreallocatedTmpIsEquivalentToRef)
+{
+    set_up_vector_data(36);
+
+    auto dot_size = gko::dim<2>{1, x->get_size()[1]};
+    auto dot_expected = Mtx::create(ref, dot_size);
+    auto ddot = Mtx::create(ref, dot_size);
+    gko::Array<char> tmp{exec, 12345};
+
+    // all parameters are on ref to check cross-executor calls
+    x->compute_conj_dot(y.get(), dot_expected.get());
+    dx->compute_conj_dot(y.get(), ddot.get(), tmp);
+
+    GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r<vtype>::value);
+}
+
+
 TEST_F(Dense, ComputeConjDotWithTmpIsEquivalentToRef)
 {
     set_up_vector_data(65);
@@ -1057,7 +1091,7 @@ TEST_F(Dense, ComputeConjDotWithTmpIsEquivalentToRef)
     gko::Array<char> tmp{ref};
 
     // all parameters are on ref to check cross-executor calls
-    x->compute_conj_dot(y.get(), dot_expected.get(), tmp);
+    x->compute_conj_dot(y.get(), dot_expected.get());
     dx->compute_conj_dot(y.get(), ddot.get(), tmp);
 
     GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r<vtype>::value);
@@ -1080,6 +1114,23 @@ TEST_F(Dense, ComputeNorm1IsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ComputeNorm1WithPreallocatedTmpIsEquivalentToRef)
+{
+    set_up_vector_data(7);
+
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(ref, norm_size);
+    auto dnorm = NormVector::create(ref, norm_size);
+    gko::Array<char> tmp{exec, 12345};
+
+    // all parameters are on ref to check cross-executor calls
+    x->compute_norm1(norm_expected.get());
+    dx->compute_norm1(dnorm.get(), tmp);
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
+}
+
+
 TEST_F(Dense, ComputeNorm1WithTmpIsEquivalentToRef)
 {
     set_up_vector_data(10);
@@ -1090,7 +1141,7 @@ TEST_F(Dense, ComputeNorm1WithTmpIsEquivalentToRef)
     gko::Array<char> tmp{ref};
 
     // all parameters are on ref to check cross-executor calls
-    x->compute_norm1(norm_expected.get(), tmp);
+    x->compute_norm1(norm_expected.get());
     dx->compute_norm1(dnorm.get(), tmp);
 
     GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
@@ -1113,7 +1164,7 @@
 }
 
 
-TEST_F(Dense, ComputeNorm2WithTmpIsEquivalentToRef)
+TEST_F(Dense, ComputeNorm2WithPreallocatedTmpIsEquivalentToRef)
 {
     set_up_vector_data(3);
 
@@ -1123,7 +1174,24 @@
     gko::Array<char> tmp{ref};
 
     // all parameters are on ref to check cross-executor calls
-    x->compute_norm2(norm_expected.get(), tmp);
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get(), tmp);
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
+}
+
+
+TEST_F(Dense, ComputeNorm2WithTmpIsEquivalentToRef)
+{
+    set_up_vector_data(14);
+
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(ref, norm_size);
+    auto dnorm = NormVector::create(ref, norm_size);
+    gko::Array<char> tmp{exec, 12345};
+
+    // all parameters are on ref to check cross-executor calls
+    x->compute_norm2(norm_expected.get());
     dx->compute_norm2(dnorm.get(), tmp);
 
     GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
 }
 

From c6fe8561dd2919bb6fb10b849286af9d99a856a4 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 7 Apr 2022 11:25:56 +0200
Subject: [PATCH 5/5] review updates

* use *_dispatch reduction kernels in core and GMRES
* test vendor kernels

Co-authored-by: Yuhsiang M.
Tsai Co-authored-by: Pratik Nayak --- core/matrix/dense.cpp | 6 +++--- cuda/solver/cb_gmres_kernels.cu | 3 ++- cuda/solver/gmres_kernels.cu | 3 ++- dpcpp/solver/cb_gmres_kernels.dp.cpp | 3 ++- dpcpp/solver/gmres_kernels.dp.cpp | 3 ++- hip/solver/cb_gmres_kernels.hip.cpp | 3 ++- hip/solver/gmres_kernels.hip.cpp | 3 ++- test/matrix/dense_kernels.cpp | 6 +++--- 8 files changed, 18 insertions(+), 12 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 611925231cc..b2a92cbbe95 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -76,9 +76,9 @@ GKO_REGISTER_OPERATION(add_scaled, dense::add_scaled); GKO_REGISTER_OPERATION(sub_scaled, dense::sub_scaled); GKO_REGISTER_OPERATION(add_scaled_diag, dense::add_scaled_diag); GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag); -GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot); -GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot); -GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2); +GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot_dispatch); +GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot_dispatch); +GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2_dispatch); GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1); GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row); GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs, diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 7bee0750390..0abdb07db28 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -144,7 +144,8 @@ void initialize_2(std::shared_ptr exec, acc::as_cuda_range(krylov_bases), as_cuda_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); - kernels::cuda::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); if (use_scalar) { components::fill_array(exec, diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index d228b47dd37..dc3d5ef93a8 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -119,7 +119,8 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; Array tmp{exec}; - kernels::cuda::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); const auto grid_dim_2 = ceildiv(num_rows * num_rhs, default_block_size); initialize_2_2_kernel<<>>( diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 5e29c7aa612..2f048e1828f 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -1013,7 +1013,8 @@ void initialize_2(std::shared_ptr exec, residual->get_size()[1], krylov_dim, krylov_bases, residual_norm_collection->get_values(), residual_norm_collection->get_stride()); - kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::dpcpp::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); if (use_scalar) { components::fill_array(exec, diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index 5c68e1580d5..608b956ba15 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -472,7 +472,8 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; Array tmp{exec}; - 
kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::dpcpp::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1, 1); diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 09f206d355b..b86c8bdc34a 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -146,7 +146,8 @@ void initialize_2(std::shared_ptr exec, krylov_dim, acc::as_hip_range(krylov_bases), as_hip_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); - kernels::hip::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); if (use_scalar) { components::fill_array(exec, diff --git a/hip/solver/gmres_kernels.hip.cpp b/hip/solver/gmres_kernels.hip.cpp index 6c4c405f0de..4272eadbc12 100644 --- a/hip/solver/gmres_kernels.hip.cpp +++ b/hip/solver/gmres_kernels.hip.cpp @@ -123,7 +123,8 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; Array tmp{exec}; - kernels::hip::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); const auto grid_dim_2 = ceildiv(num_rows * num_rhs, default_block_size); hipLaunchKernelGGL( diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index 5ea753c225a..eba5be5fd9d 100644 --- a/test/matrix/dense_kernels.cpp +++ b/test/matrix/dense_kernels.cpp @@ -1000,7 +1000,7 @@ TEST_F(Dense, ExtractDiagonalOnShortFatIntoDenseCrossExecutor) TEST_F(Dense, ComputeDotIsEquivalentToRef) { - set_up_vector_data(2); + set_up_vector_data(1); auto dot_size = gko::dim<2>{1, x->get_size()[1]}; auto dot_expected = Mtx::create(ref, dot_size); @@ -1050,7 +1050,7 @@ TEST_F(Dense, ComputeDotWithTmpIsEquivalentToRef) TEST_F(Dense, ComputeConjDotIsEquivalentToRef) { - set_up_vector_data(13); + set_up_vector_data(1); auto dot_size = gko::dim<2>{1, x->get_size()[1]}; auto dot_expected = Mtx::create(ref, dot_size); @@ -1150,7 +1150,7 @@ TEST_F(Dense, ComputeNorm1WithTmpIsEquivalentToRef) TEST_F(Dense, ComputeNorm2IsEquivalentToRef) { - set_up_vector_data(5); + set_up_vector_data(1); auto norm_size = gko::dim<2>{1, x->get_size()[1]}; auto norm_expected = NormVector::create(ref, norm_size);
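Taken together, the series leaves the user-facing reductions in the following state: core routes compute_dot, compute_conj_dot, and compute_norm2 through the *_dispatch kernels, the workspace array is rebound to the vectors' executor when needed, and single-column vectors hit vendor BLAS directly. A usage sketch of the resulting API (editorial illustration, not part of the patch; assumes a CUDA build):

    #include <ginkgo/ginkgo.hpp>

    int main()
    {
        auto ref = gko::ReferenceExecutor::create();
        auto exec = gko::CudaExecutor::create(0, ref);
        auto x = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1000, 1});
        auto y = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1000, 1});
        auto res = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1, 1});
        x->fill(1.0);
        y->fill(2.0);

        // One workspace reused across iterations: it is sized on first use
        // and only regrows if a later reduction needs more bytes. With a
        // single column, compute_dot dispatches to cublas::dot, so tmp can
        // even stay empty on this path.
        gko::Array<char> tmp{exec};
        for (int i = 0; i < 10; ++i) {
            x->compute_dot(y.get(), res.get(), tmp);
        }
    }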